Blame - src/add.c - platform/external/XNNPACK

blob: 8f41ebab8d23b836695f1c8675d62524d71afa70 [file] [log] [blame]

XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	1	// Copyright (c) Facebook, Inc. and its affiliates.
				2	// All rights reserved.
				3	//
				4	// Copyright 2019 Google LLC
				5	//
				6	// This source code is licensed under the BSD-style license found in the
				7	// LICENSE file in the root directory of this source tree.
				8
				9	#include <assert.h>
				10	#include <math.h>
				11	#include <stddef.h>
				12	#include <stdint.h>
				13	#include <stdlib.h>
				14
				15	#include <xnnpack.h>
				16	#include <xnnpack/allocator.h>
				17	#include <xnnpack/operator.h>
				18	#include <xnnpack/requantization.h>
				19	#include <xnnpack/log.h>
				20	#include <xnnpack/params.h>
				21
				22
				23	enum xnn_status xnn_create_add_nc_q8(
				24	size_t channels,
				25	size_t a_stride,
				26	size_t b_stride,
				27	size_t sum_stride,
				28	uint8_t a_zero_point,
				29	float a_scale,
				30	uint8_t b_zero_point,
				31	float b_scale,
				32	uint8_t sum_zero_point,
				33	float sum_scale,
				34	uint8_t sum_min,
				35	uint8_t sum_max,
				36	uint32_t flags,
				37	xnn_operator_t* add_op_out)
				38	{
				39	xnn_operator_t add_op = NULL;
				40	enum xnn_status status = xnn_status_uninitialized;
				41
				42	if (!xnn_params.initialized) {
				43	xnn_log_error("failed to create Add operator: XNNPACK is not initialized");
				44	goto error;
				45	}
				46
				47	status = xnn_status_invalid_parameter;
				48
				49	if (channels == 0) {
				50	xnn_log_error(
				51	"failed to create Add operator with %zu channels: number of channels must be non-zero", channels);
				52	goto error;
				53	}
				54
				55	if (a_stride < channels) {
				56	xnn_log_error(
				57	"failed to create Add operator with A element stride of %zu: "
				58	"stride must be at least as large as the number of channels (%zu)",
				59	a_stride, channels);
				60	goto error;
				61	}
				62
				63	if (b_stride < channels) {
				64	xnn_log_error(
				65	"failed to create Add operator with B element stride of %zu: "
				66	"stride must be at least as large as the number of channels (%zu)",
				67	b_stride, channels);
				68	goto error;
				69	}
				70
				71	if (sum_stride < channels) {
				72	xnn_log_error(
				73	"failed to create Add operator with Sum element stride of %zu: "
				74	"stride must be at least as large as the number of channels (%zu)",
				75	sum_stride, channels);
				76	goto error;
				77	}
				78
				79	if (a_scale <= 0.0f \|\| !isnormal(a_scale)) {
				80	xnn_log_error(
				81	"failed to create Add operator with %.7g A scale: scale must be finite, normalized, and positive", a_scale);
				82	goto error;
				83	}
				84
				85	if (b_scale <= 0.0f \|\| !isnormal(b_scale)) {
				86	xnn_log_error(
				87	"failed to create Add operator with %.7g B scale: scale must be finite, normalized, and positive", b_scale);
				88	goto error;
				89	}
				90
				91	if (sum_scale <= 0.0f \|\| !isnormal(sum_scale)) {
				92	xnn_log_error(
				93	"failed to create Add operator with %.7g output scale: scale must be finite, normalized, and positive",
				94	sum_scale);
				95	goto error;
				96	}
				97
				98	if (sum_min >= sum_max) {
				99	xnn_log_error(
				100	"failed to create Add operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
				101	sum_min, sum_max);
				102	goto error;
				103	}
				104
				105	status = xnn_status_unsupported_parameter;
				106
				107	const float a_output_scale = a_scale / sum_scale;
				108	if (a_output_scale < 0x1.0p-14f \|\| a_output_scale >= 0x1.0p+8f) {
				109	xnn_log_error(
				110	"failed to create Add operator with %.7g A-to-output scale ratio: scale ratio must be in [2-14, 28) range",
				111	a_output_scale);
				112	goto error;
				113	}
				114
				115	const float b_output_scale = b_scale / sum_scale;
				116	if (b_output_scale < 0x1.0p-14f \|\| b_output_scale >= 0x1.0p+8f) {
				117	xnn_log_error(
				118	"failed to create Add operator with %.7g A-to-output scale ratio: scale ratio must be in [2-14, 28) range",
				119	b_output_scale);
				120	goto error;
				121	}
				122
				123	status = xnn_status_out_of_memory;
				124
				125	add_op = xnn_allocate_zero_memory(sizeof(struct xnn_operator));
				126	if (add_op == NULL) {
				127	xnn_log_error("failed to allocate %zu bytes for Add operator descriptor", sizeof(struct xnn_operator));
				128	goto error;
				129	}
				130
				131	add_op->channels = channels;
				132	add_op->input_pixel_stride = a_stride;
				133	add_op->input2_pixel_stride = b_stride;
				134	add_op->output_pixel_stride = sum_stride;
				135	add_op->q8_add_params =
				136	xnn_compute_q8_add_params(
				137	a_zero_point, b_zero_point, sum_zero_point,
				138	a_scale / sum_scale, b_scale / sum_scale,
				139	sum_min, sum_max);
				140
				141	add_op->type = xnn_operator_type_add_q8;
				142	add_op->ukernel.type = xnn_ukernel_type_add;
				143
				144	add_op->state = xnn_run_state_invalid;
				145
				146	*add_op_out = add_op;
				147	return xnn_status_success;
				148
				149	error:
				150	xnn_delete_operator(add_op);
				151	return status;
				152	}
				153
				154	enum xnn_status xnn_create_add_nc_f32(
				155	size_t channels,
				156	size_t a_stride,
				157	size_t b_stride,
				158	size_t sum_stride,
				159	float sum_min,
				160	float sum_max,
				161	uint32_t flags,
				162	xnn_operator_t* add_op_out)
				163	{
				164	xnn_operator_t add_op = NULL;
				165	enum xnn_status status = xnn_status_uninitialized;
				166
				167	if (!xnn_params.initialized) {
				168	xnn_log_error("failed to create Add operator: XNNPACK is not initialized");
				169	goto error;
				170	}
				171
				172	status = xnn_status_invalid_parameter;
				173
				174	if (channels == 0) {
				175	xnn_log_error(
				176	"failed to create add operator with %zu channels: number of channels must be non-zero", channels);
				177	goto error;
				178	}
				179
				180	if (a_stride < channels) {
				181	xnn_log_error(
				182	"failed to create Add operator with A element stride of %zu: "
				183	"stride must be at least as large as the number of channels (%zu)",
				184	a_stride, channels);
				185	goto error;
				186	}
				187
				188	if (b_stride < channels) {
				189	xnn_log_error(
				190	"failed to create Add operator with B element stride of %zu: "
				191	"stride must be at least as large as the number of channels (%zu)",
				192	b_stride, channels);
				193	goto error;
				194	}
				195
				196	if (sum_stride < channels) {
				197	xnn_log_error(
				198	"failed to create Add operator with Sum element stride of %zu: "
				199	"stride must be at least as large as the number of channels (%zu)",
				200	sum_stride, channels);
				201	goto error;
				202	}
				203
				204	if (isnan(sum_min)) {
				205	xnn_log_error(
				206	"failed to create Add operator with NaN output lower bound: lower bound must be non-NaN");
				207	goto error;
				208	}
				209
				210	if (isnan(sum_max)) {
				211	xnn_log_error(
				212	"failed to create Add operator with NaN output upper bound: upper bound must be non-NaN");
				213	goto error;
				214	}
				215
				216	if (sum_min >= sum_max) {
				217	xnn_log_error(
				218	"failed to create Add operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
				219	sum_min, sum_max);
				220	goto error;
				221	}
				222
				223	status = xnn_status_out_of_memory;
				224
				225	add_op = xnn_allocate_zero_memory(sizeof(struct xnn_operator));
				226	if (add_op == NULL) {
				227	xnn_log_error("failed to allocate %zu bytes for Add operator descriptor", sizeof(struct xnn_operator));
				228	goto error;
				229	}
				230
				231	add_op->channels = channels;
				232	add_op->input_pixel_stride = a_stride;
				233	add_op->input2_pixel_stride = b_stride;
				234	add_op->output_pixel_stride = sum_stride;
				235	add_op->f32_output_params = xnn_compute_f32_output_params(sum_min, sum_max);
				236
				237	add_op->type = xnn_operator_type_add_f32;
				238	add_op->ukernel.type = xnn_ukernel_type_add;
				239
				240	add_op->state = xnn_run_state_invalid;
				241
				242	*add_op_out = add_op;
				243	return xnn_status_success;
				244
				245	error:
				246	xnn_delete_operator(add_op);
				247	return status;
				248	}
				249
				250	enum xnn_status xnn_setup_add_nc_q8(
				251	xnn_operator_t add_op,
				252	size_t batch_size,
				253	const uint8_t* a,
				254	const uint8_t* b,
				255	uint8_t* sum,
				256	pthreadpool_t threadpool)
				257	{
				258	if (add_op->type != xnn_operator_type_add_q8) {
				259	xnn_log_error("failed to setup Add (Q8) operator: operator type mismatch");
				260	return xnn_status_invalid_parameter;
				261	}
				262	add_op->state = xnn_run_state_invalid;
				263
				264	if (!xnn_params.initialized) {
				265	xnn_log_error("failed to setup Add operator: XNNPACK is not initialized");
				266	return xnn_status_uninitialized;
				267	}
				268
				269	if (batch_size == 0) {
				270	add_op->state = xnn_run_state_skip;
				271	return xnn_status_success;
				272	}
				273
				274	const size_t channels = add_op->channels;
				275	const size_t a_stride = add_op->input_pixel_stride;
				276	const size_t b_stride = add_op->input2_pixel_stride;
				277	const size_t sum_stride = add_op->output_pixel_stride;
				278	if ((((a_stride ^ channels) \| (b_stride ^ channels) \| (sum_stride ^ channels)) == 0) \|\| batch_size == 1) {
				279	const size_t block_size = 4096;
				280	add_op->context.add_contiguous = (struct add_contiguous_context) {
				281	.a = a,
				282	.b = b,
				283	.y = sum,
				284	.params.q8 = add_op->q8_add_params,
				285	.ukernel = xnn_params.q8.vadd,
				286	};
				287	add_op->compute.type = xnn_parallelization_type_1d_tile_1d;
				288	add_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_add_contiguous;
				289	add_op->compute.range[0] = batch_size * channels * sizeof(uint8_t);
				290	add_op->compute.tile[0] = block_size;
				291	} else {
				292	add_op->context.add_strided = (struct add_strided_context) {
				293	.a = a,
				294	.a_stride = a_stride * sizeof(uint8_t),
				295	.b = b,
				296	.b_stride = b_stride * sizeof(uint8_t),
				297	.y = sum,
				298	.y_stride = sum_stride * sizeof(uint8_t),
				299	.n = channels,
				300	.params.q8 = add_op->q8_add_params,
				301	.ukernel = xnn_params.q8.vadd,
				302	};
				303	add_op->compute.type = xnn_parallelization_type_1d_tile_1d;
				304	add_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_add_strided;
				305	add_op->compute.range[0] = batch_size;
				306	add_op->compute.tile[0] = 1;
				307	}
				308	add_op->state = xnn_run_state_ready;
				309
				310	return xnn_status_success;
				311	}
				312
				313	enum xnn_status xnn_setup_add_nc_f32(
				314	xnn_operator_t add_op,
				315	size_t batch_size,
				316	const float* a,
				317	const float* b,
				318	float* sum,
				319	pthreadpool_t threadpool)
				320	{
				321	if (add_op->type != xnn_operator_type_add_f32) {
				322	xnn_log_error("failed to setup Add (F32) operator: operator type mismatch");
				323	return xnn_status_invalid_parameter;
				324	}
				325	add_op->state = xnn_run_state_invalid;
				326
				327	if (!xnn_params.initialized) {
				328	xnn_log_error("failed to setup Add operator: XNNPACK is not initialized");
				329	return xnn_status_uninitialized;
				330	}
				331
				332	if (batch_size == 0) {
				333	add_op->state = xnn_run_state_skip;
				334	return xnn_status_success;
				335	}
				336
				337	const size_t channels = add_op->channels;
				338	const size_t a_stride = add_op->input_pixel_stride;
				339	const size_t b_stride = add_op->input2_pixel_stride;
				340	const size_t sum_stride = add_op->output_pixel_stride;
				341	if ((((a_stride ^ channels) \| (b_stride ^ channels) \| (sum_stride ^ channels)) == 0) \|\| batch_size == 1) {
				342	const size_t block_size = 4096;
				343	add_op->context.add_contiguous = (struct add_contiguous_context) {
				344	.a = a,
				345	.b = b,
				346	.y = sum,
				347	.params.f32 = add_op->f32_output_params,
				348	.ukernel = xnn_params.f32.vadd,
				349	};
				350	add_op->compute.type = xnn_parallelization_type_1d_tile_1d;
				351	add_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_add_contiguous;
				352	add_op->compute.range[0] = batch_size * channels * sizeof(float);
				353	add_op->compute.tile[0] = block_size;
				354	} else {
				355	add_op->context.add_strided = (struct add_strided_context) {
				356	.a = a,
				357	.a_stride = a_stride * sizeof(float),
				358	.b = b,
				359	.b_stride = b_stride * sizeof(float),
				360	.y = sum,
				361	.y_stride = sum_stride * sizeof(float),
				362	.n = channels * sizeof(float),
				363	.params.f32 = add_op->f32_output_params,
				364	.ukernel = xnn_params.f32.vadd,
				365	};
				366	add_op->compute.type = xnn_parallelization_type_1d_tile_1d;
				367	add_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_add_strided;
				368	add_op->compute.range[0] = batch_size;
				369	add_op->compute.tile[0] = 1;
				370	}
				371	add_op->state = xnn_run_state_ready;
				372
				373	return xnn_status_success;
				374	}