Blame - src/qc8-igemm/gen/4x2-minmax-fp32-scalar-magic.c - platform/external/XNNPACK

blob: ec9f89e916e32db66d006840fdc2b541ccef25a8 [file] [log] [blame]

Marat Dukhan	d602154	2021-06-30 09:04:20 -0700	[diff] [blame]	1	// Auto-generated file. Do not edit!
				2	// Template: src/qs8-igemm/scalar.c.in
				3	// Generator: tools/xngen
				4	//
				5	// Copyright 2021 Google LLC
				6	//
				7	// This source code is licensed under the BSD-style license found in the
				8	// LICENSE file in the root directory of this source tree.
				9
				10	#include <assert.h>
				11
				12	#include <fp16.h>
				13
				14	#include <xnnpack/math.h>
				15	#include <xnnpack/gemm.h>
				16
				17
				18	void xnn_qc8_igemm_minmax_fp32_ukernel_4x2__scalar_magic(
				19	size_t mr,
				20	size_t nc,
				21	size_t kc,
				22	size_t ks,
				23	const int8_t**restrict a,
				24	const void*restrict w,
				25	int8_t*restrict c,
				26	size_t cm_stride,
				27	size_t cn_stride,
				28	size_t a_offset,
				29	const int8_t* zero,
				30	const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
				31	{
				32	assert(mr != 0);
				33	assert(mr <= 4);
				34	assert(nc != 0);
				35	assert(kc != 0);
				36	assert(ks != 0);
				37	assert(ks % (4 * sizeof(void*)) == 0);
				38	assert(a != NULL);
				39	assert(w != NULL);
				40	assert(c != NULL);
				41
				42	int8_t* c0 = c;
				43	int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
				44	if XNN_UNPREDICTABLE(mr < 2) {
				45	c1 = c0;
				46	}
				47	int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
				48	if XNN_UNPREDICTABLE(mr <= 2) {
				49	c2 = c1;
				50	}
				51	int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
				52	if XNN_UNPREDICTABLE(mr != 4) {
				53	c3 = c2;
				54	}
				55
				56	do {
				57	int32_t vacc0x0 = ((const int32_t*) w)[0];
				58	int32_t vacc0x1 = ((const int32_t*) w)[1];
				59	int32_t vacc1x0 = vacc0x0;
				60	int32_t vacc1x1 = vacc0x1;
				61	int32_t vacc2x0 = vacc0x0;
				62	int32_t vacc2x1 = vacc0x1;
				63	int32_t vacc3x0 = vacc0x0;
				64	int32_t vacc3x1 = vacc0x1;
Marat Dukhan	927d474	2021-07-15 13:42:49 -0700	[diff] [blame]	65	w = (const void) ((const int32_t) w + 2);
Marat Dukhan	d602154	2021-06-30 09:04:20 -0700	[diff] [blame]	66
				67	size_t p = ks;
				68	do {
				69	const int8_t* restrict a0 = a[0];
				70	assert(a0 != NULL);
				71	if XNN_UNPREDICTABLE(a0 != zero) {
				72	a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
				73	}
				74	const int8_t* restrict a1 = a[1];
				75	assert(a1 != NULL);
				76	if XNN_UNPREDICTABLE(a1 != zero) {
				77	a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
				78	}
				79	const int8_t* restrict a2 = a[2];
				80	assert(a2 != NULL);
				81	if XNN_UNPREDICTABLE(a2 != zero) {
				82	a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
				83	}
				84	const int8_t* restrict a3 = a[3];
				85	assert(a3 != NULL);
				86	if XNN_UNPREDICTABLE(a3 != zero) {
				87	a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
				88	}
				89	a += 4;
				90
				91	size_t k = kc;
				92	do {
				93	const int32_t va0 = (int32_t) *a0++;
				94	const int32_t va1 = (int32_t) *a1++;
				95	const int32_t va2 = (int32_t) *a2++;
				96	const int32_t va3 = (int32_t) *a3++;
				97
Marat Dukhan	927d474	2021-07-15 13:42:49 -0700	[diff] [blame]	98	const int32_t vb0 = (int32_t) ((const int8_t*) w)[0];
				99	const int32_t vb1 = (int32_t) ((const int8_t*) w)[1];
				100	w = (const void) ((const int8_t) w + 2);
Marat Dukhan	d602154	2021-06-30 09:04:20 -0700	[diff] [blame]	101
				102	vacc0x0 += va0 * vb0;
				103	vacc0x1 += va0 * vb1;
				104	vacc1x0 += va1 * vb0;
				105	vacc1x1 += va1 * vb1;
				106	vacc2x0 += va2 * vb0;
				107	vacc2x1 += va2 * vb1;
				108	vacc3x0 += va3 * vb0;
				109	vacc3x1 += va3 * vb1;
				110
				111	k -= sizeof(int8_t);
				112	} while (k != 0);
				113	p -= 4 * sizeof(void*);
				114	} while (p != 0);
				115
				116	float vfpacc0x0 = (float) vacc0x0;
				117	float vfpacc0x1 = (float) vacc0x1;
				118	float vfpacc1x0 = (float) vacc1x0;
				119	float vfpacc1x1 = (float) vacc1x1;
				120	float vfpacc2x0 = (float) vacc2x0;
				121	float vfpacc2x1 = (float) vacc2x1;
				122	float vfpacc3x0 = (float) vacc3x0;
				123	float vfpacc3x1 = (float) vacc3x1;
				124
Marat Dukhan	4c49494	2021-09-05 17:48:08 -0700	[diff] [blame]	125	typedef XNN_UNALIGNED float unaligned_float;
				126	const float vscale0 = ((const unaligned_float*) w)[0];
Marat Dukhan	d602154	2021-06-30 09:04:20 -0700	[diff] [blame]	127	vfpacc0x0 *= vscale0;
				128	vfpacc1x0 *= vscale0;
				129	vfpacc2x0 *= vscale0;
				130	vfpacc3x0 *= vscale0;
Marat Dukhan	4c49494	2021-09-05 17:48:08 -0700	[diff] [blame]	131	const float vscale1 = ((const unaligned_float*) w)[1];
Marat Dukhan	d602154	2021-06-30 09:04:20 -0700	[diff] [blame]	132	vfpacc0x1 *= vscale1;
				133	vfpacc1x1 *= vscale1;
				134	vfpacc2x1 *= vscale1;
				135	vfpacc3x1 *= vscale1;
				136	w = (const void) ((const float) w + 2);
				137
				138	const float voutput_min_less_zero_point = params->scalar_magic.output_min_less_zero_point;
				139	vfpacc0x0 = math_max_f32(vfpacc0x0, voutput_min_less_zero_point);
				140	vfpacc0x1 = math_max_f32(vfpacc0x1, voutput_min_less_zero_point);
				141	vfpacc1x0 = math_max_f32(vfpacc1x0, voutput_min_less_zero_point);
				142	vfpacc1x1 = math_max_f32(vfpacc1x1, voutput_min_less_zero_point);
				143	vfpacc2x0 = math_max_f32(vfpacc2x0, voutput_min_less_zero_point);
				144	vfpacc2x1 = math_max_f32(vfpacc2x1, voutput_min_less_zero_point);
				145	vfpacc3x0 = math_max_f32(vfpacc3x0, voutput_min_less_zero_point);
				146	vfpacc3x1 = math_max_f32(vfpacc3x1, voutput_min_less_zero_point);
				147
				148	const float voutput_max_less_zero_point = params->scalar_magic.output_max_less_zero_point;
				149	vfpacc0x0 = math_min_f32(vfpacc0x0, voutput_max_less_zero_point);
				150	vfpacc0x1 = math_min_f32(vfpacc0x1, voutput_max_less_zero_point);
				151	vfpacc1x0 = math_min_f32(vfpacc1x0, voutput_max_less_zero_point);
				152	vfpacc1x1 = math_min_f32(vfpacc1x1, voutput_max_less_zero_point);
				153	vfpacc2x0 = math_min_f32(vfpacc2x0, voutput_max_less_zero_point);
				154	vfpacc2x1 = math_min_f32(vfpacc2x1, voutput_max_less_zero_point);
				155	vfpacc3x0 = math_min_f32(vfpacc3x0, voutput_max_less_zero_point);
				156	vfpacc3x1 = math_min_f32(vfpacc3x1, voutput_max_less_zero_point);
				157
				158	const float vmagic_bias = params->scalar_magic.magic_bias;
				159	vfpacc0x0 += vmagic_bias;
				160	vfpacc0x1 += vmagic_bias;
				161	vfpacc1x0 += vmagic_bias;
				162	vfpacc1x1 += vmagic_bias;
				163	vfpacc2x0 += vmagic_bias;
				164	vfpacc2x1 += vmagic_bias;
				165	vfpacc3x0 += vmagic_bias;
				166	vfpacc3x1 += vmagic_bias;
				167
				168	const int32_t vmagic_bias_less_output_zero_point = params->scalar_magic.magic_bias_less_output_zero_point;
				169	int32_t vout0x0 = (int32_t) fp32_to_bits(vfpacc0x0) - vmagic_bias_less_output_zero_point;
				170	int32_t vout0x1 = (int32_t) fp32_to_bits(vfpacc0x1) - vmagic_bias_less_output_zero_point;
				171	int32_t vout1x0 = (int32_t) fp32_to_bits(vfpacc1x0) - vmagic_bias_less_output_zero_point;
				172	int32_t vout1x1 = (int32_t) fp32_to_bits(vfpacc1x1) - vmagic_bias_less_output_zero_point;
				173	int32_t vout2x0 = (int32_t) fp32_to_bits(vfpacc2x0) - vmagic_bias_less_output_zero_point;
				174	int32_t vout2x1 = (int32_t) fp32_to_bits(vfpacc2x1) - vmagic_bias_less_output_zero_point;
				175	int32_t vout3x0 = (int32_t) fp32_to_bits(vfpacc3x0) - vmagic_bias_less_output_zero_point;
				176	int32_t vout3x1 = (int32_t) fp32_to_bits(vfpacc3x1) - vmagic_bias_less_output_zero_point;
				177
				178	if XNN_LIKELY(nc >= 2) {
				179	c3[0] = (int8_t) vout3x0;
				180	c3[1] = (int8_t) vout3x1;
				181	c2[0] = (int8_t) vout2x0;
				182	c2[1] = (int8_t) vout2x1;
				183	c1[0] = (int8_t) vout1x0;
				184	c1[1] = (int8_t) vout1x1;
				185	c0[0] = (int8_t) vout0x0;
				186	c0[1] = (int8_t) vout0x1;
				187
				188	c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
				189	c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
				190	c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
				191	c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
				192
				193	a = (const int8_t**restrict) ((uintptr_t) a - ks);
				194	nc -= 2;
				195	} else {
				196	if (nc & 1) {
				197	c3[0] = (int8_t) vout3x0;
				198	c2[0] = (int8_t) vout2x0;
				199	c1[0] = (int8_t) vout1x0;
				200	c0[0] = (int8_t) vout0x0;
				201	}
				202
				203	nc = 0;
				204	}
				205	} while (nc != 0);
				206	}