// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/c4-neondot.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/gemm.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>


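// GEMM microkernel producing a 6x8 tile of the output: 6 rows of int8
// activations (MR=6) times 8 columns of packed int8 weights (NR=8), using the
// Arm dot-product (SDOT) extension with weights grouped 4-at-a-time along the
// K dimension ("c4"). "qc8" denotes per-channel quantization; "fp32" denotes
// requantization performed in single-precision floating point.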
void xnn_qc8_gemm_minmax_fp32_ukernel_6x8c4__neondot(
    size_t mr,
    size_t nc,
    size_t kc,
    const int8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
  assert(mr != 0);
  assert(mr <= 6);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

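  // The "c4" packing interleaves K in groups of 4 bytes to match the SDOT
  // lanes, so KC is rounded up to a multiple of 4; padded weight positions
  // are assumed to hold zeros (XNNPACK's packing convention).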
  kc = round_up_po2(kc, 4 * sizeof(int8_t));
  const int8_t* a0 = a;
  int8_t* c0 = c;
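  // When mr < 6, point the out-of-range rows at the previous row: those rows
  // then recompute the same values and store them to the same addresses,
  // which is harmless and keeps the hot loop branch-free.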
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 4) {
    a3 = a2;
    c3 = c2;
  }
  const int8_t* a4 = (const int8_t*) ((uintptr_t) a3 + a_stride);
  int8_t* c4 = (int8_t*) ((uintptr_t) c3 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 4) {
    a4 = a3;
    c4 = c3;
  }
  const int8_t* a5 = (const int8_t*) ((uintptr_t) a4 + a_stride);
  int8_t* c5 = (int8_t*) ((uintptr_t) c4 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 6) {
    a5 = a4;
    c5 = c4;
  }

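  // The packed weights `w` are consumed sequentially. For each group of 8
  // columns the layout, as read below, is: 8 int32 biases, then KC/4 blocks
  // of 8 columns x 4 K-bytes of int8 weights, then 8 fp32 per-channel scales.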
  // Loop over groups of 8 columns.
  do {
    // Initialize accumulators with bias. 8 bias values are loaded from the
    // weight matrix, at the start of the group of 8 columns.
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    int32x4_t vacc2x0123 = vacc0x0123;
    int32x4_t vacc2x4567 = vacc0x4567;
    int32x4_t vacc3x0123 = vacc0x0123;
    int32x4_t vacc3x4567 = vacc0x4567;
    int32x4_t vacc4x0123 = vacc0x0123;
    int32x4_t vacc4x4567 = vacc0x4567;
    int32x4_t vacc5x0123 = vacc0x0123;
    int32x4_t vacc5x4567 = vacc0x4567;

    // Inner accumulation loop along the KC (reduction) dimension.
    size_t k = kc;
    // 2x partially unrolled loop to load 8 bytes at a time.
    while (k >= 8 * sizeof(int8_t)) {
      // Load a 6x8 block of activations.
      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 8;
      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;
      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8;
      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 8;
      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 8;
      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 8;

      // Load an 8x8 block of weights.
      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
      const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
      const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);

      // Multiply-accumulate: 6x8 * 8x8 --> 6x8.
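      // Each vdotq_lane_s32 accumulates four 4-element int8 dot products:
      // lane L of the activation vector supplies 4 consecutive K-bytes of one
      // row, and each 32-bit lane of the weight vector supplies the matching
      // 4 K-bytes of one output column. Lane 0 covers K-bytes 0-3; lane 1
      // covers K-bytes 4-7.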
      vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
      vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
      vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
      vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
      vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0);
      vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
      vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
      vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
      vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb0123x0123, va4x01234567, 0);
      vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
      vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
      vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
      vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
      vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
      vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
      vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
      vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
      vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
      vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
      vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
      vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
      vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
      vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
      vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);

      k -= 8 * sizeof(int8_t);
    }
    // Handle up to 4 final positions of `k`
    if XNN_UNLIKELY(k != 0) {
      // Load a 6x4 block of activations.
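      // Note: vld1_s8 still loads 8 bytes per row, but only the low 4 bytes
      // (lane 0) are consumed below. The 4-byte over-read is assumed safe
      // because XNNPACK requires a few addressable bytes of slack past the
      // end of its input tensors.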
      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;

      // Load a 4x8 block of weights.
      const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
      const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);

      // Multiply-accumulate: 6x4 * 4x8 --> 6x8.
      vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
      vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
      vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
      vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
      vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0);
      vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
      vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
      vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
      vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb0123x0123, va4x01234567, 0);
      vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
      vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
      vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
    }

    // Post-accumulation work: requantize the int32 accumulators to int8.
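    // fp32 requantization path: convert each accumulator to float, multiply
    // by the per-channel ("qc8") scale stored in the packed weights right
    // after the weight bytes, then convert back to int32 with
    // round-to-nearest-even (vcvtnq_s32_f32).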
    float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
    float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
    float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
    float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
    float32x4_t vfpacc2x0123 = vcvtq_f32_s32(vacc2x0123);
    float32x4_t vfpacc2x4567 = vcvtq_f32_s32(vacc2x4567);
    float32x4_t vfpacc3x0123 = vcvtq_f32_s32(vacc3x0123);
    float32x4_t vfpacc3x4567 = vcvtq_f32_s32(vacc3x4567);
    float32x4_t vfpacc4x0123 = vcvtq_f32_s32(vacc4x0123);
    float32x4_t vfpacc4x4567 = vcvtq_f32_s32(vacc4x4567);
    float32x4_t vfpacc5x0123 = vcvtq_f32_s32(vacc5x0123);
    float32x4_t vfpacc5x4567 = vcvtq_f32_s32(vacc5x4567);

    const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
    vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123);
    vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale0123);
    vfpacc2x0123 = vmulq_f32(vfpacc2x0123, vscale0123);
    vfpacc3x0123 = vmulq_f32(vfpacc3x0123, vscale0123);
    vfpacc4x0123 = vmulq_f32(vfpacc4x0123, vscale0123);
    vfpacc5x0123 = vmulq_f32(vfpacc5x0123, vscale0123);
    const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
    vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567);
    vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale4567);
    vfpacc2x4567 = vmulq_f32(vfpacc2x4567, vscale4567);
    vfpacc3x4567 = vmulq_f32(vfpacc3x4567, vscale4567);
    vfpacc4x4567 = vmulq_f32(vfpacc4x4567, vscale4567);
    vfpacc5x4567 = vmulq_f32(vfpacc5x4567, vscale4567);

    vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
    vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
    vacc1x0123 = vcvtnq_s32_f32(vfpacc1x0123);
    vacc1x4567 = vcvtnq_s32_f32(vfpacc1x4567);
    vacc2x0123 = vcvtnq_s32_f32(vfpacc2x0123);
    vacc2x4567 = vcvtnq_s32_f32(vfpacc2x4567);
    vacc3x0123 = vcvtnq_s32_f32(vfpacc3x0123);
    vacc3x4567 = vcvtnq_s32_f32(vfpacc3x4567);
    vacc4x0123 = vcvtnq_s32_f32(vfpacc4x0123);
    vacc4x4567 = vcvtnq_s32_f32(vfpacc4x4567);
    vacc5x0123 = vcvtnq_s32_f32(vfpacc5x0123);
    vacc5x4567 = vcvtnq_s32_f32(vfpacc5x4567);

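    // Narrow int32 -> int16 with saturation, adding the output zero point in
    // the int16 domain (vqaddq_s16 saturates, so the shift cannot overflow),
    // then narrow int16 -> int8 with saturation. On AArch64, vqmovn_high_s32
    // narrows directly into the upper half of a quad register; AArch32 lacks
    // that form and combines two 64-bit halves with vcombine instead.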
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
#if XNN_ARCH_ARM64
    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
    const int16x8_t vacc4x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
    const int16x8_t vacc5x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);

    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
    int8x16_t vout4x01234567_5x01234567 = vqmovn_high_s16(vqmovn_s16(vacc4x01234567), vacc5x01234567);
#else
    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
    const int16x8_t vacc4x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567)), voutput_zero_point);
    const int16x8_t vacc5x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567)), voutput_zero_point);

    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
    int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vqmovn_s16(vacc4x01234567), vqmovn_s16(vacc5x01234567));
#endif
    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);

    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
    vout4x01234567_5x01234567 = vmaxq_s8(vout4x01234567_5x01234567, voutput_min);

    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
    vout4x01234567_5x01234567 = vminq_s8(vout4x01234567_5x01234567, voutput_max);

    if (nc >= 8) {
      // Main case where the 8 columns fit in the destination.
      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
      vst1_s8(c4 + 0, vget_low_s8(vout4x01234567_5x01234567));
      vst1_s8(c5 + 0, vget_high_s8(vout4x01234567_5x01234567));

      // Advance to the next 8 columns.
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
      c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
      c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);

      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
      a5 = (const int8_t*) ((uintptr_t) a5 - kc);

      nc -= 8;
    } else {
      // Final case where not all of the 8 columns fit in the destination.
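      // Store the remaining 1-7 columns per row: write 4, 2, and 1 byte(s)
      // according to the bits of nc, rotating each vector down with vextq_s8
      // after a partial store so the next store reads from the same fixed
      // lanes.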
      if (nc & 4) {
        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4 += 4;
        vst1q_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5 += 4;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
        vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
      }
      if (nc & 2) {
        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4 += 2;
        vst1q_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5 += 2;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
        vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
      }
      if (nc & 1) {
        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
        vst1q_lane_s8(c4, vout4x01234567_5x01234567, 0);
        vst1q_lane_s8(c5, vout4x01234567_5x01234567, 8);
      }

      nc = 0;
    }
  } while (nc != 0);
}
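
// Illustrative usage (not part of the generated kernel): a minimal sketch of
// how a caller might tile a full M x N x K GEMM over this microkernel. The
// kernel loops over all `nc` columns internally, so only rows need tiling
// here. Names such as `packed_w` and `min` are hypothetical/pseudocode; real
// invocations live in XNNPACK's operator code.
//
//   for (size_t m = 0; m < M; m += 6) {  // MR = 6 rows per call
//     xnn_qc8_gemm_minmax_fp32_ukernel_6x8c4__neondot(
//         min(M - m, 6) /* mr */, N /* nc */, K /* kc, in bytes */,
//         a + m * a_stride, a_stride,
//         packed_w,                      // bias/weights/scales, packed as above
//         c + m * cm_stride, cm_stride,
//         8 * sizeof(int8_t) /* cn_stride */, &params);
//   }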