Generate QU8 GAVGPOOL microkernels from QS8 GAVGPOOL templates
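
These QU8 microkernels are now emitted from the QS8 GAVGPOOL templates, so they pick up the
fp32 requantization scheme: the renamed xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__*_c8 and
xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__*_c8 kernels load scale, output_max_less_zero_point,
output_zero_point and output_min from params->fp32_sse2 (or fp32_sse4) instead of the old
multiplier/rounding/right_shift fixed-point path. A minimal scalar sketch of that requantization,
for illustration only (the helper name and signature below are hypothetical, not part of XNNPACK):

    #include <math.h>
    #include <stdint.h>

    // Hypothetical scalar model of the fp32 requantization used by the generated
    // QU8 GAVGPOOL kernels; acc is the 7-row sum plus init_bias.
    static inline uint8_t requantize_fp32(int32_t acc, float scale,
                                          float output_max_less_zero_point,
                                          int32_t output_zero_point,
                                          uint8_t output_min) {
      float fpacc = (float) acc * scale;
      // Clamp from above before the zero point is added, mirroring
      // _mm_min_ps(vfpacc, voutput_max_less_zero_point) in the SSE kernels.
      if (fpacc > output_max_less_zero_point) {
        fpacc = output_max_less_zero_point;
      }
      // Round to nearest even, matching _mm_cvtps_epi32 under the default MXCSR.
      int32_t out = (int32_t) lrintf(fpacc) + output_zero_point;
      // Clamp from below, mirroring _mm_max_epu8 against voutput_min.
      if (out < (int32_t) output_min) {
        out = (int32_t) output_min;
      }
      return (uint8_t) out;
    }

Relative to the QS8 templates, the QU8 variants differ only in the unsigned widening
(_mm_unpacklo_epi8 with zero / _mm_cvtepu8_epi16) and the final _mm_packus_epi16 / _mm_max_epu8
output clamp.
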
PiperOrigin-RevId: 421477751
diff --git a/src/amalgam/sse2.c b/src/amalgam/sse2.c
index 3a87317..434c051 100644
--- a/src/amalgam/sse2.c
+++ b/src/amalgam/sse2.c
@@ -5707,8 +5707,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -5765,8 +5768,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (b + 0)));
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (b + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -5843,8 +5849,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -5865,6 +5874,7 @@
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
_mm_storel_epi64((__m128i*) output, vout0123456701234567);
output += 8;
}
@@ -5909,8 +5919,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -6031,8 +6044,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(vinit_bias, _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567));
- __m128i vacc4567 = _mm_add_epi32(vinit_bias, _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -6052,6 +6068,7 @@
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
_mm_storel_epi64((__m128i*) output, vout0123456701234567);
output += 8;
}
@@ -6096,8 +6113,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -9138,7 +9158,7 @@
}
}
-void xnn_qu8_gavgpool_minmax_ukernel_7p7x__sse2_c8(
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8(
size_t rows,
size_t channels,
const uint8_t* input,
@@ -9158,46 +9178,62 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t packed_channels = round_up_po2(channels, 8);
- const size_t input_increment = 7 * input_stride - packed_channels;
- const __m128i vbias = _mm_load_si128((const __m128i*) &params->sse2.bias);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
const __m128i vzero = _mm_setzero_si128();
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
- int32_t* acc = buffer;
- for (size_t c = 0; c < channels; c += 8) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
- const __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
- const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
- _mm_store_si128((__m128i*) acc, vacc_lo);
- _mm_store_si128((__m128i*) acc + 1, vacc_hi);
- acc += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
}
+
for (rows -= 7; rows > 7; rows -= 7) {
- acc = buffer;
i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
@@ -9206,229 +9242,238 @@
i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
- for (size_t c = 0; c < channels; c += 8) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
- __m128i vacc_lo = _mm_load_si128((const __m128i*) acc);
- __m128i vacc_hi = _mm_load_si128((const __m128i*) acc + 1);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
- vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
- vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
- _mm_store_si128((__m128i*) acc, vacc_lo);
- _mm_store_si128((__m128i*) acc + 1, vacc_hi);
- acc += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
- const __m128i vright_shift = _mm_loadl_epi64((const __m128i*) params->sse2.right_shift);
-
i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
- if (rows < 2) {
+ if XNN_UNPREDICTABLE(rows < 2) {
i1 = zero;
}
i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
- if (rows <= 2) {
+ if XNN_UNPREDICTABLE(rows <= 2) {
i2 = zero;
}
i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
- if (rows < 4) {
+ if XNN_UNPREDICTABLE(rows < 4) {
i3 = zero;
}
i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
- if (rows <= 4) {
+ if XNN_UNPREDICTABLE(rows <= 4) {
i4 = zero;
}
i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
- if (rows < 6) {
+ if XNN_UNPREDICTABLE(rows < 6) {
i5 = zero;
}
i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
- if (rows <= 6) {
+ if XNN_UNPREDICTABLE(rows <= 6) {
i6 = zero;
}
- acc = buffer;
- while (channels >= 8) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
- __m128i vacc_lo = _mm_load_si128((const __m128i*) acc);
- __m128i vacc_hi = _mm_load_si128((const __m128i*) acc + 1);
- acc += 8;
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ for (; channels >= 8; channels -= 8) {
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
- vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
- vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
- const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
- const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
- const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
- const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
- const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
- const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
- const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
- const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
- const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
- const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
- const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
- const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
- const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
- const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
- __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
- vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point));
- vout = _mm_packus_epi16(vout, vout);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max));
- vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_min));
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
- _mm_storel_epi64((__m128i*) output, vout); output += 8;
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
- channels -= 8;
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
}
- if (channels != 0) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0);
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1);
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2);
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3);
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4);
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5);
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6);
- __m128i vacc_lo = _mm_load_si128((const __m128i*) acc);
- __m128i vacc_hi = _mm_load_si128((const __m128i*) acc + 1);
+ if XNN_UNLIKELY(channels != 0) {
+ {
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
- vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
- vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
- const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
- const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
- const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
- const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
- const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
- const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
- const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
- const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
- const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
- const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
- const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
- const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
- const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
- const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
- __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
- vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point));
- vout = _mm_packus_epi16(vout, vout);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max));
- vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_min));
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
- if (channels & 4) {
- *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout);
- output += 4;
- vout = _mm_srli_epi64(vout, 32);
- }
- if (channels & 2) {
- *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout, 0);
- output += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (channels & 1) {
- *((uint8_t*) output) = (uint8_t) _mm_cvtsi128_si32(vout);
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ }
}
}
}
-void xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8(
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8(
size_t rows,
size_t channels,
const uint8_t* input,
@@ -9443,182 +9488,179 @@
const uint8_t* i0 = input;
const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
- if (rows < 2) {
+ if XNN_UNPREDICTABLE(rows < 2) {
i1 = zero;
}
const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
- if (rows <= 2) {
+ if XNN_UNPREDICTABLE(rows <= 2) {
i2 = zero;
}
const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
- if (rows < 4) {
+ if XNN_UNPREDICTABLE(rows < 4) {
i3 = zero;
}
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
- if (rows <= 4) {
+ if XNN_UNPREDICTABLE(rows <= 4) {
i4 = zero;
}
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
- if (rows < 6) {
+ if XNN_UNPREDICTABLE(rows < 6) {
i5 = zero;
}
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- if (rows <= 6) {
+ if XNN_UNPREDICTABLE(rows <= 6) {
i6 = zero;
}
- const __m128i vbias = _mm_load_si128((const __m128i*) &params->sse2.bias);
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
const __m128i vzero = _mm_setzero_si128();
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
- const __m128i vright_shift = _mm_loadl_epi64((const __m128i*) params->sse2.right_shift);
+ for (; channels >= 8; channels -= 8) {
- while (channels >= 8) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
- __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
- __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
- const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
- const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
- const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
- const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
- const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
- const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
- const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
- const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
- const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
- const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
- const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
- const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
- const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
- const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
- __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
- vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point));
- vout = _mm_packus_epi16(vout, vout);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max));
- vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_min));
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
- _mm_storel_epi64((__m128i*) output, vout); output += 8;
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
- channels -= 8;
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
}
- if (channels != 0) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0);
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1);
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2);
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3);
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4);
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5);
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6);
+ if XNN_UNLIKELY(channels != 0) {
+ {
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
- __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
- __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
- const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
- const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
- const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
- const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
- const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
- const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
- const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
- const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
- const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
- const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
- const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
- const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
- const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
- const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
- __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
- vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point));
- vout = _mm_packus_epi16(vout, vout);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max));
- vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_min));
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
- if (channels & 4) {
- *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout);
- output += 4;
- vout = _mm_srli_epi64(vout, 32);
- }
- if (channels & 2) {
- *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout, 0);
- output += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (channels & 1) {
- *((uint8_t*) output) = (uint8_t) _mm_cvtsi128_si32(vout);
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ }
}
}
}
diff --git a/src/amalgam/sse41.c b/src/amalgam/sse41.c
index c233b0d..fc61f9a 100644
--- a/src/amalgam/sse41.c
+++ b/src/amalgam/sse41.c
@@ -3926,8 +3926,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), vinit_bias);
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -3970,8 +3973,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) (b + 0)));
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (b + 4)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -4034,8 +4040,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -4085,8 +4094,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -4192,8 +4204,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(vinit_bias, _mm_cvtepi16_epi32(vacc01234567));
- __m128i vacc4567 = _mm_add_epi32(vinit_bias, _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -4242,8 +4257,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), vinit_bias);
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), vinit_bias);
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -6579,6 +6597,431 @@
}
}
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
+ for (; channels >= 8; channels -= 8) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ }
+ }
+ }
+}
+
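The requantization path used by both QU8 SSE4.1 kernels in this file is: widen the 16-bit row sum to 32 bits, add the bias (or the 32-bit buffer), convert to float, multiply by scale, clamp against output_max_less_zero_point, convert back with _mm_cvtps_epi32 (round-to-nearest-even under the default MXCSR), add the output zero point with 16-bit saturation, pack to unsigned 8 bits, and clamp against output_min. A minimal scalar sketch of the same math, for reference only (not part of this patch):

    #include <math.h>
    #include <stdint.h>

    /* Scalar rendering of the fp32 requantization above; assumes the default
       round-to-nearest-even mode used by _mm_cvtps_epi32, and accumulator
       magnitudes small enough that lrintf() stays in int32 range (which holds
       for the sums these kernels produce). */
    static inline uint8_t requantize_fp32(int32_t acc, float scale,
                                          float output_max_less_zero_point,
                                          int32_t output_zero_point,
                                          uint8_t output_min) {
      float fpacc = (float) acc * scale;
      /* The upper clamp happens in the float domain, before the zero point. */
      if (fpacc > output_max_less_zero_point) fpacc = output_max_less_zero_point;
      const int32_t out = (int32_t) lrintf(fpacc) + output_zero_point;
      /* The lower clamp happens after the zero point is added, like _mm_max_epu8. */
      return out < (int32_t) output_min ? output_min : (uint8_t) out;
    }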
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
+ for (; channels >= 8; channels -= 8) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ }
+ }
+ }
+}
+
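For channel remainders (1..7) both kernels keep the eight packed results in the low lanes of one XMM register and store progressively narrower pieces, shifting the register between stores. The same idea in plain C, shown only to make the bit tests explicit (not part of this patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Plain-C sketch of the tail store above: write exactly `channels` (1..7)
       bytes from the packed result without overrunning the output buffer. */
    static void store_tail(uint8_t* output, const uint8_t packed[8], size_t channels) {
      size_t i = 0;
      if (channels & 4) { memcpy(output, packed + i, 4); output += 4; i += 4; }
      if (channels & 2) { memcpy(output, packed + i, 2); output += 2; i += 2; }
      if (channels & 1) { *output = packed[i]; }
    }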
void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
size_t mr,
size_t nc,
diff --git a/src/init.c b/src/init.c
index 2fb6074..c436043 100644
--- a/src/init.c
+++ b/src/init.c
@@ -357,10 +357,10 @@
.channel_tile = 8,
};
xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
- .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8,
- .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8,
- .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
- .update.qu8 = xnn_update_qu8_avgpool_minmax_neon_params,
+ .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8,
+ .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8,
+ .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_neon_params,
+ .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_neon_params,
.row_tile = 7,
.channel_tile = 8,
};
@@ -996,10 +996,10 @@
.channel_tile = 1,
};
xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
- .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
- .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
- .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
- .update.qu8 = xnn_update_qu8_avgpool_minmax_scalar_params,
+ .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
+ .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
+ .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
+ .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
.row_tile = 7,
.channel_tile = 1,
};
@@ -1987,10 +1987,10 @@
.channel_tile = 8,
};
xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
- .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8,
- .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8,
- .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
- .update.qu8 = xnn_update_qu8_avgpool_minmax_neon_params,
+ .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8,
+ .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8,
+ .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_neon_params,
+ .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_neon_params,
.row_tile = 7,
.channel_tile = 8,
};
@@ -3146,14 +3146,25 @@
.incremental_tile = 8,
.channel_tile = 8,
};
- xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
- .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8,
- .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__sse2_c8,
- .init.qu8 = xnn_init_qu8_avgpool_minmax_sse2_params,
- .update.qu8 = xnn_update_qu8_avgpool_minmax_sse2_params,
- .row_tile = 7,
- .channel_tile = 8,
- };
+ if (cpuinfo_has_x86_sse4_1()) {
+ xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
+ .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
+ .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
+ .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params,
+ .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params,
+ .row_tile = 7,
+ .channel_tile = 8,
+ };
+ } else {
+ xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
+ .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
+ .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
+ .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
+ .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params,
+ .row_tile = 7,
+ .channel_tile = 8,
+ };
+ }
if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
xnn_params.qu8.vadd = (struct vbinary_parameters) {
@@ -4367,12 +4378,12 @@
.channel_tile = 1,
};
xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
- .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
- .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
- .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
- .update.qu8 = xnn_update_qu8_avgpool_minmax_scalar_params,
+ .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
+ .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
+ .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params,
+ .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params,
.row_tile = 7,
- .channel_tile = 1,
+ .channel_tile = 16,
};
xnn_params.qu8.vadd = (struct vbinary_parameters) {
@@ -5230,12 +5241,12 @@
.channel_tile = 1,
};
xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
- .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
- .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
- .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
- .update.qu8 = xnn_update_qu8_avgpool_minmax_scalar_params,
+ .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
+ .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
+ .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
+ .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
.row_tile = 7,
- .channel_tile = 1,
+ .channel_tile = 4,
};
xnn_params.qu8.vadd = (struct vbinary_parameters) {
@@ -5819,10 +5830,10 @@
.channel_tile = 1,
};
xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
- .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
- .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
- .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
- .update.qu8 = xnn_update_qu8_avgpool_minmax_scalar_params,
+ .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
+ .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
+ .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
+ .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
.row_tile = 7,
.channel_tile = 1,
};
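All of the replacement kernels wired up above consume the new fp32 parameter layout: an int32 init_bias folded into the accumulator and a single float scale applied before rounding. As an assumption about the operator-level setup (it is not part of this diff), init_bias and scale for a global average pool over `rows` elements would be derived roughly as follows:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical helper, for illustration only: derive the values passed to
       xnn_init_qu8_avgpool_minmax_fp32_*_params, assuming the requantization
       out_q = clamp(round(acc * scale) + output_zero_point) with
       acc = sum(in_q) + init_bias over `rows` pooled elements. */
    static void derive_avgpool_requant_params(
        size_t rows, float input_scale, uint8_t input_zero_point,
        float output_scale, int32_t* init_bias, float* scale) {
      *init_bias = -(int32_t) input_zero_point * (int32_t) rows;
      *scale = input_scale / ((float) rows * output_scale);
    }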
diff --git a/src/params-init.c b/src/params-init.c
index 14cb4c3..3aee8a9 100644
--- a/src/params-init.c
+++ b/src/params-init.c
@@ -952,6 +952,285 @@
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+void xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ params->fp32_scalar_fmagic.init_bias = init_bias;
+ params->fp32_scalar_fmagic.scale = scale;
+ params->fp32_scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
+ params->fp32_scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
+ params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
+ params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
+}
+
+void xnn_update_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ params->fp32_scalar_fmagic.init_bias = init_bias;
+ params->fp32_scalar_fmagic.scale = scale;
+}
+
+void xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
+ const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
+ params->fp32_scalar_imagic.init_bias = init_bias;
+ params->fp32_scalar_imagic.scale = scale;
+ params->fp32_scalar_imagic.magic_bias = 12582912.0f;
+ params->fp32_scalar_imagic.magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
+ params->fp32_scalar_imagic.magic_max = (int32_t) fp32_to_bits(12582912.0f + output_max_less_zero_point);
+ params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
+}
+
+void xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ params->fp32_scalar_imagic.init_bias = init_bias;
+ params->fp32_scalar_imagic.scale = scale;
+}
+
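The scalar `imagic` variant rounds by exploiting the float representation: adding 12582912.0f (0x1.8p+23) places the rounded integer in the low mantissa bits, so the clamp can be done on the raw bit pattern (magic_min/magic_max) and the zero point folded into one final integer subtraction. A minimal sketch of that trick, assuming the value's magnitude stays well below 2^21 so the bias addition rounds exactly (not code from this diff):

    #include <stdint.h>
    #include <string.h>

    /* Sketch of the imagic rounding encoded by the params above. */
    static uint8_t requantize_imagic(float fpacc, int32_t magic_min,
                                     int32_t magic_max,
                                     int32_t magic_bias_less_zero_point) {
      const float biased = fpacc + 12582912.0f;  /* 0x1.8p+23 */
      int32_t bits;
      memcpy(&bits, &biased, sizeof(bits));      /* fp32_to_bits equivalent */
      if (bits < magic_min) bits = magic_min;    /* clamp in the integer domain */
      if (bits > magic_max) bits = magic_max;
      /* magic_bias_less_zero_point = fp32_to_bits(12582912.0f) - zero_point */
      return (uint8_t) (bits - magic_bias_less_zero_point);
    }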
+void xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ params->fp32_scalar_lrintf.init_bias = init_bias;
+ params->fp32_scalar_lrintf.scale = scale;
+ params->fp32_scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
+ params->fp32_scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
+ params->fp32_scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
+}
+
+void xnn_update_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ params->fp32_scalar_lrintf.init_bias = init_bias;
+ params->fp32_scalar_lrintf.scale = scale;
+}
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+void xnn_init_qu8_avgpool_minmax_fp32_sse2_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
+ for (uint32_t i = 0; i < 4; i++) {
+ params->fp32_sse2.init_bias[i] = init_bias;
+ params->fp32_sse2.scale[i] = scale;
+ params->fp32_sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
+ }
+ for (uint32_t i = 0; i < 8; i++) {
+ params->fp32_sse2.output_zero_point[i] = (int16_t) output_zero_point;
+ }
+ for (uint32_t i = 0; i < 16; i++) {
+ params->fp32_sse2.output_min[i] = output_min;
+ }
+}
+
+void xnn_update_qu8_avgpool_minmax_fp32_sse2_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ for (uint32_t i = 0; i < 4; i++) {
+ params->fp32_sse2.init_bias[i] = init_bias;
+ params->fp32_sse2.scale[i] = scale;
+ }
+}
+
+void xnn_init_qu8_avgpool_minmax_fp32_sse4_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
+ for (uint32_t i = 0; i < 4; i++) {
+ params->fp32_sse4.init_bias[i] = init_bias;
+ params->fp32_sse4.scale[i] = scale;
+ params->fp32_sse4.output_max_less_zero_point[i] = output_max_less_zero_point;
+ }
+ for (uint32_t i = 0; i < 8; i++) {
+ params->fp32_sse4.output_zero_point[i] = (int16_t) output_zero_point;
+ }
+ for (uint32_t i = 0; i < 16; i++) {
+ params->fp32_sse4.output_min[i] = output_min;
+ }
+}
+
+void xnn_update_qu8_avgpool_minmax_fp32_sse4_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ for (uint32_t i = 0; i < 4; i++) {
+ params->fp32_sse4.init_bias[i] = init_bias;
+ params->fp32_sse4.scale[i] = scale;
+ }
+}
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
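Putting the SSE4.1 kernel from the amalgam together with the init function just added above, an illustrative call sequence (values arbitrary, XNNPACK declarations assumed to be in scope; not part of this patch):

    #include <stdint.h>

    /* Example only: average 7 rows of 8 channels with the unipass kernel.
       init_bias/scale follow the assumed derivation noted earlier
       (-128 * 7 = -896; 0.03125f stands in for input_scale / (7 * output_scale)). */
    void example_qu8_gavgpool_7x8(const uint8_t input[7 * 8], uint8_t output[8]) {
      static const uint8_t zero[8] = {0};  /* pad row, unused when rows == 7 */
      union xnn_qu8_avgpool_minmax_params params;
      xnn_init_qu8_avgpool_minmax_fp32_sse4_params(
          &params, /*init_bias=*/-896, /*scale=*/0.03125f,
          /*output_zero_point=*/128, /*output_min=*/0, /*output_max=*/255);
      xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8(
          /*rows=*/7, /*channels=*/8, input, /*input_stride=*/8,
          zero, output, &params);
    }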
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+void xnn_init_qu8_avgpool_minmax_fp32_neon_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ params->fp32_neon.init_bias = init_bias;
+ params->fp32_neon.scale = scale;
+ params->fp32_neon.magic_bias = 12582912.0f;
+ params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
+ params->fp32_neon.output_min = output_min;
+ params->fp32_neon.output_max = output_max;
+}
+
+void xnn_update_qu8_avgpool_minmax_fp32_neon_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ params->fp32_neon.init_bias = init_bias;
+ params->fp32_neon.scale = scale;
+}
+
+void xnn_init_qu8_avgpool_minmax_fp32_neonv8_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ params->fp32_neonv8.init_bias = init_bias;
+ params->fp32_neonv8.scale = scale;
+ params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
+ params->fp32_neonv8.output_min = output_min;
+ params->fp32_neonv8.output_max = output_max;
+}
+
+void xnn_update_qu8_avgpool_minmax_fp32_neonv8_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ params->fp32_neonv8.init_bias = init_bias;
+ params->fp32_neonv8.scale = scale;
+}
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+void xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale,
+ uint8_t output_zero_point,
+ uint8_t output_min,
+ uint8_t output_max)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
+ const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
+ const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
+ for (uint32_t i = 0; i < 2; i++) {
+ params->fp32_wasmsimd.init_bias[i] = init_bias;
+ params->fp32_wasmsimd.scale[i] = scale;
+ params->fp32_wasmsimd.magic_bias[i] = 12582912.0f;
+ params->fp32_wasmsimd.magic_min[i] = magic_min;
+ params->fp32_wasmsimd.magic_bias_less_output_zero_point[i] = magic_bias_less_zero_point;
+ }
+ for (uint32_t i = 0; i < 8; i++) {
+ params->fp32_wasmsimd.output_max[i] = output_max;
+ }
+}
+
+void xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params(
+ union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
+ int32_t init_bias,
+ float scale)
+{
+ assert(scale >= 0x1.0p-32f);
+ assert(scale < 256.0f);
+
+ for (uint32_t i = 0; i < 2; i++) {
+ params->fp32_wasmsimd.init_bias[i] = init_bias;
+ params->fp32_wasmsimd.scale[i] = scale;
+ }
+}
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
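One property all of these row-tile-7 kernels rely on (both the QU8 ports above and the QS8 templates below, where the 16-bit partial sum is now named vsum*): seven 8-bit rows can be summed in a 16-bit lane without overflow, so widening to 32 bits happens only once per tile. The bound, spelled out (not code from this diff):

    #include <assert.h>
    #include <stdint.h>

    static void row_tile_sum_bounds(void) {
      assert(7 * 255 <= INT16_MAX);    /* QU8: largest 7-row sum is 1785 */
      assert(7 * 127 <= INT16_MAX);    /* QS8: largest positive sum is 889 */
      assert(7 * -128 >= INT16_MIN);   /* QS8: largest negative sum is -896 */
    }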
void xnn_init_qu8_avgpool_minmax_scalar_params(
union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
int32_t bias,
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
index 19f6690..d77caf3 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
@@ -47,33 +47,33 @@
const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -99,37 +99,37 @@
const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(b + 8);
int32x4_t vaccCDEF = vld1q_s32(b + 12);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -176,37 +176,37 @@
const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -229,11 +229,11 @@
vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
#endif // !XNN_ARCH_ARM64
@@ -254,22 +254,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -284,9 +284,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
index 67cef93..3b3a21f 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
@@ -49,46 +49,46 @@
const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
- int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vaccGHIJKLMN));
- int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vaccGHIJKLMN));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
+ const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
+ const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -102,20 +102,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -144,52 +144,52 @@
const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(b + 8);
int32x4_t vaccCDEF = vld1q_s32(b + 12);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
int32x4_t vaccGHIJ = vld1q_s32(b + 16);
int32x4_t vaccKLMN = vld1q_s32(b + 20);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
- vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vaccGHIJKLMN));
- vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vaccGHIJKLMN));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
+ vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
+ vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -203,22 +203,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -268,52 +268,52 @@
const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
- vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vaccGHIJKLMN));
- vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vaccGHIJKLMN));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
+ vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
+ vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -344,13 +344,13 @@
vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
- vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
- vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
#endif // !XNN_ARCH_ARM64
@@ -376,22 +376,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -406,9 +406,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
index 847ca8b..97c9ad8 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
@@ -51,59 +51,59 @@
const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
- int16x8_t vaccOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+ int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi2xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi3xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi4xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi5xOPQRSTUV);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi6xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
- int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vaccGHIJKLMN));
- int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vaccGHIJKLMN));
- int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vaccOPQRSTUV));
- int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vaccOPQRSTUV));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
+ const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
+ const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
+ const int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV));
+ const int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -119,20 +119,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -163,67 +163,67 @@
const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
- int16x8_t vaccOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+ int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi2xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi3xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi4xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi5xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(b + 8);
int32x4_t vaccCDEF = vld1q_s32(b + 12);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
int32x4_t vaccGHIJ = vld1q_s32(b + 16);
int32x4_t vaccKLMN = vld1q_s32(b + 20);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
int32x4_t vaccOPQR = vld1q_s32(b + 24);
int32x4_t vaccSTUV = vld1q_s32(b + 28);
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi6xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
- vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vaccGHIJKLMN));
- vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vaccGHIJKLMN));
- vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vaccOPQRSTUV));
- vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vaccOPQRSTUV));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
+ vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
+ vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
+ vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV));
+ vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -239,22 +239,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -306,67 +306,67 @@
const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
- int16x8_t vaccOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+ int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi2xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi3xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi4xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi5xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi6xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
- vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vaccGHIJKLMN));
- vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vaccGHIJKLMN));
- vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vaccOPQRSTUV));
- vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vaccOPQRSTUV));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
+ vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
+ vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
+ vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV));
+ vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -405,15 +405,15 @@
vaccSTUV = vqsubq_s32(vaccSTUV, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
- vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
- vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
- vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
- vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
#endif // !XNN_ARCH_ARM64
@@ -439,22 +439,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -469,9 +469,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
index 39c0be7..fb14ee1 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
@@ -45,20 +45,20 @@
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -80,22 +80,22 @@
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -138,22 +138,22 @@
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -168,9 +168,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif // !XNN_ARCH_ARM64
@@ -191,22 +191,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0);
const int8x8_t vi1x01234567 = vld1_s8(i1);
const int8x8_t vi2x01234567 = vld1_s8(i2);
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3);
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4);
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5);
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6);
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -221,9 +221,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
index 6dab079..3d382c7 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
@@ -48,33 +48,33 @@
const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -100,37 +100,37 @@
const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(b + 8);
int32x4_t vaccCDEF = vld1q_s32(b + 12);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -176,37 +176,37 @@
const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -224,11 +224,11 @@
vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
#endif // !XNN_ARCH_ARM64
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
@@ -251,22 +251,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -278,9 +278,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
index e6d90b4..fda1318 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
@@ -50,46 +50,46 @@
const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
- int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vaccGHIJKLMN));
- int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vaccGHIJKLMN));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
+ const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
+ const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -103,20 +103,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -145,52 +145,52 @@
const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(b + 8);
int32x4_t vaccCDEF = vld1q_s32(b + 12);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
int32x4_t vaccGHIJ = vld1q_s32(b + 16);
int32x4_t vaccKLMN = vld1q_s32(b + 20);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
- vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vaccGHIJKLMN));
- vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vaccGHIJKLMN));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
+ vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
+ vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -204,22 +204,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -268,52 +268,52 @@
const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
- vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vaccGHIJKLMN));
- vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vaccGHIJKLMN));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
+ vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
+ vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -337,13 +337,13 @@
vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
- vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
- vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
#endif // !XNN_ARCH_ARM64
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
@@ -372,22 +372,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -399,9 +399,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
index 397d783..2f6dc8a 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
@@ -52,59 +52,59 @@
const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
- int16x8_t vaccOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+ int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi2xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi3xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi4xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi5xOPQRSTUV);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi6xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
- int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vaccGHIJKLMN));
- int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vaccGHIJKLMN));
- int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vaccOPQRSTUV));
- int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vaccOPQRSTUV));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
+ const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
+ const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
+ const int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV));
+ const int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -120,20 +120,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -164,67 +164,67 @@
const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
- int16x8_t vaccOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+ int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi2xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi3xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi4xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi5xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(b + 8);
int32x4_t vaccCDEF = vld1q_s32(b + 12);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
int32x4_t vaccGHIJ = vld1q_s32(b + 16);
int32x4_t vaccKLMN = vld1q_s32(b + 20);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
int32x4_t vaccOPQR = vld1q_s32(b + 24);
int32x4_t vaccSTUV = vld1q_s32(b + 28);
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi6xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
- vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vaccGHIJKLMN));
- vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vaccGHIJKLMN));
- vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vaccOPQRSTUV));
- vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vaccOPQRSTUV));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
+ vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
+ vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
+ vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV));
+ vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -240,22 +240,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -306,67 +306,67 @@
const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
- int16x8_t vaccOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+ int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi2xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi3xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi4xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi5xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4;
int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi6xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
- vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vacc89ABCDEF));
- vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vacc89ABCDEF));
- vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vaccGHIJKLMN));
- vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vaccGHIJKLMN));
- vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vaccOPQRSTUV));
- vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vaccOPQRSTUV));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
+ vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
+ vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
+ vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
+ vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
+ vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV));
+ vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -396,15 +396,15 @@
vaccSTUV = vcvtnq_s32_f32(vfpaccSTUV);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
- vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
- vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
- vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
- vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
#endif // !XNN_ARCH_ARM64
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
@@ -434,22 +434,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -461,9 +461,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
index 32fa653..3618ffd 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
@@ -46,20 +46,20 @@
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -81,22 +81,22 @@
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(b);
int32x4_t vacc4567 = vld1q_s32(b + 4);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
vst1q_s32(b, vacc0123); b += 4;
vst1q_s32(b, vacc4567); b += 4;
@@ -138,22 +138,22 @@
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -165,9 +165,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif // !XNN_ARCH_ARM64
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
@@ -189,22 +189,22 @@
const int8x8_t vi0x01234567 = vld1_s8(i0);
const int8x8_t vi1x01234567 = vld1_s8(i1);
const int8x8_t vi2x01234567 = vld1_s8(i2);
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3);
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4);
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5);
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6);
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc01234567));
- vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc01234567));
+ vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
+ vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -216,9 +216,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
index 2bd23be..cdd0162 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
@@ -42,19 +42,19 @@
size_t c = channels;
do {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
@@ -75,19 +75,19 @@
size_t c = channels;
do {
int32_t vacc = *b;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
@@ -129,19 +129,19 @@
const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
do {
int32_t vacc = *buffer++;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
index 0bfca6e..c0160de 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
@@ -40,40 +40,40 @@
const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
i0 += 2;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -96,40 +96,40 @@
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
int32_t vacc0 = b[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = b[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
i0 += 2;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -174,41 +174,41 @@
const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
for (; channels >= 2; channels -= 2) {
int32_t vacc0 = buffer[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = buffer[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
buffer += 2;
i0 += 2;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -235,19 +235,19 @@
}
if XNN_UNLIKELY(channels != 0) {
int32_t vacc = *buffer;
- const int32_t vi0 = *i0;
- const int32_t vi1 = *i1;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
vacc += vi0;
- const int32_t vi2 = *i2;
+ const int32_t vi2 = (int32_t) *i2;
vacc += vi1;
- const int32_t vi3 = *i3;
+ const int32_t vi3 = (int32_t) *i3;
vacc += vi2;
- const int32_t vi4 = *i4;
+ const int32_t vi4 = (int32_t) *i4;
vacc += vi3;
- const int32_t vi5 = *i5;
+ const int32_t vi5 = (int32_t) *i5;
vacc += vi4;
- const int32_t vi6 = *i6;
+ const int32_t vi6 = (int32_t) *i6;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
index d1886c7..0b7516e 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
@@ -40,66 +40,66 @@
const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
- const int32_t vi0x2 = i0[2];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
i0 += 4;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
int32_t vacc2 = vi0x2 + vinit_bias;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
int32_t vacc3 = vi0x3 + vinit_bias;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -126,68 +126,68 @@
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
int32_t vacc0 = b[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = b[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
int32_t vacc2 = b[2];
- const int32_t vi0x2 = i0[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
int32_t vacc3 = b[3];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
i0 += 4;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
vacc2 += vi0x2;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
vacc3 += vi0x3;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -236,69 +236,69 @@
const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
for (; channels >= 4; channels -= 4) {
int32_t vacc0 = buffer[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = buffer[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
int32_t vacc2 = buffer[2];
- const int32_t vi0x2 = i0[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
int32_t vacc3 = buffer[3];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
buffer += 4;
i0 += 4;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
vacc2 += vi0x2;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
vacc3 += vi0x3;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -340,19 +340,19 @@
if XNN_UNLIKELY(channels != 0) {
do {
int32_t vacc = *buffer++;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
index e886844..4eed122 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
@@ -42,19 +42,19 @@
size_t c = channels;
do {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
@@ -75,19 +75,19 @@
size_t c = channels;
do {
int32_t vacc = *b;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
@@ -129,19 +129,19 @@
const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
do {
int32_t vacc = *buffer++;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
index d165964..2eea1f9 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
@@ -40,40 +40,40 @@
const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
i0 += 2;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -96,40 +96,40 @@
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
int32_t vacc0 = b[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = b[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
i0 += 2;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -174,41 +174,41 @@
const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
for (; channels >= 2; channels -= 2) {
int32_t vacc0 = buffer[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = buffer[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
buffer += 2;
i0 += 2;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -238,19 +238,19 @@
}
if XNN_UNLIKELY(channels != 0) {
int32_t vacc = *buffer;
- const int32_t vi0 = *i0;
- const int32_t vi1 = *i1;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
vacc += vi0;
- const int32_t vi2 = *i2;
+ const int32_t vi2 = (int32_t) *i2;
vacc += vi1;
- const int32_t vi3 = *i3;
+ const int32_t vi3 = (int32_t) *i3;
vacc += vi2;
- const int32_t vi4 = *i4;
+ const int32_t vi4 = (int32_t) *i4;
vacc += vi3;
- const int32_t vi5 = *i5;
+ const int32_t vi5 = (int32_t) *i5;
vacc += vi4;
- const int32_t vi6 = *i6;
+ const int32_t vi6 = (int32_t) *i6;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
index 4445098..30ee417 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
@@ -40,66 +40,66 @@
const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
- const int32_t vi0x2 = i0[2];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
i0 += 4;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
int32_t vacc2 = vi0x2 + vinit_bias;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
int32_t vacc3 = vi0x3 + vinit_bias;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -126,68 +126,68 @@
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
int32_t vacc0 = b[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = b[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
int32_t vacc2 = b[2];
- const int32_t vi0x2 = i0[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
int32_t vacc3 = b[3];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
i0 += 4;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
vacc2 += vi0x2;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
vacc3 += vi0x3;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -236,69 +236,69 @@
const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
for (; channels >= 4; channels -= 4) {
int32_t vacc0 = buffer[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = buffer[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
int32_t vacc2 = buffer[2];
- const int32_t vi0x2 = i0[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
int32_t vacc3 = buffer[3];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
buffer += 4;
i0 += 4;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
vacc2 += vi0x2;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
vacc3 += vi0x3;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -345,19 +345,19 @@
if XNN_UNLIKELY(channels != 0) {
do {
int32_t vacc = *buffer++;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
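
Note: the scalar hunks above make every per-element widening explicit with an (int32_t) cast. For the QS8 kernels the rows are const int8_t* and the implicit promotion already produces the same value, so this is a no-op for the generated QS8 code; writing the cast in the template is what lets the same source expand for QU8, where the rows are const uint8_t* and the accumulator stays int32_t. A minimal standalone sketch of the shared idiom (the accumulate_row_* names are illustrative only, not XNNPACK functions):

#include <stdint.h>
#include <stddef.h>

/* Widen-then-accumulate idiom shared by the QS8 and QU8 scalar templates. */
static void accumulate_row_s8(int32_t* acc, const int8_t* row, size_t n) {
  for (size_t c = 0; c < n; c++) {
    const int32_t v = (int32_t) row[c];  /* explicit sign-extending widen */
    acc[c] += v;
  }
}

static void accumulate_row_u8(int32_t* acc, const uint8_t* row, size_t n) {
  for (size_t c = 0; c < n; c++) {
    const int32_t v = (int32_t) row[c];  /* explicit zero-extending widen */
    acc[c] += v;
  }
}
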
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
index 90e0e54..c8d2a6e 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
@@ -41,19 +41,19 @@
size_t c = channels;
do {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
@@ -74,19 +74,19 @@
size_t c = channels;
do {
int32_t vacc = *b;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
@@ -127,19 +127,19 @@
const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
do {
int32_t vacc = *buffer++;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
index 9eb318f..003b08d 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
@@ -39,40 +39,40 @@
const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
i0 += 2;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -95,40 +95,40 @@
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
int32_t vacc0 = b[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = b[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
i0 += 2;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -172,41 +172,41 @@
const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
for (; channels >= 2; channels -= 2) {
int32_t vacc0 = buffer[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = buffer[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
buffer += 2;
i0 += 2;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -233,19 +233,19 @@
}
if XNN_UNLIKELY(channels != 0) {
int32_t vacc = *buffer;
- const int32_t vi0 = *i0;
- const int32_t vi1 = *i1;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
vacc += vi0;
- const int32_t vi2 = *i2;
+ const int32_t vi2 = (int32_t) *i2;
vacc += vi1;
- const int32_t vi3 = *i3;
+ const int32_t vi3 = (int32_t) *i3;
vacc += vi2;
- const int32_t vi4 = *i4;
+ const int32_t vi4 = (int32_t) *i4;
vacc += vi3;
- const int32_t vi5 = *i5;
+ const int32_t vi5 = (int32_t) *i5;
vacc += vi4;
- const int32_t vi6 = *i6;
+ const int32_t vi6 = (int32_t) *i6;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
index 3abeee9..a3fba3c 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
@@ -39,66 +39,66 @@
const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
- const int32_t vi0x2 = i0[2];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
i0 += 4;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
int32_t vacc2 = vi0x2 + vinit_bias;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
int32_t vacc3 = vi0x3 + vinit_bias;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -125,68 +125,68 @@
int32_t* b = buffer;
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
int32_t vacc0 = b[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = b[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
int32_t vacc2 = b[2];
- const int32_t vi0x2 = i0[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
int32_t vacc3 = b[3];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
i0 += 4;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
vacc2 += vi0x2;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
vacc3 += vi0x3;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -234,69 +234,69 @@
const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
for (; channels >= 4; channels -= 4) {
int32_t vacc0 = buffer[0];
- const int32_t vi0x0 = i0[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
int32_t vacc1 = buffer[1];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
int32_t vacc2 = buffer[2];
- const int32_t vi0x2 = i0[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
int32_t vacc3 = buffer[3];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
buffer += 4;
i0 += 4;
vacc0 += vi0x0;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
vacc1 += vi0x1;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
vacc2 += vi0x2;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
vacc3 += vi0x3;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -338,19 +338,19 @@
if XNN_UNLIKELY(channels != 0) {
do {
int32_t vacc = *buffer++;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
index a2558b0..854275f 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
@@ -99,11 +99,16 @@
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
- const __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), vinit_bias);
- const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), vinit_bias);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -182,11 +187,16 @@
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (b + 0)));
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (b + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
- const __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), _mm_load_si128((const __m128i*) (b + 8)));
- const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), _mm_load_si128((const __m128i*) (b + 12)));
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -285,11 +295,16 @@
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
- __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 8)));
- __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 12)));
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
buffer += 16;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -320,6 +335,7 @@
__m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+
_mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
output += 16;
}
@@ -364,8 +380,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
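
For the SSE2 kernels there is no _mm_cvtepi16_epi32, so the 16-bit row sum is widened by computing a per-lane sign mask and interleaving it into the high halves of the 32-bit lanes; the hunks above only separate this widening from the subsequent bias/buffer add. A small sketch of the widening step, assuming SSE2 only:

#include <emmintrin.h>  /* SSE2 */

/* Sign-extend 8 int16 lanes to two int32 vectors without SSE4.1:
   the sign mask is 0xFFFF wherever v16 < 0, and interleaving it as the
   upper 16 bits of each lane completes the extension. */
static void widen_i16x8_to_i32x4x2_sse2(__m128i v16, __m128i* lo, __m128i* hi) {
  const __m128i vsgn = _mm_cmpgt_epi16(_mm_setzero_si128(), v16);
  *lo = _mm_unpacklo_epi16(v16, vsgn);  /* lanes 0..3 as int32 */
  *hi = _mm_unpackhi_epi16(v16, vsgn);  /* lanes 4..7 as int32 */
}
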
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
index b29313a..c2000cd 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
@@ -119,14 +119,21 @@
vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
- const __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), vinit_bias);
- const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), vinit_bias);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
- const __m128i vaccGHIJ = _mm_add_epi32(_mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN), vinit_bias);
- const __m128i vaccKLMN = _mm_add_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN), vinit_bias);
+ __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
+ vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -177,8 +184,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -278,14 +288,21 @@
vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (b + 0)));
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (b + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
- const __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), _mm_load_si128((const __m128i*) (b + 8)));
- const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), _mm_load_si128((const __m128i*) (b + 12)));
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
- const __m128i vaccGHIJ = _mm_add_epi32(_mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN), _mm_load_si128((const __m128i*) (b + 16)));
- const __m128i vaccKLMN = _mm_add_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN), _mm_load_si128((const __m128i*) (b + 20)));
+ __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16)));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -336,8 +353,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) b));
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (b + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -457,14 +477,21 @@
vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
- __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 8)));
- __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 12)));
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
- __m128i vaccGHIJ = _mm_add_epi32(_mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN), _mm_load_si128((const __m128i*) (buffer + 16)));
- __m128i vaccKLMN = _mm_add_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN), _mm_load_si128((const __m128i*) (buffer + 20)));
+ __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16)));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20)));
buffer += 24;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -506,6 +533,7 @@
__m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
__m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+
_mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
_mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
output += 24;
@@ -551,8 +579,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
index a2cb4c4..81c4eaa 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
@@ -79,8 +79,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -137,8 +140,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (b + 0)));
- const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (b + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -215,8 +221,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -237,6 +246,7 @@
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
_mm_storel_epi64((__m128i*) output, vout0123456701234567);
output += 8;
}
@@ -281,8 +291,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
index 901710f..25fb554 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
@@ -78,10 +78,15 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), vinit_bias);
- const __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc89ABCDEF), vinit_bias);
- const __m128i vaccCDEF = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16), vinit_bias);
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+ __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -139,10 +144,15 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) (b + 0)));
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (b + 4)));
- const __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc89ABCDEF), _mm_load_si128((const __m128i*) (b + 8)));
- const __m128i vaccCDEF = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16), _mm_load_si128((const __m128i*) (b + 12)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+ __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -220,10 +230,15 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
- __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 8)));
- __m128i vaccCDEF = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16), _mm_load_si128((const __m128i*) (buffer + 12)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+ __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
buffer += 16;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -282,8 +297,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
index c5ce66d..048c8c8 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
@@ -91,12 +91,19 @@
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), vinit_bias);
- const __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc89ABCDEF), vinit_bias);
- const __m128i vaccCDEF = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16), vinit_bias);
- const __m128i vaccGHIJ = _mm_add_epi32(_mm_cvtepi16_epi32(vaccGHIJKLMN), vinit_bias);
- const __m128i vaccKLMN = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16), vinit_bias);
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+ __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16);
+ __m128i vaccGHIJ = _mm_cvtepi16_epi32(vaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
+ vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -132,8 +139,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), vinit_bias);
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -205,12 +215,19 @@
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) (b + 0)));
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (b + 4)));
- const __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc89ABCDEF), _mm_load_si128((const __m128i*) (b + 8)));
- const __m128i vaccCDEF = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16), _mm_load_si128((const __m128i*) (b + 12)));
- const __m128i vaccGHIJ = _mm_add_epi32(_mm_cvtepi16_epi32(vaccGHIJKLMN), _mm_load_si128((const __m128i*) (b + 16)));
- const __m128i vaccKLMN = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16), _mm_load_si128((const __m128i*) (b + 20)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+ __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16);
+ __m128i vaccGHIJ = _mm_cvtepi16_epi32(vaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16)));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -246,8 +263,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) b));
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (b + 4)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -339,12 +359,19 @@
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
- __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 8)));
- __m128i vaccCDEF = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16), _mm_load_si128((const __m128i*) (buffer + 12)));
- __m128i vaccGHIJ = _mm_add_epi32(_mm_cvtepi16_epi32(vaccGHIJKLMN), _mm_load_si128((const __m128i*) (buffer + 16)));
- __m128i vaccKLMN = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16), _mm_load_si128((const __m128i*) (buffer + 20)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+ __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16);
+ __m128i vaccGHIJ = _mm_cvtepi16_epi32(vaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16)));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20)));
buffer += 24;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -415,8 +442,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
index 1f4ccbd..ad41d96 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
@@ -65,8 +65,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), vinit_bias);
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), vinit_bias);
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -109,8 +112,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) (b + 0)));
- const __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (b + 4)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
_mm_store_si128((__m128i*) b, vacc0123);
_mm_store_si128((__m128i*) (b + 4), vacc4567);
@@ -173,8 +179,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -224,8 +233,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
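
The SSE4.1 kernels widen the same 16-bit row sum differently: the low four lanes through _mm_cvtepi16_epi32, the high four by duplicating each lane into the upper half of a 32-bit lane and arithmetically shifting right by 16. As above, the change only splits this widening from the bias/buffer add. A compact sketch:

#include <smmintrin.h>  /* SSE4.1 */

/* Sign-extend 8 int16 lanes to two int32 vectors using SSE4.1. */
static void widen_i16x8_to_i32x4x2_sse41(__m128i v16, __m128i* lo, __m128i* hi) {
  *lo = _mm_cvtepi16_epi32(v16);                               /* lanes 0..3 */
  *hi = _mm_srai_epi32(_mm_unpackhi_epi16(v16, v16), 16);      /* lanes 4..7 */
}
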
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
index f0a215a..d86b070 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
@@ -139,10 +139,15 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
- const v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(b + 0));
- const v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(b + 4));
- const v128_t vacc89AB = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc89ABCDEF), wasm_v128_load(b + 8));
- const v128_t vaccCDEF = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc89ABCDEF), wasm_v128_load(b + 12));
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+ v128_t vacc89AB = wasm_v128_load(b + 8);
+ v128_t vaccCDEF = wasm_v128_load(b + 12);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF));
wasm_v128_store(b, vacc0123);
wasm_v128_store(b + 4, vacc4567);
@@ -221,12 +226,17 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
- v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(buffer + 0));
- v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(buffer + 4));
- v128_t vacc89AB = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc89ABCDEF), wasm_v128_load(buffer + 8));
- v128_t vaccCDEF = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc89ABCDEF), wasm_v128_load(buffer + 12));
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ v128_t vacc89AB = wasm_v128_load(buffer + 8);
+ v128_t vaccCDEF = wasm_v128_load(buffer + 12);
buffer += 16;
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF));
+
vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
@@ -288,10 +298,13 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
- v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(buffer));
- v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(buffer + 4));
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
buffer += 8;
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+
vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
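
In the WAsm SIMD kernels the rewrite goes the other way: the 32-bit partial sums are loaded from the spill buffer first, then the widened halves of the 16-bit row sum are added to them. Integer addition is commutative, so the result is identical; only the expression shape changes so that the QS8 and QU8 templates stay in sync. A rough sketch of the reordered accumulation (add_row_sum is an illustrative name, not an XNNPACK function):

#include <stdint.h>
#include <wasm_simd128.h>  /* requires -msimd128 */

/* Load the spilled 32-bit partial sums, add the sign-extended halves of a
   16-bit row sum, and store the updated partials. */
static void add_row_sum(v128_t vsum16, const int32_t* buf, int32_t* out) {
  v128_t lo = wasm_v128_load(buf);
  v128_t hi = wasm_v128_load(buf + 4);
  lo = wasm_i32x4_add(lo, wasm_i32x4_extend_low_i16x8(vsum16));
  hi = wasm_i32x4_add(hi, wasm_i32x4_extend_high_i16x8(vsum16));
  wasm_v128_store(out, lo);
  wasm_v128_store(out + 4, hi);
}
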
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
index 1c10bc9..379b363 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
@@ -205,12 +205,19 @@
vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
- const v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(b + 0));
- const v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(b + 4));
- const v128_t vacc89AB = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc89ABCDEF), wasm_v128_load(b + 8));
- const v128_t vaccCDEF = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc89ABCDEF), wasm_v128_load(b + 12));
- const v128_t vaccGHIJ = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN), wasm_v128_load(b + 16));
- const v128_t vaccKLMN = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN), wasm_v128_load(b + 20));
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+ v128_t vacc89AB = wasm_v128_load(b + 8);
+ v128_t vaccCDEF = wasm_v128_load(b + 12);
+ v128_t vaccGHIJ = wasm_v128_load(b + 16);
+ v128_t vaccKLMN = wasm_v128_load(b + 20);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF));
+ vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN));
+ vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN));
wasm_v128_store(b, vacc0123);
wasm_v128_store(b + 4, vacc4567);
@@ -246,8 +253,11 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
- const v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(b));
- const v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(b + 4));
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
wasm_v128_store(b, vacc0123);
wasm_v128_store(b + 4, vacc4567);
@@ -340,14 +350,21 @@
vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
- v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(buffer + 0));
- v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(buffer + 4));
- v128_t vacc89AB = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc89ABCDEF), wasm_v128_load(buffer + 8));
- v128_t vaccCDEF = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc89ABCDEF), wasm_v128_load(buffer + 12));
- v128_t vaccGHIJ = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN), wasm_v128_load(buffer + 16));
- v128_t vaccKLMN = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN), wasm_v128_load(buffer + 20));
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ v128_t vacc89AB = wasm_v128_load(buffer + 8);
+ v128_t vaccCDEF = wasm_v128_load(buffer + 12);
+ v128_t vaccGHIJ = wasm_v128_load(buffer + 16);
+ v128_t vaccKLMN = wasm_v128_load(buffer + 20);
buffer += 24;
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF));
+ vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN));
+ vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN));
+
vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
@@ -423,10 +440,13 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
- v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(buffer));
- v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(buffer + 4));
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
buffer += 8;
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+
vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
index 57386ea..152aaf3 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
@@ -235,14 +235,23 @@
vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV);
- const v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(b + 0));
- const v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(b + 4));
- const v128_t vacc89AB = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc89ABCDEF), wasm_v128_load(b + 8));
- const v128_t vaccCDEF = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc89ABCDEF), wasm_v128_load(b + 12));
- const v128_t vaccGHIJ = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN), wasm_v128_load(b + 16));
- const v128_t vaccKLMN = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN), wasm_v128_load(b + 20));
- const v128_t vaccOPQR = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vaccOPQRSTUV), wasm_v128_load(b + 24));
- const v128_t vaccSTUV = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vaccOPQRSTUV), wasm_v128_load(b + 28));
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+ v128_t vacc89AB = wasm_v128_load(b + 8);
+ v128_t vaccCDEF = wasm_v128_load(b + 12);
+ v128_t vaccGHIJ = wasm_v128_load(b + 16);
+ v128_t vaccKLMN = wasm_v128_load(b + 20);
+ v128_t vaccOPQR = wasm_v128_load(b + 24);
+ v128_t vaccSTUV = wasm_v128_load(b + 28);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF));
+ vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN));
+ vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN));
+ vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_i32x4_extend_low_i16x8(vaccOPQRSTUV));
+ vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_i32x4_extend_high_i16x8(vaccOPQRSTUV));
wasm_v128_store(b, vacc0123);
wasm_v128_store(b + 4, vacc4567);
@@ -280,8 +289,11 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
- const v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(b));
- const v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(b + 4));
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
wasm_v128_store(b, vacc0123);
wasm_v128_store(b + 4, vacc4567);
@@ -387,16 +399,25 @@
vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV);
- v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(buffer + 0));
- v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(buffer + 4));
- v128_t vacc89AB = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc89ABCDEF), wasm_v128_load(buffer + 8));
- v128_t vaccCDEF = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc89ABCDEF), wasm_v128_load(buffer + 12));
- v128_t vaccGHIJ = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN), wasm_v128_load(buffer + 16));
- v128_t vaccKLMN = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN), wasm_v128_load(buffer + 20));
- v128_t vaccOPQR = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vaccOPQRSTUV), wasm_v128_load(buffer + 24));
- v128_t vaccSTUV = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vaccOPQRSTUV), wasm_v128_load(buffer + 28));
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ v128_t vacc89AB = wasm_v128_load(buffer + 8);
+ v128_t vaccCDEF = wasm_v128_load(buffer + 12);
+ v128_t vaccGHIJ = wasm_v128_load(buffer + 16);
+ v128_t vaccKLMN = wasm_v128_load(buffer + 20);
+ v128_t vaccOPQR = wasm_v128_load(buffer + 24);
+ v128_t vaccSTUV = wasm_v128_load(buffer + 28);
buffer += 32;
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF));
+ vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN));
+ vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN));
+ vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_i32x4_extend_low_i16x8(vaccOPQRSTUV));
+ vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_i32x4_extend_high_i16x8(vaccOPQRSTUV));
+
vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
@@ -483,10 +504,13 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
- v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(buffer));
- v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(buffer + 4));
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
buffer += 8;
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+
vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
index 3cb61e5..57680e7 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
@@ -109,8 +109,11 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
- const v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(b + 0));
- const v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(b + 4));
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
wasm_v128_store(b, vacc0123);
wasm_v128_store(b + 4, vacc4567);
@@ -174,10 +177,13 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
- v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(buffer + 0));
- v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(buffer + 4));
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
buffer += 8;
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+
vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
@@ -228,10 +234,13 @@
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
- v128_t vacc0123 = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc01234567), wasm_v128_load(buffer));
- v128_t vacc4567 = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc01234567), wasm_v128_load(buffer + 4));
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
buffer += 8;
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567));
+
vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c16.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c16.c
index 62dd43b..9072dfd 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c16.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c16.c
@@ -66,33 +66,33 @@
const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -115,11 +115,11 @@
vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
#endif // !XNN_ARCH_ARM64
@@ -140,20 +140,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -168,9 +168,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c24.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c24.c
index 0999a4d..95abd6c 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c24.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c24.c
@@ -68,46 +68,46 @@
const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
- int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vaccGHIJKLMN));
- int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vaccGHIJKLMN));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
+ int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
+ int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -138,13 +138,13 @@
vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
- vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
- vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
#endif // !XNN_ARCH_ARM64
@@ -170,20 +170,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -198,9 +198,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c32.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c32.c
index 0511c00..739c79a 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c32.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c32.c
@@ -70,59 +70,59 @@
const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
- int16x8_t vaccOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+ int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi2xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi3xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi4xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi5xOPQRSTUV);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi6xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
- int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vaccGHIJKLMN));
- int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vaccGHIJKLMN));
- int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vaccOPQRSTUV));
- int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vaccOPQRSTUV));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
+ int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
+ int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
+ int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV));
+ int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -161,15 +161,15 @@
vaccSTUV = vqsubq_s32(vaccSTUV, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
- vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
- vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
- vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
- vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
#endif // !XNN_ARCH_ARM64
@@ -195,20 +195,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -223,9 +223,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c8.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c8.c
index ed90fd9..3097368 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c8.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-neon-c8.c
@@ -64,20 +64,20 @@
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -92,9 +92,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif // !XNN_ARCH_ARM64
@@ -115,20 +115,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -143,9 +143,9 @@
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c16.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c16.c
index 1547ab1..2e582ff 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c16.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c16.c
@@ -66,33 +66,33 @@
const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -110,11 +110,11 @@
vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
#endif // !XNN_ARCH_ARM64
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
@@ -137,20 +137,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -162,9 +162,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c24.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c24.c
index cd40142..80f8ffe 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c24.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c24.c
@@ -68,46 +68,46 @@
const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
- int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vaccGHIJKLMN));
- int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vaccGHIJKLMN));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
+ int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
+ int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -131,13 +131,13 @@
vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
- vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
- vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
#endif // !XNN_ARCH_ARM64
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
@@ -166,20 +166,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -191,9 +191,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c32.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c32.c
index e548a07..c1ef78a 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c32.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c32.c
@@ -70,59 +70,59 @@
const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
- int16x8_t vacc89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
+ int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
- int16x8_t vaccGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
- int16x8_t vaccOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+ int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi2x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi2xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi2xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi3x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi3xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi3xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi4x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi4xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi4xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi5x89ABCDEF);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi5xOPQRSTUV);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
- vacc89ABCDEF = vaddw_s8(vacc89ABCDEF, vi6x89ABCDEF);
- vaccGHIJKLMN = vaddw_s8(vaccGHIJKLMN, vi6xGHIJKLMN);
- vaccOPQRSTUV = vaddw_s8(vaccOPQRSTUV, vi6xOPQRSTUV);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
- int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vacc89ABCDEF));
- int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vacc89ABCDEF));
- int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vaccGHIJKLMN));
- int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vaccGHIJKLMN));
- int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vaccOPQRSTUV));
- int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vaccOPQRSTUV));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
+ int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
+ int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
+ int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
+ int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
+ int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV));
+ int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -152,15 +152,15 @@
vaccSTUV = vcvtnq_s32_f32(vfpaccSTUV);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
- vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
- vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
- vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
- vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
- vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
- vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
#endif // !XNN_ARCH_ARM64
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
@@ -190,20 +190,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -215,9 +215,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
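In the NEON hunks above, the 16-bit row sum is renamed from vacc… to vsum…, and a fresh int16x8_t vacc… is declared at the vqmovn step. A plausible reason is that the generated QU8 variant needs an unsigned row sum (uint16x8_t) while the requantized 16-bit value stays signed, so the two can no longer share one variable. The sketch below illustrates the two widening-add idioms under that reading; the helper names are invented for illustration only.

  #include <arm_neon.h>

  /* QS8: signed widening sum, then signed widening add into the 32-bit bias. */
  static inline int32x4_t qs8_row_sum_lo(int8x8_t a, int8x8_t b, int32x4_t bias) {
    const int16x8_t vsum = vaddl_s8(a, b);
    return vaddw_s16(bias, vget_low_s16(vsum));
  }

  /* QU8: unsigned widening sum; the bias stays a signed int32x4_t, so the
   * unsigned widening add goes through reinterpret casts. */
  static inline int32x4_t qu8_row_sum_lo(uint8x8_t a, uint8x8_t b, int32x4_t bias) {
    const uint16x8_t vsum = vaddl_u8(a, b);
    return vreinterpretq_s32_u32(
        vaddw_u16(vreinterpretq_u32_s32(bias), vget_low_u16(vsum)));
  }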
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c8.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c8.c
index 170afde..35b66e7 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c8.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-neonv8-c8.c
@@ -64,20 +64,20 @@
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -89,9 +89,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else // !XNN_ARCH_ARM64
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif // !XNN_ARCH_ARM64
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
@@ -113,20 +113,20 @@
const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
- int16x8_t vacc01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
+ int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi2x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi3x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi4x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
- vacc01234567 = vaddw_s8(vacc01234567, vi5x01234567);
- vacc01234567 = vaddw_s8(vacc01234567, vi6x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
- int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vacc01234567));
- int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vacc01234567));
+ int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
+ int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
@@ -138,9 +138,9 @@
vacc4567 = vcvtnq_s32_f32(vfpacc4567);
#if XNN_ARCH_ARM64
- vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
- vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c1.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c1.c
index 73628ef..4584e08 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c1.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c1.c
@@ -62,19 +62,19 @@
const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
do {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
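The scalar kernels gain explicit (int32_t) casts on every loaded element and (int8_t) casts on every stored result. Presumably this keeps the shared template conversion-clean for both instantiations, since the QU8 variant reads and writes uint8_t while the accumulator stays int32_t. A minimal sketch of the idiom follows; the helper name is made up and the fp32 requantization between accumulate and store is omitted for brevity.

  #include <stdint.h>

  /* Two-row toy version of the scalar accumulate/store pattern for QS8. */
  static inline void accumulate_qs8(const int8_t* i0, const int8_t* i1,
                                    int32_t vinit_bias, int8_t* output) {
    int32_t vacc = vinit_bias;
    vacc += (int32_t) *i0;    /* widen each 8-bit input explicitly */
    vacc += (int32_t) *i1;
    *output = (int8_t) vacc;  /* narrow explicitly (requantization omitted) */
  }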
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c2.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c2.c
index 9d3f348..32622dd 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c2.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c2.c
@@ -61,40 +61,40 @@
const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
for (; channels >= 2; channels -= 2) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
i0 += 2;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -115,25 +115,25 @@
int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0) - vmagic_bias_less_output_zero_point;
int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1) - vmagic_bias_less_output_zero_point;
- output[0] = vout0;
- output[1] = vout1;
+ output[0] = (int8_t) vout0;
+ output[1] = (int8_t) vout1;
output += 2;
}
if XNN_UNLIKELY(channels != 0) {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0;
- const int32_t vi1 = *i1;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
vacc += vi0;
- const int32_t vi2 = *i2;
+ const int32_t vi2 = (int32_t) *i2;
vacc += vi1;
- const int32_t vi3 = *i3;
+ const int32_t vi3 = (int32_t) *i3;
vacc += vi2;
- const int32_t vi4 = *i4;
+ const int32_t vi4 = (int32_t) *i4;
vacc += vi3;
- const int32_t vi5 = *i5;
+ const int32_t vi5 = (int32_t) *i5;
vacc += vi4;
- const int32_t vi6 = *i6;
+ const int32_t vi6 = (int32_t) *i6;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c4.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c4.c
index 8e66daa..ca99488 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c4.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c4.c
@@ -61,66 +61,66 @@
const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
for (; channels >= 4; channels -= 4) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
- const int32_t vi0x2 = i0[2];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
i0 += 4;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
int32_t vacc2 = vi0x2 + vinit_bias;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
int32_t vacc3 = vi0x3 + vinit_bias;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -153,28 +153,28 @@
int32_t vout2 = (int32_t) fp32_to_bits(vfpacc2) - vmagic_bias_less_output_zero_point;
int32_t vout3 = (int32_t) fp32_to_bits(vfpacc3) - vmagic_bias_less_output_zero_point;
- output[0] = vout0;
- output[1] = vout1;
- output[2] = vout2;
- output[3] = vout3;
+ output[0] = (int8_t) vout0;
+ output[1] = (int8_t) vout1;
+ output[2] = (int8_t) vout2;
+ output[3] = (int8_t) vout3;
output += 4;
}
if XNN_UNLIKELY(channels != 0) {
do {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c
index 3cf6552..9592f50 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c
@@ -62,19 +62,19 @@
const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
do {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c2.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c2.c
index 6848ab1..2a9a37e 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c2.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c2.c
@@ -61,40 +61,40 @@
const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
for (; channels >= 2; channels -= 2) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
i0 += 2;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -118,25 +118,25 @@
vout0 -= vmagic_bias_less_zero_point;
vout1 -= vmagic_bias_less_zero_point;
- output[0] = vout0;
- output[1] = vout1;
+ output[0] = (int8_t) vout0;
+ output[1] = (int8_t) vout1;
output += 2;
}
if XNN_UNLIKELY(channels != 0) {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0;
- const int32_t vi1 = *i1;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
vacc += vi0;
- const int32_t vi2 = *i2;
+ const int32_t vi2 = (int32_t) *i2;
vacc += vi1;
- const int32_t vi3 = *i3;
+ const int32_t vi3 = (int32_t) *i3;
vacc += vi2;
- const int32_t vi4 = *i4;
+ const int32_t vi4 = (int32_t) *i4;
vacc += vi3;
- const int32_t vi5 = *i5;
+ const int32_t vi5 = (int32_t) *i5;
vacc += vi4;
- const int32_t vi6 = *i6;
+ const int32_t vi6 = (int32_t) *i6;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c4.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c4.c
index 352b612..6a628d1 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c4.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c4.c
@@ -61,66 +61,66 @@
const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
for (; channels >= 4; channels -= 4) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
- const int32_t vi0x2 = i0[2];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
i0 += 4;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
int32_t vacc2 = vi0x2 + vinit_bias;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
int32_t vacc3 = vi0x3 + vinit_bias;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -158,28 +158,28 @@
vout2 -= vmagic_bias_less_zero_point;
vout3 -= vmagic_bias_less_zero_point;
- output[0] = vout0;
- output[1] = vout1;
- output[2] = vout2;
- output[3] = vout3;
+ output[0] = (int8_t) vout0;
+ output[1] = (int8_t) vout1;
+ output[2] = (int8_t) vout2;
+ output[3] = (int8_t) vout3;
output += 4;
}
if XNN_UNLIKELY(channels != 0) {
do {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c1.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c1.c
index b6c75aa..a7deaa0 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c1.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c1.c
@@ -60,19 +60,19 @@
const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
do {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c2.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c2.c
index c1a303a..74c2582 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c2.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c2.c
@@ -59,40 +59,40 @@
const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
for (; channels >= 2; channels -= 2) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
i0 += 2;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
i1 += 2;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
i2 += 2;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
i3 += 2;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
i4 += 2;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
i5 += 2;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
i6 += 2;
vacc0 += vi6x0;
@@ -113,25 +113,25 @@
int32_t vout0 = vrndacc0 + voutput_zero_point;
int32_t vout1 = vrndacc1 + voutput_zero_point;
- output[0] = vout0;
- output[1] = vout1;
+ output[0] = (int8_t) vout0;
+ output[1] = (int8_t) vout1;
output += 2;
}
if XNN_UNLIKELY(channels != 0) {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0;
- const int32_t vi1 = *i1;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
vacc += vi0;
- const int32_t vi2 = *i2;
+ const int32_t vi2 = (int32_t) *i2;
vacc += vi1;
- const int32_t vi3 = *i3;
+ const int32_t vi3 = (int32_t) *i3;
vacc += vi2;
- const int32_t vi4 = *i4;
+ const int32_t vi4 = (int32_t) *i4;
vacc += vi3;
- const int32_t vi5 = *i5;
+ const int32_t vi5 = (int32_t) *i5;
vacc += vi4;
- const int32_t vi6 = *i6;
+ const int32_t vi6 = (int32_t) *i6;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c4.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c4.c
index b8b2036..cd83e0a 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c4.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c4.c
@@ -59,66 +59,66 @@
const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
for (; channels >= 4; channels -= 4) {
- const int32_t vi0x0 = i0[0];
- const int32_t vi0x1 = i0[1];
- const int32_t vi0x2 = i0[2];
- const int32_t vi0x3 = i0[3];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
i0 += 4;
int32_t vacc0 = vi0x0 + vinit_bias;
- const int32_t vi1x0 = i1[0];
+ const int32_t vi1x0 = (int32_t) i1[0];
int32_t vacc1 = vi0x1 + vinit_bias;
- const int32_t vi1x1 = i1[1];
+ const int32_t vi1x1 = (int32_t) i1[1];
int32_t vacc2 = vi0x2 + vinit_bias;
- const int32_t vi1x2 = i1[2];
+ const int32_t vi1x2 = (int32_t) i1[2];
int32_t vacc3 = vi0x3 + vinit_bias;
- const int32_t vi1x3 = i1[3];
+ const int32_t vi1x3 = (int32_t) i1[3];
i1 += 4;
vacc0 += vi1x0;
- const int32_t vi2x0 = i2[0];
+ const int32_t vi2x0 = (int32_t) i2[0];
vacc1 += vi1x1;
- const int32_t vi2x1 = i2[1];
+ const int32_t vi2x1 = (int32_t) i2[1];
vacc2 += vi1x2;
- const int32_t vi2x2 = i2[2];
+ const int32_t vi2x2 = (int32_t) i2[2];
vacc3 += vi1x3;
- const int32_t vi2x3 = i2[3];
+ const int32_t vi2x3 = (int32_t) i2[3];
i2 += 4;
vacc0 += vi2x0;
- const int32_t vi3x0 = i3[0];
+ const int32_t vi3x0 = (int32_t) i3[0];
vacc1 += vi2x1;
- const int32_t vi3x1 = i3[1];
+ const int32_t vi3x1 = (int32_t) i3[1];
vacc2 += vi2x2;
- const int32_t vi3x2 = i3[2];
+ const int32_t vi3x2 = (int32_t) i3[2];
vacc3 += vi2x3;
- const int32_t vi3x3 = i3[3];
+ const int32_t vi3x3 = (int32_t) i3[3];
i3 += 4;
vacc0 += vi3x0;
- const int32_t vi4x0 = i4[0];
+ const int32_t vi4x0 = (int32_t) i4[0];
vacc1 += vi3x1;
- const int32_t vi4x1 = i4[1];
+ const int32_t vi4x1 = (int32_t) i4[1];
vacc2 += vi3x2;
- const int32_t vi4x2 = i4[2];
+ const int32_t vi4x2 = (int32_t) i4[2];
vacc3 += vi3x3;
- const int32_t vi4x3 = i4[3];
+ const int32_t vi4x3 = (int32_t) i4[3];
i4 += 4;
vacc0 += vi4x0;
- const int32_t vi5x0 = i5[0];
+ const int32_t vi5x0 = (int32_t) i5[0];
vacc1 += vi4x1;
- const int32_t vi5x1 = i5[1];
+ const int32_t vi5x1 = (int32_t) i5[1];
vacc2 += vi4x2;
- const int32_t vi5x2 = i5[2];
+ const int32_t vi5x2 = (int32_t) i5[2];
vacc3 += vi4x3;
- const int32_t vi5x3 = i5[3];
+ const int32_t vi5x3 = (int32_t) i5[3];
i5 += 4;
vacc0 += vi5x0;
- const int32_t vi6x0 = i6[0];
+ const int32_t vi6x0 = (int32_t) i6[0];
vacc1 += vi5x1;
- const int32_t vi6x1 = i6[1];
+ const int32_t vi6x1 = (int32_t) i6[1];
vacc2 += vi5x2;
- const int32_t vi6x2 = i6[2];
+ const int32_t vi6x2 = (int32_t) i6[2];
vacc3 += vi5x3;
- const int32_t vi6x3 = i6[3];
+ const int32_t vi6x3 = (int32_t) i6[3];
i6 += 4;
vacc0 += vi6x0;
@@ -151,28 +151,28 @@
int32_t vout2 = vrndacc2 + voutput_zero_point;
int32_t vout3 = vrndacc3 + voutput_zero_point;
- output[0] = vout0;
- output[1] = vout1;
- output[2] = vout2;
- output[3] = vout3;
+ output[0] = (int8_t) vout0;
+ output[1] = (int8_t) vout1;
+ output[2] = (int8_t) vout2;
+ output[3] = (int8_t) vout3;
output += 4;
}
if XNN_UNLIKELY(channels != 0) {
do {
int32_t vacc = vinit_bias;
- const int32_t vi0 = *i0++;
- const int32_t vi1 = *i1++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
vacc += vi0;
- const int32_t vi2 = *i2++;
+ const int32_t vi2 = (int32_t) *i2++;
vacc += vi1;
- const int32_t vi3 = *i3++;
+ const int32_t vi3 = (int32_t) *i3++;
vacc += vi2;
- const int32_t vi4 = *i4++;
+ const int32_t vi4 = (int32_t) *i4++;
vacc += vi3;
- const int32_t vi5 = *i5++;
+ const int32_t vi5 = (int32_t) *i5++;
vacc += vi4;
- const int32_t vi6 = *i6++;
+ const int32_t vi6 = (int32_t) *i6++;
vacc += vi5;
vacc += vi6;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c16.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c16.c
index d9a1af1..136755e 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c16.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c16.c
@@ -117,11 +117,16 @@
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(vinit_bias, _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567));
- __m128i vacc4567 = _mm_add_epi32(vinit_bias, _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
- __m128i vacc89AB = _mm_add_epi32(vinit_bias, _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF));
- __m128i vaccCDEF = _mm_add_epi32(vinit_bias, _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF));
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -151,6 +156,7 @@
__m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+
_mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
output += 16;
}
@@ -195,8 +201,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
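In the SSE2 kernels, the 16-to-32-bit widening (sign-mask unpack) is now separated from the vinit_bias addition instead of being fused into one expression. A likely motivation is that only the widening step differs between datatypes: sign-extension for QS8 versus zero-extension for QU8, so splitting it lets the template swap that one line and keep the bias add common. Sketch under that assumption, with invented helper names.

  #include <emmintrin.h>  /* SSE2 */

  /* QS8: build a sign mask and unpack against it, then add the bias. */
  static inline __m128i widen_lo_qs8(__m128i v16, __m128i bias) {
    const __m128i vsgn = _mm_cmpgt_epi16(_mm_setzero_si128(), v16);
    const __m128i v32 = _mm_unpacklo_epi16(v16, vsgn);   /* sign-extend */
    return _mm_add_epi32(v32, bias);
  }

  /* QU8 (assumed form): unpack against zero, then the same bias add. */
  static inline __m128i widen_lo_qu8(__m128i v16, __m128i bias) {
    const __m128i v32 = _mm_unpacklo_epi16(v16, _mm_setzero_si128());
    return _mm_add_epi32(v32, bias);
  }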
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c24.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c24.c
index 418fafd..d1a76fc 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c24.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c24.c
@@ -137,14 +137,21 @@
vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(vinit_bias, _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567));
- __m128i vacc4567 = _mm_add_epi32(vinit_bias, _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
- __m128i vacc89AB = _mm_add_epi32(vinit_bias, _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF));
- __m128i vaccCDEF = _mm_add_epi32(vinit_bias, _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF));
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
- __m128i vaccGHIJ = _mm_add_epi32(vinit_bias, _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN));
- __m128i vaccKLMN = _mm_add_epi32(vinit_bias, _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN));
+ __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
+ vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -185,6 +192,7 @@
__m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
__m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+
_mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
_mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
output += 24;
@@ -230,8 +238,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c
index 09fbdac..fda1d0b 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c
@@ -97,8 +97,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(vinit_bias, _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567));
- __m128i vacc4567 = _mm_add_epi32(vinit_bias, _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567));
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -118,6 +121,7 @@
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
_mm_storel_epi64((__m128i*) output, vout0123456701234567);
output += 8;
}
@@ -162,8 +166,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
- __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vsgnacc01234567), vinit_bias);
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c16.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c16.c
index 523b8e4..91f2f7c 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c16.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c16.c
@@ -96,10 +96,15 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
- __m128i vacc0123 = _mm_add_epi32(vinit_bias, _mm_cvtepi16_epi32(vacc01234567));
- __m128i vacc4567 = _mm_add_epi32(vinit_bias, _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16));
- __m128i vacc89AB = _mm_add_epi32(vinit_bias, _mm_cvtepi16_epi32(vacc89ABCDEF));
- __m128i vaccCDEF = _mm_add_epi32(vinit_bias, _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+ __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -157,8 +162,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), vinit_bias);
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), vinit_bias);
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c24.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c24.c
index c0f099e..fb7a2f6 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c24.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c24.c
@@ -109,12 +109,19 @@
vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
- __m128i vacc0123 = _mm_add_epi32(vinit_bias, _mm_cvtepi16_epi32(vacc01234567));
- __m128i vacc4567 = _mm_add_epi32(vinit_bias, _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16));
- __m128i vacc89AB = _mm_add_epi32(vinit_bias, _mm_cvtepi16_epi32(vacc89ABCDEF));
- __m128i vaccCDEF = _mm_add_epi32(vinit_bias, _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16));
- __m128i vaccGHIJ = _mm_add_epi32(vinit_bias, _mm_cvtepi16_epi32(vaccGHIJKLMN));
- __m128i vaccKLMN = _mm_add_epi32(vinit_bias, _mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+ __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16);
+ __m128i vaccGHIJ = _mm_cvtepi16_epi32(vaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
+ vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -184,8 +191,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), vinit_bias);
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), vinit_bias);
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c
index eb013ab..9248911 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c
@@ -83,8 +83,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(vinit_bias, _mm_cvtepi16_epi32(vacc01234567));
- __m128i vacc4567 = _mm_add_epi32(vinit_bias, _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16));
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
@@ -133,8 +136,11 @@
vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
- __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc01234567), vinit_bias);
- __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16), vinit_bias);
+ __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
__m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
__m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
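The SSE4.1 kernels get the same split, with the signed widening expressed as _mm_cvtepi16_epi32 for the low half and an arithmetic-shift unpack for the high half. For the QU8 instantiation the widening would instead be unsigned, e.g. _mm_cvtepu16_epi32; that intrinsic choice is an assumption here, not taken from the diff. A small sketch with invented helper names.

  #include <smmintrin.h>  /* SSE4.1 */

  /* Signed 16->32 widening of the low four lanes, then the common bias add. */
  static inline __m128i widen_lo_sse41_qs8(__m128i v16, __m128i bias) {
    return _mm_add_epi32(_mm_cvtepi16_epi32(v16), bias);
  }

  /* Assumed unsigned counterpart for the QU8 instantiation. */
  static inline __m128i widen_lo_sse41_qu8(__m128i v16, __m128i bias) {
    return _mm_add_epi32(_mm_cvtepu16_epi32(v16), bias);
  }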
diff --git a/src/qs8-gavgpool/multipass-neon.c.in b/src/qs8-gavgpool/multipass-neon.c.in
index 7d607a3..e63f973 100644
--- a/src/qs8-gavgpool/multipass-neon.c.in
+++ b/src/qs8-gavgpool/multipass-neon.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 3
@@ -21,23 +22,46 @@
$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("neonv8" if ARMV8 else "neon")
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$XINT8X8_T = {"QS8": "int8x8_t", "QU8": "uint8x8_t"}[DATATYPE]
+$XINT8X16_T = {"QS8": "int8x16_t", "QU8": "uint8x16_t"}[DATATYPE]
+$XINT16X8_T = {"QS8": "int16x8_t", "QU8": "uint16x8_t"}[DATATYPE]
+$VLD1_X8 = {"QS8": "vld1_s8", "QU8": "vld1_u8"}[DATATYPE]
+$VLD1_DUP_X8 = {"QS8": "vld1_dup_s8", "QU8": "vld1_dup_u8"}[DATATYPE]
+$VLD1Q_DUP_X8 = {"QS8": "vld1q_dup_s8", "QU8": "vld1q_dup_u8"}[DATATYPE]
+$VST1_X8 = {"QS8": "vst1_s8", "QU8": "vst1_u8"}[DATATYPE]
+$VST1Q_X8 = {"QS8": "vst1q_s8", "QU8": "vst1q_u8"}[DATATYPE]
+$VST1_LANE_X8 = {"QS8": "vst1_lane_s8", "QU8": "vst1_lane_u8"}[DATATYPE]
+$VADDL_X8 = {"QS8": "vaddl_s8", "QU8": "vaddl_u8"}[DATATYPE]
+$VADDW_X8 = {"QS8": "vaddw_s8", "QU8": "vaddw_u8"}[DATATYPE]
+$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE]
+$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE]
+$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE]
+$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE]
+$VEXT_X8 = {"QS8": "vext_s8", "QU8": "vext_u8"}[DATATYPE]
+$VQMOVXN_S16 = {"QS8": "vqmovn_s16", "QU8": "vqmovun_s16"}[DATATYPE]
+$VQMOVXN_HIGH_S16 = {"QS8": "vqmovn_high_s16", "QU8": "vqmovun_high_s16"}[DATATYPE]
+$VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE]
+$VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE]
+$VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE]
+$VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE]
$ISA = "neonv8" if ARMV8 else "neon"
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__${ISA}_c${CHANNEL_TILE}(
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__${ISA}_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
+ const ${XINT8_T}* zero,
int32_t* buffer,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows > ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if CHANNEL_TILE <= 16:
const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
$else:
@@ -49,21 +73,25 @@
for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
$for M in range(2):
$for C in range(0, CHANNEL_TILE, 8):
- const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8;
+ const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8;
$for C in range(0, CHANNEL_TILE, 8):
- const int8x8_t vi2x${ABC[C:C+8]} = vld1_s8(i2); i2 += 8;
- int16x8_t vacc${ABC[C:C+8]} = vaddl_s8(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
+ const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8;
+ ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
$for M in range(2, ROW_TILE):
$for C in range(0, CHANNEL_TILE, 8):
$if M + 1 != ROW_TILE:
- const int8x8_t vi${M+1}x${ABC[C:C+8]} = vld1_s8(i${M+1}); i${M+1} += 8;
- vacc${ABC[C:C+8]} = vaddw_s8(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+ const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
+ vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 8):
- int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vinit_bias, vget_low_s16(vacc${ABC[C:C+8]}));
- int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vinit_bias, vget_high_s16(vacc${ABC[C:C+8]}));
+ $if DATATYPE == "QS8":
+ const int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[C:C+8]}));
+ const int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[C:C+8]}));
+ $else:
+ const int32x4_t vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[C:C+8]})));
+ const int32x4_t vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[C:C+8]})));
$for C in range(0, CHANNEL_TILE, 4):
vst1q_s32(b, vacc${ABC[C:C+4]}); b += 4;
@@ -72,16 +100,20 @@
if XNN_UNLIKELY(c != 0) {
do {
$for M in range(3):
- const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M}); i${M} += 8;
- int16x8_t vacc${ABC[0:8]} = vaddl_s8(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
+ const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); i${M} += 8;
+ ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
$for M in range(2, ROW_TILE):
$if M + 1 != ROW_TILE:
- const int8x8_t vi${M+1}x${ABC[0:8]} = vld1_s8(i${M+1}); i${M+1} += 8;
- vacc${ABC[0:8]} = vaddw_s8(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+ const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
+ vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]});
- int32x4_t vacc${ABC[0:4]} = vaddw_s16(vinit_bias, vget_low_s16(vacc${ABC[0:8]}));
- int32x4_t vacc${ABC[4:8]} = vaddw_s16(vinit_bias, vget_high_s16(vacc${ABC[0:8]}));
+ $if DATATYPE == "QS8":
+ const int32x4_t vacc${ABC[0:4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[0:8]}));
+ const int32x4_t vacc${ABC[4:8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[0:8]}));
+ $else:
+ const int32x4_t vacc${ABC[0:4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[0:8]})));
+ const int32x4_t vacc${ABC[4:8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[0:8]})));
vst1q_s32(b, vacc${ABC[0:4]}); b += 4;
vst1q_s32(b, vacc${ABC[4:8]}); b += 4;
@@ -92,34 +124,38 @@
for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
$for M in range(ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
int32_t* b = buffer;
size_t c = channels;
for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
$for M in range(2):
$for C in range(0, CHANNEL_TILE, 8):
- const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8;
+ const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8;
$for C in range(0, CHANNEL_TILE, 8):
- const int8x8_t vi2x${ABC[C:C+8]} = vld1_s8(i2); i2 += 8;
- int16x8_t vacc${ABC[C:C+8]} = vaddl_s8(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
+ const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8;
+ ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
$for M in range(2, ROW_TILE):
$for C in range(0, CHANNEL_TILE, 8):
$if M + 1 != ROW_TILE:
- const int8x8_t vi${M+1}x${ABC[C:C+8]} = vld1_s8(i${M+1}); i${M+1} += 8;
+ const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
$else:
$if C == 0:
int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(b);
$else:
int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(b + ${C});
int32x4_t vacc${ABC[C+4:C+8]} = vld1q_s32(b + ${C+4});
- vacc${ABC[C:C+8]} = vaddw_s8(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+ vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 8):
- vacc${ABC[C:C+4]} = vaddw_s16(vacc${ABC[C:C+4]}, vget_low_s16(vacc${ABC[C:C+8]}));
- vacc${ABC[C+4:C+8]} = vaddw_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vacc${ABC[C:C+8]}));
+ $if DATATYPE == "QS8":
+ vacc${ABC[C:C+4]} = vaddw_s16(vacc${ABC[C:C+4]}, vget_low_s16(vsum${ABC[C:C+8]}));
+ vacc${ABC[C+4:C+8]} = vaddw_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vsum${ABC[C:C+8]}));
+ $else:
+ vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[C:C+4]}), vget_low_u16(vsum${ABC[C:C+8]})));
+ vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[C+4:C+8]}), vget_high_u16(vsum${ABC[C:C+8]})));
$for C in range(0, CHANNEL_TILE, 4):
vst1q_s32(b, vacc${ABC[C:C+4]}); b += 4;
@@ -128,19 +164,23 @@
if XNN_UNLIKELY(c != 0) {
do {
$for M in range(3):
- const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M}); i${M} += 8;
- int16x8_t vacc${ABC[0:8]} = vaddl_s8(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
+ const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); i${M} += 8;
+ ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
$for M in range(2, ROW_TILE):
$if M + 1 != ROW_TILE:
- const int8x8_t vi${M+1}x${ABC[0:8]} = vld1_s8(i${M+1}); i${M+1} += 8;
+ const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
$else:
int32x4_t vacc${ABC[0:4]} = vld1q_s32(b);
int32x4_t vacc${ABC[4:8]} = vld1q_s32(b + 4);
- vacc${ABC[0:8]} = vaddw_s8(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+ vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]});
- vacc${ABC[0:4]} = vaddw_s16(vacc${ABC[0:4]}, vget_low_s16(vacc${ABC[0:8]}));
- vacc${ABC[4:8]} = vaddw_s16(vacc${ABC[4:8]}, vget_high_s16(vacc${ABC[0:8]}));
+ $if DATATYPE == "QS8":
+ vacc${ABC[0:4]} = vaddw_s16(vacc${ABC[0:4]}, vget_low_s16(vsum${ABC[0:8]}));
+ vacc${ABC[4:8]} = vaddw_s16(vacc${ABC[4:8]}, vget_high_s16(vsum${ABC[0:8]}));
+ $else:
+ vacc${ABC[0:4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[0:4]}), vget_low_u16(vsum${ABC[0:8]})));
+ vacc${ABC[4:8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[4:8]}), vget_high_u16(vsum${ABC[0:8]})));
vst1q_s32(b, vacc${ABC[0:4]}); b += 4;
vst1q_s32(b, vacc${ABC[4:8]}); b += 4;
@@ -150,9 +190,9 @@
}
}
- i0 = (const int8_t*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
+ i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
$for M in range(1, ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -169,32 +209,36 @@
const float32x4_t vmagic_bias = vld1q_dup_f32(&params->${PARAMS_STRUCT}.magic_bias);
const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->${PARAMS_STRUCT}.magic_bias_less_output_zero_point);
$if CHANNEL_TILE > 8:
- const int8x16_t voutput_min = vld1q_dup_s8(&params->${PARAMS_STRUCT}.output_min);
- const int8x16_t voutput_max = vld1q_dup_s8(&params->${PARAMS_STRUCT}.output_max);
+ const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
+ const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
$else:
- const int8x8_t voutput_min = vld1_dup_s8(&params->${PARAMS_STRUCT}.output_min);
- const int8x8_t voutput_max = vld1_dup_s8(&params->${PARAMS_STRUCT}.output_max);
+ const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
+ const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
$for M in range(2):
$for C in range(0, CHANNEL_TILE, 8):
- const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8;
+ const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8;
$for C in range(0, CHANNEL_TILE, 8):
- const int8x8_t vi2x${ABC[C:C+8]} = vld1_s8(i2); i2 += 8;
- int16x8_t vacc${ABC[C:C+8]} = vaddl_s8(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
+ const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8;
+ ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
$for M in range(2, ROW_TILE):
$for C in range(0, CHANNEL_TILE, 8):
$if M + 1 != ROW_TILE:
- const int8x8_t vi${M+1}x${ABC[C:C+8]} = vld1_s8(i${M+1}); i${M+1} += 8;
+ const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
$else:
int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc${ABC[C+4:C+8]} = vld1q_s32(buffer); buffer += 4;
- vacc${ABC[C:C+8]} = vaddw_s8(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+ vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 8):
- vacc${ABC[C:C+4]} = vaddw_s16(vacc${ABC[C:C+4]}, vget_low_s16(vacc${ABC[C:C+8]}));
- vacc${ABC[C+4:C+8]} = vaddw_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vacc${ABC[C:C+8]}));
+ $if DATATYPE == "QS8":
+ vacc${ABC[C:C+4]} = vaddw_s16(vacc${ABC[C:C+4]}, vget_low_s16(vsum${ABC[C:C+8]}));
+ vacc${ABC[C+4:C+8]} = vaddw_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vsum${ABC[C:C+8]}));
+ $else:
+ vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[C:C+4]}), vget_low_u16(vsum${ABC[C:C+8]})));
+ vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[C+4:C+8]}), vget_high_u16(vsum${ABC[C:C+8]})));
$for C in range(0, CHANNEL_TILE, 4):
float32x4_t vfpacc${ABC[C:C+4]} = vcvtq_f32_s32(vacc${ABC[C:C+4]});
@@ -214,10 +258,10 @@
#if XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 8):
- vacc${ABC[C:C+8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc${ABC[C+4:C+8]});
+ int16x8_t vacc${ABC[C:C+8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc${ABC[C+4:C+8]});
#else // !XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 8):
- vacc${ABC[C:C+8]} = vcombine_s16(vqmovn_s32(vacc${ABC[C:C+4]}), vqmovn_s32(vacc${ABC[C+4:C+8]}));
+ int16x8_t vacc${ABC[C:C+8]} = vcombine_s16(vqmovn_s32(vacc${ABC[C:C+4]}), vqmovn_s32(vacc${ABC[C+4:C+8]}));
#endif // !XNN_ARCH_ARM64
$if ARMV8:
@@ -227,61 +271,65 @@
#if XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- int8x16_t vout${ABC[C:C+16]} = vqmovn_high_s16(vqmovn_s16(vacc${ABC[C:C+8]}), vacc${ABC[C+8:C+16]});
+ ${XINT8X16_T} vout${ABC[C:C+16]} = ${VQMOVXN_HIGH_S16}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), vacc${ABC[C+8:C+16]});
$else:
- int8x8_t vout${ABC[C:C+8]} = vqmovn_s16(vacc${ABC[C:C+8]});
+ ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]});
#else // !XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- int8x16_t vout${ABC[C:C+16]} = vcombine_s8(vqmovn_s16(vacc${ABC[C:C+8]}), vqmovn_s16(vacc${ABC[C+8:C+16]}));
+ ${XINT8X16_T} vout${ABC[C:C+16]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), ${VQMOVXN_S16}(vacc${ABC[C+8:C+16]}));
$else:
- int8x8_t vout${ABC[C:C+8]} = vqmovn_s16(vacc${ABC[C:C+8]});
+ ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]});
#endif // !XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vout${ABC[C:C+16]} = vmaxq_s8(vout${ABC[C:C+16]}, voutput_min);
+ vout${ABC[C:C+16]} = ${VMAXQ_X8}(vout${ABC[C:C+16]}, voutput_min);
$elif CHANNEL_TILE > 8:
- vout${ABC[C:C+8]} = vmax_s8(vout${ABC[C:C+8]}, vget_low_s8(voutput_min));
+ vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_min));
$else:
- vout${ABC[C:C+8]} = vmax_s8(vout${ABC[C:C+8]}, voutput_min);
+ vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, voutput_min);
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vout${ABC[C:C+16]} = vminq_s8(vout${ABC[C:C+16]}, voutput_max);
+ vout${ABC[C:C+16]} = ${VMINQ_X8}(vout${ABC[C:C+16]}, voutput_max);
$elif CHANNEL_TILE > 8:
- vout${ABC[C:C+8]} = vmin_s8(vout${ABC[C:C+8]}, vget_low_s8(voutput_max));
+ vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_max));
$else:
- vout${ABC[C:C+8]} = vmin_s8(vout${ABC[C:C+8]}, voutput_max);
+ vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, voutput_max);
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vst1q_s8(output, vout${ABC[C:C+16]}); output += 16;
+ ${VST1Q_X8}(output, vout${ABC[C:C+16]}); output += 16;
$else:
- vst1_s8(output, vout${ABC[C:C+8]}); output += 8;
+ ${VST1_X8}(output, vout${ABC[C:C+8]}); output += 8;
}
if XNN_UNLIKELY(channels != 0) {
${"do " if CHANNEL_TILE > 8 else ""}{
$for M in range(3):
$if CHANNEL_TILE > 8:
- const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M}); i${M} += 8;
+ const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); i${M} += 8;
$else:
- const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M});
- int16x8_t vacc${ABC[0:8]} = vaddl_s8(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
+ const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M});
+ ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
$for M in range(2, ROW_TILE):
$if M + 1 != ROW_TILE:
$if CHANNEL_TILE > 8:
- const int8x8_t vi${M+1}x${ABC[0:8]} = vld1_s8(i${M+1}); i${M+1} += 8;
+ const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
$else:
- const int8x8_t vi${M+1}x${ABC[0:8]} = vld1_s8(i${M+1});
+ const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1});
$else:
int32x4_t vacc${ABC[0:4]} = vld1q_s32(buffer); buffer += 4;
int32x4_t vacc${ABC[4:8]} = vld1q_s32(buffer); buffer += 4;
- vacc${ABC[0:8]} = vaddw_s8(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+ vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]});
- vacc${ABC[0:4]} = vaddw_s16(vacc${ABC[0:4]}, vget_low_s16(vacc${ABC[0:8]}));
- vacc${ABC[4:8]} = vaddw_s16(vacc${ABC[4:8]}, vget_high_s16(vacc${ABC[0:8]}));
+ $if DATATYPE == "QS8":
+ vacc${ABC[0:4]} = vaddw_s16(vacc${ABC[0:4]}, vget_low_s16(vsum${ABC[0:8]}));
+ vacc${ABC[4:8]} = vaddw_s16(vacc${ABC[4:8]}, vget_high_s16(vsum${ABC[0:8]}));
+ $else:
+ vacc${ABC[0:4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[0:4]}), vget_low_u16(vsum${ABC[0:8]})));
+ vacc${ABC[4:8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[4:8]}), vget_high_u16(vsum${ABC[0:8]})));
float32x4_t vfpacc${ABC[0:4]} = vcvtq_f32_s32(vacc${ABC[0:4]});
float32x4_t vfpacc${ABC[4:8]} = vcvtq_f32_s32(vacc${ABC[4:8]});
@@ -300,49 +348,49 @@
vacc${ABC[4:8]} = vqsubq_s32(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc${ABC[0:8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[0:4]}), vacc${ABC[4:8]});
+ int16x8_t vacc${ABC[0:8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[0:4]}), vacc${ABC[4:8]});
#else
- vacc${ABC[0:8]} = vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]}));
+ int16x8_t vacc${ABC[0:8]} = vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]}));
#endif
$if ARMV8:
vacc${ABC[0:8]} = vqaddq_s16(vacc${ABC[0:8]}, voutput_zero_point);
- int8x8_t vout${ABC[0:8]} = vqmovn_s16(vacc${ABC[0:8]});
+ ${XINT8X8_T} vout${ABC[0:8]} = ${VQMOVXN_S16}(vacc${ABC[0:8]});
$if CHANNEL_TILE > 8:
- vout${ABC[0:8]} = vmax_s8(vout${ABC[0:8]}, vget_low_s8(voutput_min));
- vout${ABC[0:8]} = vmin_s8(vout${ABC[0:8]}, vget_low_s8(voutput_max));
+ vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min));
+ vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max));
if XNN_LIKELY(channels >= 8) {
- vst1_s8(output, vout${ABC[0:8]}); output += 8;
+ ${VST1_X8}(output, vout${ABC[0:8]}); output += 8;
channels -= 8;
} else {
if (channels & 4) {
- vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout${ABC[0:8]}), 0); output += 4;
- vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
+ vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
}
if (channels & 2) {
- vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout${ABC[0:8]}), 0); output += 2;
- vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
+ vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
}
if (channels & 1) {
- vst1_lane_s8(output, vout${ABC[0:8]}, 0); output += 1;
+ ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0); output += 1;
}
channels = 0;
}
$else:
- vout${ABC[0:8]} = vmax_s8(vout${ABC[0:8]}, voutput_min);
- vout${ABC[0:8]} = vmin_s8(vout${ABC[0:8]}, voutput_max);
+ vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, voutput_min);
+ vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, voutput_max);
if (channels & 4) {
- vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout${ABC[0:8]}), 0); output += 4;
- vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
+ vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
}
if (channels & 2) {
- vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout${ABC[0:8]}), 0); output += 2;
- vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
+ vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
}
if (channels & 1) {
- vst1_lane_s8(output, vout${ABC[0:8]}, 0);
+ ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0);
}
}${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
}
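Note: the QU8 branch added above keeps the accumulators as int32x4_t but performs the 16-to-32 widening add on an unsigned view, since the uint16 row sums are non-negative. A minimal standalone sketch of that idiom follows; the function name, the fixed 3-row shape, and the argument layout are illustrative only, not part of the generated kernels.

// Sketch (assumption: uint8 inputs, 8 channels, 3 rows) of the QU8 widening path
// the NEON template emits: vaddl_u8/vaddw_u8 into a uint16 row sum, then an
// unsigned 16->32 widening add applied to a uint32 reinterpretation of the
// signed int32 accumulators.
#include <arm_neon.h>
#include <stdint.h>

static void qu8_accumulate_c8(const uint8_t* i0, const uint8_t* i1,
                              const uint8_t* i2, int32_t acc[8])
{
  const uint8x8_t vi0 = vld1_u8(i0);
  const uint8x8_t vi1 = vld1_u8(i1);
  const uint8x8_t vi2 = vld1_u8(i2);

  // 8-bit -> 16-bit widening adds; a handful of uint8 rows cannot overflow uint16.
  uint16x8_t vsum = vaddl_u8(vi0, vi1);
  vsum = vaddw_u8(vsum, vi2);

  // Accumulators stay int32; widen the unsigned row sum on a uint32 view and
  // reinterpret back, mirroring the $else (QU8) branch of the template.
  int32x4_t vacc0123 = vld1q_s32(acc);
  int32x4_t vacc4567 = vld1q_s32(acc + 4);
  vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum)));
  vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum)));
  vst1q_s32(acc, vacc0123);
  vst1q_s32(acc + 4, vacc4567);
}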
diff --git a/src/qs8-gavgpool/multipass-scalar.c.in b/src/qs8-gavgpool/multipass-scalar.c.in
index 5fd6a1e..aa7c335 100644
--- a/src/qs8-gavgpool/multipass-scalar.c.in
+++ b/src/qs8-gavgpool/multipass-scalar.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE >= 1
$assert CHANNEL_TILE <= 16
$assert ROW_TILE >= 3
@@ -21,24 +22,25 @@
$PARAMS_STRUCT = "fp32_scalar_" + VARIANT.lower()
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__scalar_${VARIANT.lower()}_c${CHANNEL_TILE}(
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__scalar_${VARIANT.lower()}_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
+ const ${XINT8_T}* zero,
int32_t* buffer,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(rows > ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if CHANNEL_TILE <= 16:
const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
$else:
@@ -51,11 +53,11 @@
do {
int32_t vacc = vinit_bias;
$for M in range(2):
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(2, ROW_TILE):
vacc += vi${M-2};
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(ROW_TILE - 2, ROW_TILE):
vacc += vi${M};
@@ -65,18 +67,18 @@
$else:
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= ${CHANNEL_TILE}) {
$for C in range(CHANNEL_TILE):
- const int32_t vi0x${C} = i0[${C}];
+ const int32_t vi0x${C} = (int32_t) i0[${C}];
i0 += ${CHANNEL_TILE};
$for C in range(CHANNEL_TILE):
int32_t vacc${C} = vi0x${C} + vinit_bias;
- const int32_t vi1x${C} = i1[${C}];
+ const int32_t vi1x${C} = (int32_t) i1[${C}];
i1 += ${CHANNEL_TILE};
$for M in range(2, ROW_TILE):
$for C in range(CHANNEL_TILE):
vacc${C} += vi${M-1}x${C};
- const int32_t vi${M}x${C} = i${M}[${C}];
+ const int32_t vi${M}x${C} = (int32_t) i${M}[${C}];
i${M} += ${CHANNEL_TILE};
$for C in range(CHANNEL_TILE):
@@ -89,7 +91,7 @@
for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
$for M in range(ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
int32_t* b = buffer;
$if CHANNEL_TILE == 1:
@@ -97,11 +99,11 @@
do {
int32_t vacc = *b;
$for M in range(2):
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(2, ROW_SUBTILE):
vacc += vi${M-2};
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(ROW_SUBTILE - 2, ROW_SUBTILE):
vacc += vi${M};
@@ -112,13 +114,13 @@
for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= ${CHANNEL_TILE}) {
$for C in range(CHANNEL_TILE):
int32_t vacc${C} = b[${C}];
- const int32_t vi0x${C} = i0[${C}];
+ const int32_t vi0x${C} = (int32_t) i0[${C}];
i0 += ${CHANNEL_TILE};
$for M in range(1, ROW_SUBTILE):
$for C in range(CHANNEL_TILE):
vacc${C} += vi${M-1}x${C};
- const int32_t vi${M}x${C} = i${M}[${C}];
+ const int32_t vi${M}x${C} = (int32_t) i${M}[${C}];
i${M} += ${CHANNEL_TILE};
$for C in range(CHANNEL_TILE):
@@ -130,9 +132,9 @@
}
}
- i0 = (const int8_t*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
+ i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
$for M in range(1, ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -161,11 +163,11 @@
do {
int32_t vacc = *buffer++;
$for M in range(2):
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(2, ROW_SUBTILE):
vacc += vi${M-2};
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(ROW_SUBTILE - 2, ROW_SUBTILE):
vacc += vi${M};
@@ -188,20 +190,20 @@
const int32_t vrndacc = (int32_t) lrintf(vfpacc);
int32_t vout = vrndacc + voutput_zero_point;
- *output++ = (int8_t) vout;
+ *output++ = (${XINT8_T}) vout;
} while (--channels != 0);
$else:
for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
$for C in range(CHANNEL_TILE):
int32_t vacc${C} = buffer[${C}];
- const int32_t vi0x${C} = i0[${C}];
+ const int32_t vi0x${C} = (int32_t) i0[${C}];
buffer += ${CHANNEL_TILE};
i0 += ${CHANNEL_TILE};
$for M in range(1, ROW_SUBTILE):
$for C in range(CHANNEL_TILE):
vacc${C} += vi${M-1}x${C};
- const int32_t vi${M}x${C} = i${M}[${C}];
+ const int32_t vi${M}x${C} = (int32_t) i${M}[${C}];
i${M} += ${CHANNEL_TILE};
$for C in range(CHANNEL_TILE):
@@ -251,18 +253,18 @@
int32_t vout${C} = vrndacc${C} + voutput_zero_point;
$for C in range(CHANNEL_TILE):
- output[${C}] = (int8_t) vout${C};
+ output[${C}] = (${XINT8_T}) vout${C};
output += ${CHANNEL_TILE};
}
if XNN_UNLIKELY(channels != 0) {
$if CHANNEL_TILE == 2:
int32_t vacc = *buffer;
$for M in range(2):
- const int32_t vi${M} = *i${M};
+ const int32_t vi${M} = (int32_t) *i${M};
$for M in range(2, ROW_SUBTILE):
vacc += vi${M-2};
- const int32_t vi${M} = *i${M};
+ const int32_t vi${M} = (int32_t) *i${M};
$for M in range(ROW_SUBTILE - 2, ROW_SUBTILE):
vacc += vi${M};
@@ -285,16 +287,16 @@
const int32_t vrndacc = (int32_t) lrintf(vfpacc);
int32_t vout = vrndacc + voutput_zero_point;
- *output = (int8_t) vout;
+ *output = (${XINT8_T}) vout;
$else:
do {
int32_t vacc = *buffer++;
$for M in range(2):
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(2, ROW_SUBTILE):
vacc += vi${M-2};
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(ROW_SUBTILE - 2, ROW_SUBTILE):
vacc += vi${M};
@@ -317,7 +319,7 @@
const int32_t vrndacc = (int32_t) lrintf(vfpacc);
int32_t vout = vrndacc + voutput_zero_point;
- *output++ = (int8_t) vout;
+ *output++ = (${XINT8_T}) vout;
} while (--channels != 0);
}
}
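Note: in the scalar template the only datatype-sensitive pieces are the ${XINT8_T} pointer type and the explicit (int32_t) casts on each load, which widen correctly whether the byte is signed or unsigned. A hedged sketch of that per-channel accumulation follows; the function name, its first-pass-only shape, and the column-major loop order are illustrative, not the template's exact structure.

// Sketch (assumption: QU8 instantiation, i.e. XINT8_T == uint8_t) of the scalar
// first pass: seed each channel's int32 accumulator with the init bias, then add
// one explicitly widened byte per row.
#include <stddef.h>
#include <stdint.h>

#define XINT8_T uint8_t  /* int8_t for the QS8 instantiation */

static void gavgpool_scalar_first_pass(size_t rows, size_t channels,
                                       const XINT8_T* input, size_t input_stride,
                                       int32_t init_bias, int32_t* buffer)
{
  for (size_t c = 0; c < channels; c++) {
    const XINT8_T* i = input + c;
    int32_t vacc = init_bias;
    for (size_t m = 0; m < rows; m++) {
      vacc += (int32_t) *i;  // explicit widening cast; valid for signed and unsigned bytes
      i = (const XINT8_T*) ((uintptr_t) i + input_stride);
    }
    buffer[c] = vacc;
  }
}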
diff --git a/src/qs8-gavgpool/multipass-sse2.c.in b/src/qs8-gavgpool/multipass-sse2.c.in
index 1518b3a..d554063 100644
--- a/src/qs8-gavgpool/multipass-sse2.c.in
+++ b/src/qs8-gavgpool/multipass-sse2.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 3
@@ -18,28 +19,32 @@
#include <xnnpack/math.h>
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__sse2_c${CHANNEL_TILE}(
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__sse2_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
+ const ${XINT8_T}* zero,
int32_t* buffer,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows > ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if CHANNEL_TILE <= 16:
const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
$else:
const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
+ $if DATATYPE == "QU8":
+ const __m128i vzero = _mm_setzero_si128();
int32_t* b = buffer;
size_t c = channels;
for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
@@ -51,7 +56,10 @@
$elif M > 3:
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
$if 1 <= M <= ROW_TILE:
- const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
+ $if DATATYPE == "QS8":
+ const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
+ $else:
+ const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero);
$if M < ROW_TILE:
$if C == 0:
const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
@@ -61,9 +69,16 @@
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
- const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
- const __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}), vinit_bias);
- const __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}), vinit_bias);
+ $if DATATYPE == "QS8":
+ const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
+ __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
+ $else:
+ __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero);
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);
+
+ $for C in range(0, CHANNEL_TILE, 4):
+ vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias);
_mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
$for C in range(4, CHANNEL_TILE, 4):
@@ -80,14 +95,24 @@
$elif M > 4:
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
$if 2 <= M <= ROW_TILE + 1:
- const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
+ $if DATATYPE == "QS8":
+ const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
+ $else:
+ const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero);
$if M < ROW_TILE:
const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
i${M} += 8;
- const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
- const __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}), vinit_bias);
- const __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}), vinit_bias);
+ $if DATATYPE == "QS8":
+ const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
+ __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
+ $else:
+ __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero);
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero);
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias);
+ vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias);
_mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
_mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]});
@@ -99,7 +124,7 @@
for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
$for M in range(ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
int32_t* b = buffer;
size_t c = channels;
@@ -112,7 +137,10 @@
$elif M > 3:
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
$if 1 <= M <= ROW_SUBTILE:
- const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
+ $if DATATYPE == "QS8":
+ const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
+ $else:
+ const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero);
$if M < ROW_SUBTILE:
$if C == 0:
const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
@@ -122,9 +150,17 @@
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
- const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
- const __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}), _mm_load_si128((const __m128i*) (b + ${C})));
- const __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}), _mm_load_si128((const __m128i*) (b + ${C+4})));
+ $if DATATYPE == "QS8":
+ const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
+ __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
+ $else:
+ __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero);
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b));
+ $for C in range(4, CHANNEL_TILE, 4):
+ vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (b + ${C})));
_mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
$for C in range(4, CHANNEL_TILE, 4):
@@ -141,14 +177,24 @@
$elif M > 4:
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
$if 2 <= M <= ROW_SUBTILE + 1:
- const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
+ $if DATATYPE == "QS8":
+ const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
+ $else:
+ const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero);
$if M < ROW_SUBTILE:
const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
i${M} += 8;
- const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
- const __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}), _mm_load_si128((const __m128i*) b));
- const __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}), _mm_load_si128((const __m128i*) (b + 4)));
+ $if DATATYPE == "QS8":
+ const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
+ __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
+ $else:
+ __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero);
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero);
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b));
+ vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (b + 4)));
_mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
_mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]});
@@ -159,9 +205,9 @@
}
}
- i0 = (const int8_t*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
+ i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
$for M in range(1, ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -184,7 +230,10 @@
$elif M > 3:
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
$if 1 <= M <= ROW_SUBTILE:
- const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
+ $if DATATYPE == "QS8":
+ const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
+ $else:
+ const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero);
$if M < ROW_SUBTILE:
$if C == 0:
const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
@@ -194,9 +243,17 @@
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
- const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
- __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}), _mm_load_si128((const __m128i*) (buffer + ${C})));
- __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}), _mm_load_si128((const __m128i*) (buffer + ${C+4})));
+ $if DATATYPE == "QS8":
+ const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
+ __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
+ $else:
+ __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero);
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer));
+ $for C in range(4, CHANNEL_TILE, 4):
+ vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (buffer + ${C})));
buffer += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 4):
@@ -214,14 +271,22 @@
$for C in range(0, CHANNEL_TILE, 8):
__m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point);
- $for C in range(0, CHANNEL_TILE, 8):
- vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min);
+ $if DATATYPE == "QS8":
+ $for C in range(0, CHANNEL_TILE, 8):
+ vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min);
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- __m128i vout${ABC[C:C+16]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
+ __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
$else:
- __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+ __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+
+ $if DATATYPE == "QU8":
+ $for C in range(0, CHANNEL_TILE, 16):
+ $if C + 8 < CHANNEL_TILE:
+ vout${ABC[C:C+16]} = _mm_max_epu8(vout${ABC[C:C+16]}, voutput_min);
+ $else:
+ vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epu8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
$if CHANNEL_TILE > 8:
_mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
@@ -243,14 +308,24 @@
$elif M > 4:
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
$if 2 <= M <= ROW_SUBTILE + 1:
- const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
+ $if DATATYPE == "QS8":
+ const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
+ $else:
+ const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero);
$if M < ROW_SUBTILE:
const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
i${M} += 8;
- const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
- __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}), _mm_load_si128((const __m128i*) (buffer + 4)));
+ $if DATATYPE == "QS8":
+ const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
+ __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
+ $else:
+ __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero);
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero);
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer));
+ vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]});
@@ -266,9 +341,12 @@
vacc${ABC[4:8]} = _mm_cvtps_epi32(vfpacc${ABC[4:8]});
__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
- vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, voutput_min);
+ $if DATATYPE == "QS8":
+ vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, voutput_min);
- __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[0:8]});
+ __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]});
+ $if DATATYPE == "QU8":
+ vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epu8(vout${ABC[0:8]}${ABC[0:8]}, voutput_min);
$if CHANNEL_TILE > 8:
if XNN_LIKELY(channels >= 8) {
@@ -288,7 +366,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) vout${ABC[0:4]};
+ *output = (${XINT8_T}) vout${ABC[0:4]};
output += 1;
}
channels = 0;
@@ -306,7 +384,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) vout${ABC[0:4]};
+ *output = (${XINT8_T}) vout${ABC[0:4]};
}
}${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
}
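Note: SSE2 has no direct 8-to-16 or 16-to-32 extension instruction, so the template chooses between sign extension (unpack with self plus arithmetic shift, or unpack with a _mm_cmpgt_epi16 sign mask) for QS8 and zero extension (unpack with a zero register) for QU8. The helpers below are an illustrative sketch of those two idioms, not code from the generated kernels.

// Sketch of the SSE2 widening idioms selected by the template.
#include <emmintrin.h>  // SSE2

// QS8: widen the low 8 signed bytes of v to int16 lanes by duplicating each byte
// into a 16-bit lane and arithmetic-shifting so the high copy becomes the sign.
static __m128i widen_lo_s8_to_s16(__m128i v) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8);
}

// QU8: widen the low 8 unsigned bytes by interleaving with zero (zero extension).
static __m128i widen_lo_u8_to_u16(__m128i v) {
  return _mm_unpacklo_epi8(v, _mm_setzero_si128());
}

// QS8: widen the low 4 int16 lanes to int32 using a per-lane sign mask.
static __m128i widen_lo_s16_to_s32(__m128i vacc) {
  const __m128i vsgn = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc);
  return _mm_unpacklo_epi16(vacc, vsgn);
}

// QU8: widen the low 4 uint16 lanes to int32 by interleaving with zero.
static __m128i widen_lo_u16_to_u32(__m128i vacc) {
  return _mm_unpacklo_epi16(vacc, _mm_setzero_si128());
}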
diff --git a/src/qs8-gavgpool/multipass-sse4.c.in b/src/qs8-gavgpool/multipass-sse4.c.in
index c5a750c..7d0d5ff 100644
--- a/src/qs8-gavgpool/multipass-sse4.c.in
+++ b/src/qs8-gavgpool/multipass-sse4.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 3
@@ -18,22 +19,27 @@
#include <xnnpack/math.h>
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__sse41_c${CHANNEL_TILE}(
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+$_MM_CVTEPX8_EPI16 = {"QS8": "_mm_cvtepi8_epi16", "QU8": "_mm_cvtepu8_epi16"}[DATATYPE]
+$_MM_CVTEPX16_EPI32 = {"QS8": "_mm_cvtepi16_epi32", "QU8": "_mm_cvtepu16_epi32"}[DATATYPE]
+$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE]
+$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__sse41_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
+ const ${XINT8_T}* zero,
int32_t* buffer,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows > ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if CHANNEL_TILE <= 16:
const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
$else:
@@ -44,32 +50,40 @@
size_t c = channels;
for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
$for M in range(2):
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
$for C in range(8, CHANNEL_TILE, 8):
- const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
+ const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
i${M} += ${CHANNEL_TILE};
__m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const __m128i vxi2x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
$for C in range(8, CHANNEL_TILE, 8):
__m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]});
- const __m128i vxi2x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + ${C})));
+ const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C})));
i2 += ${CHANNEL_TILE};
$for M in range(3, ROW_TILE):
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
$for C in range(8, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]});
- const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
+ const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]});
+ $if DATATYPE == "QU8":
+ const __m128i vzero = _mm_setzero_si128();
$for C in range(0, CHANNEL_TILE, 8):
- const __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc${ABC[C:C+8]}), vinit_bias);
- const __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16), vinit_bias);
+ __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]});
+ $if DATATYPE == "QS8":
+ __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16);
+ $else:
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);
+
+ $for C in range(0, CHANNEL_TILE, 4):
+ vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias);
_mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
$for C in range(4, CHANNEL_TILE, 4):
@@ -80,22 +94,28 @@
if XNN_UNLIKELY(c != 0) {
do {
$for M in range(2):
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
i${M} += 8;
__m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const __m128i vxi2x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
i2 += 8;
$for M in range(3, ROW_TILE):
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
i${M} += 8;
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_TILE-1}x${ABC[0:8]});
- const __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc${ABC[0:8]}), vinit_bias);
- const __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16), vinit_bias);
+ __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]});
+ $if DATATYPE == "QS8":
+ __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16);
+ $else:
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128());
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias);
+ vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias);
_mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
_mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]});
@@ -107,38 +127,47 @@
for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
$for M in range(ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
int32_t* b = buffer;
size_t c = channels;
for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
$for M in range(2):
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
$for C in range(8, CHANNEL_TILE, 8):
- const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
+ const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
i${M} += ${CHANNEL_TILE};
__m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const __m128i vxi2x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
$for C in range(8, CHANNEL_TILE, 8):
__m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]});
- const __m128i vxi2x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + ${C})));
+ const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C})));
i2 += ${CHANNEL_TILE};
$for M in range(3, ROW_SUBTILE):
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
$for C in range(8, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]});
- const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
+ const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]});
+ $if DATATYPE == "QU8":
+ const __m128i vzero = _mm_setzero_si128();
$for C in range(0, CHANNEL_TILE, 8):
- const __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc${ABC[C:C+8]}), _mm_load_si128((const __m128i*) (b + ${C})));
- const __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16), _mm_load_si128((const __m128i*) (b + ${C+4})));
+ __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]});
+ $if DATATYPE == "QS8":
+ __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16);
+ $else:
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b));
+ $for C in range(4, CHANNEL_TILE, 4):
+ vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (b + ${C})));
_mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
$for C in range(4, CHANNEL_TILE, 4):
@@ -149,22 +178,28 @@
if XNN_UNLIKELY(c != 0) {
do {
$for M in range(2):
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
i${M} += 8;
__m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const __m128i vxi2x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
i2 += 8;
$for M in range(3, ROW_SUBTILE):
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
i${M} += 8;
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]});
- const __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc${ABC[0:8]}), _mm_load_si128((const __m128i*) b));
- const __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16), _mm_load_si128((const __m128i*) (b + 4)));
+ __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]});
+ $if DATATYPE == "QS8":
+ __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16);
+ $else:
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128());
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b));
+ vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (b + 4)));
_mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
_mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]});
@@ -175,9 +210,9 @@
}
}
- i0 = (const int8_t*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
+ i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
$for M in range(1, ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -193,32 +228,41 @@
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
$for M in range(2):
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
$for C in range(8, CHANNEL_TILE, 8):
- const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
+ const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
i${M} += ${CHANNEL_TILE};
__m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const __m128i vxi2x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
$for C in range(8, CHANNEL_TILE, 8):
__m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]});
- const __m128i vxi2x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + ${C})));
+ const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C})));
i2 += ${CHANNEL_TILE};
$for M in range(3, ROW_SUBTILE):
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
$for C in range(8, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]});
- const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
+ const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]});
+ $if DATATYPE == "QU8":
+ const __m128i vzero = _mm_setzero_si128();
$for C in range(0, CHANNEL_TILE, 8):
- __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc${ABC[C:C+8]}), _mm_load_si128((const __m128i*) (buffer + ${C})));
- __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16), _mm_load_si128((const __m128i*) (buffer + ${C+4})));
+ __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]});
+ $if DATATYPE == "QS8":
+ __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16);
+ $else:
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer));
+ $for C in range(4, CHANNEL_TILE, 4):
+ vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (buffer + ${C})));
buffer += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 4):
@@ -238,15 +282,15 @@
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- __m128i vout${ABC[C:C+16]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
+ __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
$else:
- __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+ __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min);
+ vout${ABC[C:C+16]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+16]}, voutput_min);
$else:
- vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
+ vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
$if CHANNEL_TILE > 8:
_mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
@@ -262,22 +306,28 @@
if XNN_UNLIKELY(channels != 0) {
${"do " if CHANNEL_TILE > 8 else ""}{
$for M in range(2):
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
i${M} += 8;
__m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const __m128i vxi2x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
i2 += 8;
$for M in range(3, ROW_SUBTILE):
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
i${M} += 8;
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]});
- __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc${ABC[0:8]}), _mm_load_si128((const __m128i*) buffer));
- __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
+ __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]});
+ $if DATATYPE == "QS8":
+ __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16);
+ $else:
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128());
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer));
+ vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (buffer + 4)));
buffer += 8;
__m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]});
@@ -294,8 +344,8 @@
__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
- __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[0:8]});
- vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epi8(vout${ABC[0:8]}${ABC[0:8]}, voutput_min);
+ __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]});
+ vout${ABC[0:8]}${ABC[0:8]} = ${_MM_MAX_EPX8}(vout${ABC[0:8]}${ABC[0:8]}, voutput_min);
$if CHANNEL_TILE > 8:
if XNN_LIKELY(channels >= 8) {
@@ -314,7 +364,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
+ *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
output += 1;
}
channels = 0;
@@ -331,7 +381,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
+ *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
}
}${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
}
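Note: the SSE4.1 template swaps whole intrinsics through token substitution: _mm_cvtepi8_epi16/_mm_cvtepu8_epi16 for the loads, _mm_cvtepi16_epi32/_mm_cvtepu16_epi32 for the accumulator widening, and _mm_packs_epi16 plus _mm_max_epi8 versus _mm_packus_epi16 plus _mm_max_epu8 for the output stage. A small sketch of the two output stages follows; the helper names and the single-register shape are illustrative only.

// Sketch of the QS8 vs. QU8 output stages the SSE4.1 template generates.
#include <smmintrin.h>  // SSE4.1 (for _mm_max_epi8)

// QS8: saturating signed 16->8 pack, then clamp against the signed output_min.
static __m128i qs8_output_stage(__m128i vout16, __m128i voutput_min) {
  __m128i vout8 = _mm_packs_epi16(vout16, vout16);
  return _mm_max_epi8(vout8, voutput_min);
}

// QU8: saturating unsigned 16->8 pack, then clamp against the unsigned output_min.
static __m128i qu8_output_stage(__m128i vout16, __m128i voutput_min) {
  __m128i vout8 = _mm_packus_epi16(vout16, vout16);
  return _mm_max_epu8(vout8, voutput_min);
}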
diff --git a/src/qs8-gavgpool/multipass-wasmsimd.c.in b/src/qs8-gavgpool/multipass-wasmsimd.c.in
index 3db2cae..0a2f3aa 100644
--- a/src/qs8-gavgpool/multipass-wasmsimd.c.in
+++ b/src/qs8-gavgpool/multipass-wasmsimd.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 3
@@ -18,22 +19,28 @@
#include <xnnpack/math.h>
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__wasmsimd_c${CHANNEL_TILE}(
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$WASM_X16X8_LOAD8X8 = {"QS8": "wasm_i16x8_load8x8", "QU8": "wasm_u16x8_load8x8"}[DATATYPE]
+$WASM_X32X4_EXTEND_LOW_X16X8 = {"QS8": "wasm_i32x4_extend_low_i16x8", "QU8": "wasm_u32x4_extend_low_u16x8"}[DATATYPE]
+$WASM_X32X4_EXTEND_HIGH_X16X8 = {"QS8": "wasm_i32x4_extend_high_i16x8", "QU8": "wasm_u32x4_extend_high_u16x8"}[DATATYPE]
+$WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE]
+$WASM_X8X16_MIN = {"QS8": "wasm_i8x16_min", "QU8": "wasm_u8x16_min"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__wasmsimd_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
+ const ${XINT8_T}* zero,
int32_t* buffer,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows > ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if CHANNEL_TILE <= 16:
const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
$else:
@@ -44,32 +51,32 @@
size_t c = channels;
for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
$for M in range(2):
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
$for C in range(8, CHANNEL_TILE, 8):
- const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load8x8(i${M} + ${C});
+ const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C});
i${M} += ${CHANNEL_TILE};
v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const v128_t vxi2x${ABC[0:8]} = wasm_i16x8_load8x8(i2);
+ const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2);
$for C in range(8, CHANNEL_TILE, 8):
v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]});
- const v128_t vxi2x${ABC[C:C+8]} = wasm_i16x8_load8x8(i2 + ${C});
+ const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C});
i2 += ${CHANNEL_TILE};
$for M in range(3, ROW_TILE):
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
$for C in range(8, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]});
- const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load8x8(i${M} + ${C});
+ const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C});
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 8):
- const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc${ABC[C:C+8]}));
- const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc${ABC[C:C+8]}));
+ const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]}));
+ const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]}));
wasm_v128_store(b, vacc${ABC[0:4]});
$for C in range(4, CHANNEL_TILE, 4):
@@ -80,22 +87,22 @@
if XNN_UNLIKELY(c != 0) {
do {
$for M in range(2):
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
i${M} += 8;
v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const v128_t vxi2x${ABC[0:8]} = wasm_i16x8_load8x8(i2);
+ const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2);
i2 += 8;
$for M in range(3, ROW_TILE):
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
i${M} += 8;
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]});
- const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc${ABC[0:8]}));
- const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc${ABC[0:8]}));
+ const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]}));
+ const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]}));
wasm_v128_store(b, vacc${ABC[0:4]});
wasm_v128_store(b + 4, vacc${ABC[4:8]});
@@ -107,38 +114,42 @@
for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
$for M in range(ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
int32_t* b = buffer;
size_t c = channels;
for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
$for M in range(2):
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
$for C in range(8, CHANNEL_TILE, 8):
- const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load8x8(i${M} + ${C});
+ const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C});
i${M} += ${CHANNEL_TILE};
v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const v128_t vxi2x${ABC[0:8]} = wasm_i16x8_load8x8(i2);
+ const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2);
$for C in range(8, CHANNEL_TILE, 8):
v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]});
- const v128_t vxi2x${ABC[C:C+8]} = wasm_i16x8_load8x8(i2 + ${C});
+ const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C});
i2 += ${CHANNEL_TILE};
$for M in range(3, ROW_SUBTILE):
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
$for C in range(8, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]});
- const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load8x8(i${M} + ${C});
+ const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C});
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]});
+ v128_t vacc${ABC[0:4]} = wasm_v128_load(b);
+ $for C in range(4, CHANNEL_TILE, 4):
+ v128_t vacc${ABC[C:C+4]} = wasm_v128_load(b + ${C});
+
$for C in range(0, CHANNEL_TILE, 8):
- const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc${ABC[C:C+8]}), wasm_v128_load(b + ${C}));
- const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc${ABC[C:C+8]}), wasm_v128_load(b + ${C+4}));
+ vacc${ABC[C:C+4]} = wasm_i32x4_add(vacc${ABC[C:C+4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]}));
+ vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vacc${ABC[C+4:C+8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]}));
wasm_v128_store(b, vacc${ABC[0:4]});
$for C in range(4, CHANNEL_TILE, 4):
@@ -149,22 +160,25 @@
if XNN_UNLIKELY(c != 0) {
do {
$for M in range(2):
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
i${M} += 8;
v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const v128_t vxi2x${ABC[0:8]} = wasm_i16x8_load8x8(i2);
+ const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2);
i2 += 8;
$for M in range(3, ROW_SUBTILE):
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
i${M} += 8;
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]});
- const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc${ABC[0:8]}), wasm_v128_load(b));
- const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc${ABC[0:8]}), wasm_v128_load(b + 4));
+ v128_t vacc${ABC[0:4]} = wasm_v128_load(b);
+ v128_t vacc${ABC[4:8]} = wasm_v128_load(b + 4);
+
+ vacc${ABC[0:4]} = wasm_i32x4_add(vacc${ABC[0:4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]}));
+ vacc${ABC[4:8]} = wasm_i32x4_add(vacc${ABC[4:8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]}));
wasm_v128_store(b, vacc${ABC[0:4]});
wasm_v128_store(b + 4, vacc${ABC[4:8]});
@@ -175,9 +189,9 @@
}
}
- i0 = (const int8_t*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
+ i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
$for M in range(1, ROW_SUBTILE):
- i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -194,34 +208,38 @@
const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
$for M in range(2):
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
$for C in range(8, CHANNEL_TILE, 8):
- const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load8x8(i${M} + ${C});
+ const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C});
i${M} += ${CHANNEL_TILE};
v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const v128_t vxi2x${ABC[0:8]} = wasm_i16x8_load8x8(i2);
+ const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2);
$for C in range(8, CHANNEL_TILE, 8):
v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]});
- const v128_t vxi2x${ABC[C:C+8]} = wasm_i16x8_load8x8(i2 + ${C});
+ const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C});
i2 += ${CHANNEL_TILE};
$for M in range(3, ROW_SUBTILE):
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
$for C in range(8, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]});
- const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load8x8(i${M} + ${C});
+ const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C});
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]});
- $for C in range(0, CHANNEL_TILE, 8):
- v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc${ABC[C:C+8]}), wasm_v128_load(buffer + ${C}));
- v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc${ABC[C:C+8]}), wasm_v128_load(buffer + ${C+4}));
+ v128_t vacc${ABC[0:4]} = wasm_v128_load(buffer);
+ $for C in range(4, CHANNEL_TILE, 4):
+ v128_t vacc${ABC[C:C+4]} = wasm_v128_load(buffer + ${C});
buffer += ${CHANNEL_TILE};
+ $for C in range(0, CHANNEL_TILE, 8):
+ vacc${ABC[C:C+4]} = wasm_i32x4_add(vacc${ABC[C:C+4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]}));
+ vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vacc${ABC[C+4:C+8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]}));
+
$for C in range(0, CHANNEL_TILE, 4):
vacc${ABC[C:C+4]} = wasm_f32x4_convert_i32x4(vacc${ABC[C:C+4]});
@@ -242,15 +260,15 @@
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- v128_t vout${ABC[C:C+16]} = wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
+ v128_t vout${ABC[C:C+16]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
$else:
- v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+ v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vout${ABC[C:C+16]} = wasm_i8x16_min(vout${ABC[C:C+16]}, voutput_max);
+ vout${ABC[C:C+16]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+16]}, voutput_max);
$else:
- vout${ABC[C:C+8]}${ABC[C:C+8]} = wasm_i8x16_min(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
+ vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
$if CHANNEL_TILE > 8:
wasm_v128_store(output, vout${ABC[0:16]});
@@ -266,24 +284,27 @@
if XNN_UNLIKELY(channels != 0) {
${"do " if CHANNEL_TILE > 8 else ""}{
$for M in range(2):
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
i${M} += 8;
v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const v128_t vxi2x${ABC[0:8]} = wasm_i16x8_load8x8(i2);
+ const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2);
i2 += 8;
$for M in range(3, ROW_SUBTILE):
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
i${M} += 8;
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]});
- v128_t vacc${ABC[0:4]} = wasm_i32x4_add(wasm_i32x4_extend_low_i16x8(vacc${ABC[0:8]}), wasm_v128_load(buffer));
- v128_t vacc${ABC[4:8]} = wasm_i32x4_add(wasm_i32x4_extend_high_i16x8(vacc${ABC[0:8]}), wasm_v128_load(buffer + 4));
+ v128_t vacc${ABC[0:4]} = wasm_v128_load(buffer);
+ v128_t vacc${ABC[4:8]} = wasm_v128_load(buffer + 4);
buffer += 8;
+ vacc${ABC[0:4]} = wasm_i32x4_add(vacc${ABC[0:4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]}));
+ vacc${ABC[4:8]} = wasm_i32x4_add(vacc${ABC[4:8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]}));
+
vacc${ABC[0:4]} = wasm_f32x4_convert_i32x4(vacc${ABC[0:4]});
vacc${ABC[4:8]} = wasm_f32x4_convert_i32x4(vacc${ABC[4:8]});
@@ -300,8 +321,8 @@
vacc${ABC[4:8]} = wasm_i32x4_sub(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point);
const v128_t vout${ABC[0:8]} = wasm_i16x8_narrow_i32x4(vacc${ABC[0:4]}, vacc${ABC[4:8]});
- v128_t vout${ABC[0:8]}${ABC[0:8]} = wasm_i8x16_narrow_i16x8(vout${ABC[0:8]}, vout${ABC[0:8]});
- vout${ABC[0:8]}${ABC[0:8]} = wasm_i8x16_min(vout${ABC[0:8]}${ABC[0:8]}, voutput_max);
+ v128_t vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[0:8]}, vout${ABC[0:8]});
+ vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_MIN}(vout${ABC[0:8]}${ABC[0:8]}, voutput_max);
$if CHANNEL_TILE > 8:
if XNN_LIKELY(channels >= 8) {
@@ -321,7 +342,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) vout${ABC[0:4]};
+ *output = (${XINT8_T}) vout${ABC[0:4]};
output += 1;
}
channels = 0;
@@ -339,7 +360,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) vout${ABC[0:4]};
+ *output = (${XINT8_T}) vout${ABC[0:4]};
}
}${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
}
diff --git a/src/qs8-gavgpool/unipass-neon.c.in b/src/qs8-gavgpool/unipass-neon.c.in
index 1635186..62fa730 100644
--- a/src/qs8-gavgpool/unipass-neon.c.in
+++ b/src/qs8-gavgpool/unipass-neon.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 3
@@ -18,23 +19,46 @@
$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("neonv8" if ARMV8 else "neon")
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$XINT8X8_T = {"QS8": "int8x8_t", "QU8": "uint8x8_t"}[DATATYPE]
+$XINT8X16_T = {"QS8": "int8x16_t", "QU8": "uint8x16_t"}[DATATYPE]
+$XINT16X8_T = {"QS8": "int16x8_t", "QU8": "uint16x8_t"}[DATATYPE]
+$VLD1_X8 = {"QS8": "vld1_s8", "QU8": "vld1_u8"}[DATATYPE]
+$VLD1_DUP_X8 = {"QS8": "vld1_dup_s8", "QU8": "vld1_dup_u8"}[DATATYPE]
+$VLD1Q_DUP_X8 = {"QS8": "vld1q_dup_s8", "QU8": "vld1q_dup_u8"}[DATATYPE]
+$VST1_X8 = {"QS8": "vst1_s8", "QU8": "vst1_u8"}[DATATYPE]
+$VST1Q_X8 = {"QS8": "vst1q_s8", "QU8": "vst1q_u8"}[DATATYPE]
+$VST1_LANE_X8 = {"QS8": "vst1_lane_s8", "QU8": "vst1_lane_u8"}[DATATYPE]
+$VADDL_X8 = {"QS8": "vaddl_s8", "QU8": "vaddl_u8"}[DATATYPE]
+$VADDW_X8 = {"QS8": "vaddw_s8", "QU8": "vaddw_u8"}[DATATYPE]
+$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE]
+$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE]
+$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE]
+$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE]
+$VEXT_X8 = {"QS8": "vext_s8", "QU8": "vext_u8"}[DATATYPE]
+$VQMOVXN_S16 = {"QS8": "vqmovn_s16", "QU8": "vqmovun_s16"}[DATATYPE]
+$VQMOVXN_HIGH_S16 = {"QS8": "vqmovn_high_s16", "QU8": "vqmovun_high_s16"}[DATATYPE]
+$VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE]
+$VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE]
+$VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE]
+$VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE]
$ISA = "neonv8" if ARMV8 else "neon"
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__${ISA}_c${CHANNEL_TILE}(
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__${ISA}_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+ const ${XINT8_T}* zero,
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows != 0);
assert(rows <= ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -52,29 +76,33 @@
     const float32x4_t vmagic_bias = vld1q_dup_f32(&params->${PARAMS_STRUCT}.magic_bias);
     const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->${PARAMS_STRUCT}.magic_bias_less_output_zero_point);
$if CHANNEL_TILE > 8:
-    const int8x16_t voutput_min = vld1q_dup_s8(&params->${PARAMS_STRUCT}.output_min);
-    const int8x16_t voutput_max = vld1q_dup_s8(&params->${PARAMS_STRUCT}.output_max);
+    const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
+    const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
$else:
-    const int8x8_t voutput_min = vld1_dup_s8(&params->${PARAMS_STRUCT}.output_min);
-    const int8x8_t voutput_max = vld1_dup_s8(&params->${PARAMS_STRUCT}.output_max);
+    const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
+    const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
$for M in range(2):
$for C in range(0, CHANNEL_TILE, 8):
- const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8;
+ const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8;
$for C in range(0, CHANNEL_TILE, 8):
- const int8x8_t vi2x${ABC[C:C+8]} = vld1_s8(i2); i2 += 8;
- int16x8_t vacc${ABC[C:C+8]} = vaddl_s8(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
+ const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8;
+ ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
$for M in range(2, ROW_TILE):
$for C in range(0, CHANNEL_TILE, 8):
$if M + 1 != ROW_TILE:
- const int8x8_t vi${M+1}x${ABC[C:C+8]} = vld1_s8(i${M+1}); i${M+1} += 8;
- vacc${ABC[C:C+8]} = vaddw_s8(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+ const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
+ vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 8):
- int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vinit_bias, vget_low_s16(vacc${ABC[C:C+8]}));
- int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vinit_bias, vget_high_s16(vacc${ABC[C:C+8]}));
+ $if DATATYPE == "QS8":
+ int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[C:C+8]}));
+ int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[C:C+8]}));
+ $else:
+ int32x4_t vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[C:C+8]})));
+ int32x4_t vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[C:C+8]})));
$for C in range(0, CHANNEL_TILE, 4):
float32x4_t vfpacc${ABC[C:C+4]} = vcvtq_f32_s32(vacc${ABC[C:C+4]});
@@ -94,10 +122,10 @@
#if XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 8):
- vacc${ABC[C:C+8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc${ABC[C+4:C+8]});
+ int16x8_t vacc${ABC[C:C+8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc${ABC[C+4:C+8]});
#else // !XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 8):
- vacc${ABC[C:C+8]} = vcombine_s16(vqmovn_s32(vacc${ABC[C:C+4]}), vqmovn_s32(vacc${ABC[C+4:C+8]}));
+ int16x8_t vacc${ABC[C:C+8]} = vcombine_s16(vqmovn_s32(vacc${ABC[C:C+4]}), vqmovn_s32(vacc${ABC[C+4:C+8]}));
#endif // !XNN_ARCH_ARM64
$if ARMV8:
@@ -107,52 +135,56 @@
#if XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- int8x16_t vout${ABC[C:C+16]} = vqmovn_high_s16(vqmovn_s16(vacc${ABC[C:C+8]}), vacc${ABC[C+8:C+16]});
+ ${XINT8X16_T} vout${ABC[C:C+16]} = ${VQMOVXN_HIGH_S16}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), vacc${ABC[C+8:C+16]});
$else:
- int8x8_t vout${ABC[C:C+8]} = vqmovn_s16(vacc${ABC[C:C+8]});
+ ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]});
#else // !XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- int8x16_t vout${ABC[C:C+16]} = vcombine_s8(vqmovn_s16(vacc${ABC[C:C+8]}), vqmovn_s16(vacc${ABC[C+8:C+16]}));
+ ${XINT8X16_T} vout${ABC[C:C+16]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), ${VQMOVXN_S16}(vacc${ABC[C+8:C+16]}));
$else:
- int8x8_t vout${ABC[C:C+8]} = vqmovn_s16(vacc${ABC[C:C+8]});
+ ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]});
#endif // !XNN_ARCH_ARM64
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vout${ABC[C:C+16]} = vmaxq_s8(vout${ABC[C:C+16]}, voutput_min);
+ vout${ABC[C:C+16]} = ${VMAXQ_X8}(vout${ABC[C:C+16]}, voutput_min);
$elif CHANNEL_TILE > 8:
- vout${ABC[C:C+8]} = vmax_s8(vout${ABC[C:C+8]}, vget_low_s8(voutput_min));
+ vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_min));
$else:
- vout${ABC[C:C+8]} = vmax_s8(vout${ABC[C:C+8]}, voutput_min);
+ vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, voutput_min);
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vout${ABC[C:C+16]} = vminq_s8(vout${ABC[C:C+16]}, voutput_max);
+ vout${ABC[C:C+16]} = ${VMINQ_X8}(vout${ABC[C:C+16]}, voutput_max);
$elif CHANNEL_TILE > 8:
- vout${ABC[C:C+8]} = vmin_s8(vout${ABC[C:C+8]}, vget_low_s8(voutput_max));
+ vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_max));
$else:
- vout${ABC[C:C+8]} = vmin_s8(vout${ABC[C:C+8]}, voutput_max);
+ vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, voutput_max);
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vst1q_s8(output, vout${ABC[C:C+16]}); output += 16;
+ ${VST1Q_X8}(output, vout${ABC[C:C+16]}); output += 16;
$else:
- vst1_s8(output, vout${ABC[C:C+8]}); output += 8;
+ ${VST1_X8}(output, vout${ABC[C:C+8]}); output += 8;
}
if XNN_UNLIKELY(channels != 0) {
${"do " if CHANNEL_TILE > 8 else ""}{
$for M in range(3):
- const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M}); i${M} += 8;
- int16x8_t vacc${ABC[0:8]} = vaddl_s8(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
+ const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); i${M} += 8;
+ ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
$for M in range(2, ROW_TILE):
$if M + 1 != ROW_TILE:
- const int8x8_t vi${M+1}x${ABC[0:8]} = vld1_s8(i${M+1}); i${M+1} += 8;
- vacc${ABC[0:8]} = vaddw_s8(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+ const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
+ vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]});
- int32x4_t vacc${ABC[0:4]} = vaddw_s16(vinit_bias, vget_low_s16(vacc${ABC[0:8]}));
- int32x4_t vacc${ABC[4:8]} = vaddw_s16(vinit_bias, vget_high_s16(vacc${ABC[0:8]}));
+ $if DATATYPE == "QS8":
+ int32x4_t vacc${ABC[0:4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[0:8]}));
+ int32x4_t vacc${ABC[4:8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[0:8]}));
+ $else:
+ int32x4_t vacc${ABC[0:4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[0:8]})));
+ int32x4_t vacc${ABC[4:8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[0:8]})));
float32x4_t vfpacc${ABC[0:4]} = vcvtq_f32_s32(vacc${ABC[0:4]});
float32x4_t vfpacc${ABC[4:8]} = vcvtq_f32_s32(vacc${ABC[4:8]});
@@ -171,49 +203,49 @@
vacc${ABC[4:8]} = vqsubq_s32(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
- vacc${ABC[0:8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[0:4]}), vacc${ABC[4:8]});
+ int16x8_t vacc${ABC[0:8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[0:4]}), vacc${ABC[4:8]});
#else
- vacc${ABC[0:8]} = vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]}));
+ int16x8_t vacc${ABC[0:8]} = vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]}));
#endif
$if ARMV8:
vacc${ABC[0:8]} = vqaddq_s16(vacc${ABC[0:8]}, voutput_zero_point);
- int8x8_t vout${ABC[0:8]} = vqmovn_s16(vacc${ABC[0:8]});
+ ${XINT8X8_T} vout${ABC[0:8]} = ${VQMOVXN_S16}(vacc${ABC[0:8]});
$if CHANNEL_TILE > 8:
- vout${ABC[0:8]} = vmax_s8(vout${ABC[0:8]}, vget_low_s8(voutput_min));
- vout${ABC[0:8]} = vmin_s8(vout${ABC[0:8]}, vget_low_s8(voutput_max));
+ vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min));
+ vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max));
if XNN_LIKELY(channels >= 8) {
- vst1_s8(output, vout${ABC[0:8]}); output += 8;
+ ${VST1_X8}(output, vout${ABC[0:8]}); output += 8;
channels -= 8;
} else {
if (channels & 4) {
- vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout${ABC[0:8]}), 0); output += 4;
- vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
+ vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
}
if (channels & 2) {
- vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout${ABC[0:8]}), 0); output += 2;
- vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
+ vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
}
if (channels & 1) {
- vst1_lane_s8(output, vout${ABC[0:8]}, 0); output += 1;
+ ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0); output += 1;
}
channels = 0;
}
$else:
- vout${ABC[0:8]} = vmax_s8(vout${ABC[0:8]}, voutput_min);
- vout${ABC[0:8]} = vmin_s8(vout${ABC[0:8]}, voutput_max);
+ vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, voutput_min);
+ vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, voutput_max);
if (channels & 4) {
- vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout${ABC[0:8]}), 0); output += 4;
- vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
+ vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
}
if (channels & 2) {
- vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout${ABC[0:8]}), 0); output += 2;
- vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
+ vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
}
if (channels & 1) {
- vst1_lane_s8(output, vout${ABC[0:8]}, 0);
+ ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0);
}
}${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
}
diff --git a/src/qs8-gavgpool/unipass-scalar.c.in b/src/qs8-gavgpool/unipass-scalar.c.in
index de31c8b..5a1bc26 100644
--- a/src/qs8-gavgpool/unipass-scalar.c.in
+++ b/src/qs8-gavgpool/unipass-scalar.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE >= 1
$assert ROW_TILE >= 3
$assert REQUANTIZATION == "FP32"
@@ -18,24 +19,25 @@
$PARAMS_STRUCT = "fp32_scalar_" + VARIANT.lower()
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__scalar_${VARIANT.lower()}_c${CHANNEL_TILE}(
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__scalar_${VARIANT.lower()}_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+ const ${XINT8_T}* zero,
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(rows != 0);
assert(rows <= ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -64,18 +66,18 @@
$if CHANNEL_TILE > 1:
for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
$for C in range(CHANNEL_TILE):
- const int32_t vi0x${C} = i0[${C}];
+ const int32_t vi0x${C} = (int32_t) i0[${C}];
i0 += ${CHANNEL_TILE};
$for C in range(CHANNEL_TILE):
int32_t vacc${C} = vi0x${C} + vinit_bias;
- const int32_t vi1x${C} = i1[${C}];
+ const int32_t vi1x${C} = (int32_t) i1[${C}];
i1 += ${CHANNEL_TILE};
$for M in range(2, ROW_TILE):
$for C in range(CHANNEL_TILE):
vacc${C} += vi${M-1}x${C};
- const int32_t vi${M}x${C} = i${M}[${C}];
+ const int32_t vi${M}x${C} = (int32_t) i${M}[${C}];
i${M} += ${CHANNEL_TILE};
$for C in range(CHANNEL_TILE):
@@ -125,18 +127,18 @@
int32_t vout${C} = vrndacc${C} + voutput_zero_point;
$for C in range(CHANNEL_TILE):
- output[${C}] = vout${C};
+ output[${C}] = (${XINT8_T}) vout${C};
output += ${CHANNEL_TILE};
}
$if CHANNEL_TILE == 1:
do {
int32_t vacc = vinit_bias;
$for M in range(2):
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(2, ROW_TILE):
vacc += vi${M-2};
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(ROW_TILE - 2, ROW_TILE):
vacc += vi${M};
@@ -159,18 +161,18 @@
const int32_t vrndacc = (int32_t) lrintf(vfpacc);
int32_t vout = vrndacc + voutput_zero_point;
- *output++ = (int8_t) vout;
+ *output++ = (${XINT8_T}) vout;
} while (--channels != 0);
$else:
if XNN_UNLIKELY(channels != 0) {
$if CHANNEL_TILE == 2:
int32_t vacc = vinit_bias;
$for M in range(2):
- const int32_t vi${M} = *i${M};
+ const int32_t vi${M} = (int32_t) *i${M};
$for M in range(2, ROW_TILE):
vacc += vi${M-2};
- const int32_t vi${M} = *i${M};
+ const int32_t vi${M} = (int32_t) *i${M};
$for M in range(ROW_TILE - 2, ROW_TILE):
vacc += vi${M};
@@ -193,16 +195,16 @@
const int32_t vrndacc = (int32_t) lrintf(vfpacc);
int32_t vout = vrndacc + voutput_zero_point;
- *output = (int8_t) vout;
+ *output = (${XINT8_T}) vout;
$else:
do {
int32_t vacc = vinit_bias;
$for M in range(2):
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(2, ROW_TILE):
vacc += vi${M-2};
- const int32_t vi${M} = *i${M}++;
+ const int32_t vi${M} = (int32_t) *i${M}++;
$for M in range(ROW_TILE - 2, ROW_TILE):
vacc += vi${M};
@@ -225,7 +227,7 @@
const int32_t vrndacc = (int32_t) lrintf(vfpacc);
int32_t vout = vrndacc + voutput_zero_point;
- *output++ = (int8_t) vout;
+ *output++ = (${XINT8_T}) vout;
} while (--channels != 0);
}
}
diff --git a/src/qs8-gavgpool/unipass-sse2.c.in b/src/qs8-gavgpool/unipass-sse2.c.in
index f535c44..9b52837 100644
--- a/src/qs8-gavgpool/unipass-sse2.c.in
+++ b/src/qs8-gavgpool/unipass-sse2.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 3
@@ -15,22 +16,24 @@
#include <xnnpack/gavgpool.h>
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__sse2_c${CHANNEL_TILE}(
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__sse2_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+ const ${XINT8_T}* zero,
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows != 0);
assert(rows <= ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -45,6 +48,8 @@
const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ $if DATATYPE == "QU8":
+ const __m128i vzero = _mm_setzero_si128();
for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
$for M in range(ROW_TILE + 2):
@@ -54,7 +59,10 @@
$elif M > 3:
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
$if 1 <= M <= ROW_TILE:
- const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
+ $if DATATYPE == "QS8":
+ const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
+ $else:
+ const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero);
$if M < ROW_TILE:
$if C == 0:
const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
@@ -64,9 +72,16 @@
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
- const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
- __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(vinit_bias, _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}));
- __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(vinit_bias, _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}));
+ $if DATATYPE == "QS8":
+ const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
+ __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
+ $else:
+ __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero);
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);
+
+ $for C in range(0, CHANNEL_TILE, 4):
+ vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias);
$for C in range(0, CHANNEL_TILE, 4):
__m128 vfpacc${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]});
@@ -83,14 +98,22 @@
$for C in range(0, CHANNEL_TILE, 8):
__m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point);
- $for C in range(0, CHANNEL_TILE, 8):
- vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min);
+ $if DATATYPE == "QS8":
+ $for C in range(0, CHANNEL_TILE, 8):
+ vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min);
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- __m128i vout${ABC[C:C+16]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
+ __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
$else:
- __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+ __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+
+ $if DATATYPE == "QU8":
+ $for C in range(0, CHANNEL_TILE, 16):
+ $if C + 8 < CHANNEL_TILE:
+ vout${ABC[C:C+16]} = _mm_max_epu8(vout${ABC[C:C+16]}, voutput_min);
+ $else:
+ vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epu8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
$if CHANNEL_TILE > 8:
_mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
@@ -112,14 +135,24 @@
$elif M > 4:
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
$if 2 <= M <= ROW_TILE + 1:
- const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
+ $if DATATYPE == "QS8":
+ const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
+ $else:
+ const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero);
$if M < ROW_TILE:
const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
i${M} += 8;
- const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
- __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}), vinit_bias);
- __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}), vinit_bias);
+ $if DATATYPE == "QS8":
+ const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
+ __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
+ $else:
+ __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero);
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero);
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias);
+ vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias);
__m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]});
__m128 vfpacc${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]});
@@ -134,9 +167,12 @@
vacc${ABC[4:8]} = _mm_cvtps_epi32(vfpacc${ABC[4:8]});
__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
- vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, voutput_min);
+ $if DATATYPE == "QS8":
+ vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, voutput_min);
- __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[0:8]});
+ __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]});
+ $if DATATYPE == "QU8":
+ vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epu8(vout${ABC[0:8]}${ABC[0:8]}, voutput_min);
$if CHANNEL_TILE > 8:
if XNN_LIKELY(channels >= 8) {
@@ -156,7 +192,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) vout${ABC[0:4]};
+ *output = (${XINT8_T}) vout${ABC[0:4]};
output += 1;
}
channels = 0;
@@ -174,7 +210,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) vout${ABC[0:4]};
+ *output = (${XINT8_T}) vout${ABC[0:4]};
}
}${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
}
diff --git a/src/qs8-gavgpool/unipass-sse4.c.in b/src/qs8-gavgpool/unipass-sse4.c.in
index c5f1922..141c60b 100644
--- a/src/qs8-gavgpool/unipass-sse4.c.in
+++ b/src/qs8-gavgpool/unipass-sse4.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 3
@@ -15,22 +16,27 @@
#include <xnnpack/gavgpool.h>
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__sse41_c${CHANNEL_TILE}(
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+$_MM_CVTEPX8_EPI16 = {"QS8": "_mm_cvtepi8_epi16", "QU8": "_mm_cvtepu8_epi16"}[DATATYPE]
+$_MM_CVTEPX16_EPI32 = {"QS8": "_mm_cvtepi16_epi32", "QU8": "_mm_cvtepu16_epi32"}[DATATYPE]
+$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE]
+$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__sse41_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+ const ${XINT8_T}* zero,
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows != 0);
assert(rows <= ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -47,32 +53,40 @@
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
$for M in range(2):
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
$for C in range(8, CHANNEL_TILE, 8):
- const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
+ const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
i${M} += ${CHANNEL_TILE};
__m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const __m128i vxi2x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
$for C in range(8, CHANNEL_TILE, 8):
__m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]});
- const __m128i vxi2x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + ${C})));
+ const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C})));
i2 += ${CHANNEL_TILE};
$for M in range(3, ROW_TILE):
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
$for C in range(8, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]});
- const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
+ const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C})));
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]});
+ $if DATATYPE == "QU8":
+ const __m128i vzero = _mm_setzero_si128();
$for C in range(0, CHANNEL_TILE, 8):
- __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(vinit_bias, _mm_cvtepi16_epi32(vacc${ABC[C:C+8]}));
- __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(vinit_bias, _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16));
+ __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]});
+ $if DATATYPE == "QS8":
+ __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16);
+ $else:
+ __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);
+
+ $for C in range(0, CHANNEL_TILE, 4):
+ vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias);
$for C in range(0, CHANNEL_TILE, 4):
__m128 vfpacc${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]});
@@ -91,15 +105,15 @@
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- __m128i vout${ABC[C:C+16]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
+ __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
$else:
- __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+ __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min);
+ vout${ABC[C:C+16]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+16]}, voutput_min);
$else:
- vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
+ vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
$if CHANNEL_TILE > 8:
_mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
@@ -115,22 +129,28 @@
if XNN_UNLIKELY(channels != 0) {
${"do " if CHANNEL_TILE > 8 else ""}{
$for M in range(2):
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
i${M} += 8;
__m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const __m128i vxi2x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2));
i2 += 8;
$for M in range(3, ROW_TILE):
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
+ const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M}));
i${M} += 8;
vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_TILE-1}x${ABC[0:8]});
- __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc${ABC[0:8]}), vinit_bias);
- __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16), vinit_bias);
+ __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]});
+ $if DATATYPE == "QS8":
+ __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16);
+ $else:
+ __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128());
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias);
+ vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias);
__m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]});
__m128 vfpacc${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]});
@@ -146,8 +166,8 @@
__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
- __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[0:8]});
- vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epi8(vout${ABC[0:8]}${ABC[0:8]}, voutput_min);
+ __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]});
+ vout${ABC[0:8]}${ABC[0:8]} = ${_MM_MAX_EPX8}(vout${ABC[0:8]}${ABC[0:8]}, voutput_min);
$if CHANNEL_TILE > 8:
if XNN_LIKELY(channels >= 8) {
@@ -166,7 +186,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
+ *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
output += 1;
}
channels = 0;
@@ -183,7 +203,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
+ *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
}
}${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
}
diff --git a/src/qs8-gavgpool/unipass-wasmsimd.c.in b/src/qs8-gavgpool/unipass-wasmsimd.c.in
index 9ed0fea..0b3c468 100644
--- a/src/qs8-gavgpool/unipass-wasmsimd.c.in
+++ b/src/qs8-gavgpool/unipass-wasmsimd.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 3
@@ -15,22 +16,28 @@
#include <xnnpack/gavgpool.h>
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__wasmsimd_c${CHANNEL_TILE}(
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$WASM_X16X8_LOAD8X8 = {"QS8": "wasm_i16x8_load8x8", "QU8": "wasm_u16x8_load8x8"}[DATATYPE]
+$WASM_X32X4_EXTEND_LOW_X16X8 = {"QS8": "wasm_i32x4_extend_low_i16x8", "QU8": "wasm_u32x4_extend_low_u16x8"}[DATATYPE]
+$WASM_X32X4_EXTEND_HIGH_X16X8 = {"QS8": "wasm_i32x4_extend_high_i16x8", "QU8": "wasm_u32x4_extend_high_u16x8"}[DATATYPE]
+$WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE]
+$WASM_X8X16_MIN = {"QS8": "wasm_i8x16_min", "QU8": "wasm_u8x16_min"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__wasmsimd_c${CHANNEL_TILE}(
size_t rows,
size_t channels,
- const int8_t* input,
+ const ${XINT8_T}* input,
size_t input_stride,
- const int8_t* zero,
- int8_t* output,
- const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+ const ${XINT8_T}* zero,
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows != 0);
assert(rows <= ${ROW_TILE});
assert(channels != 0);
- const int8_t* i0 = input;
+ const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
- const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
+ const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if M % 2 == 1:
if XNN_UNPREDICTABLE(rows < ${M+1}) {
i${M} = zero;
@@ -48,32 +55,32 @@
const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
$for M in range(2):
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
$for C in range(8, CHANNEL_TILE, 8):
- const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load8x8(i${M} + ${C});
+ const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C});
i${M} += ${CHANNEL_TILE};
v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const v128_t vxi2x${ABC[0:8]} = wasm_i16x8_load8x8(i2);
+ const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2);
$for C in range(8, CHANNEL_TILE, 8):
v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]});
- const v128_t vxi2x${ABC[C:C+8]} = wasm_i16x8_load8x8(i2 + ${C});
+ const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C});
i2 += ${CHANNEL_TILE};
$for M in range(3, ROW_TILE):
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
$for C in range(8, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]});
- const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load8x8(i${M} + ${C});
+ const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C});
i${M} += ${CHANNEL_TILE};
$for C in range(0, CHANNEL_TILE, 8):
vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 8):
- v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc${ABC[C:C+8]}));
- v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc${ABC[C:C+8]}));
+ v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]}));
+ v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]}));
$for C in range(0, CHANNEL_TILE, 4):
vacc${ABC[C:C+4]} = wasm_f32x4_convert_i32x4(vacc${ABC[C:C+4]});
@@ -95,15 +102,15 @@
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- v128_t vout${ABC[C:C+16]} = wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
+ v128_t vout${ABC[C:C+16]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
$else:
- v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+ v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
$for C in range(0, CHANNEL_TILE, 16):
$if C + 8 < CHANNEL_TILE:
- vout${ABC[C:C+16]} = wasm_i8x16_min(vout${ABC[C:C+16]}, voutput_max);
+ vout${ABC[C:C+16]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+16]}, voutput_max);
$else:
- vout${ABC[C:C+8]}${ABC[C:C+8]} = wasm_i8x16_min(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
+ vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
$if CHANNEL_TILE > 8:
wasm_v128_store(output, vout${ABC[0:16]});
@@ -119,22 +126,22 @@
if XNN_UNLIKELY(channels != 0) {
${"do " if CHANNEL_TILE > 8 else ""}{
$for M in range(2):
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
i${M} += 8;
v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]});
- const v128_t vxi2x${ABC[0:8]} = wasm_i16x8_load8x8(i2);
+ const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2);
i2 += 8;
$for M in range(3, ROW_TILE):
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]});
- const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load8x8(i${M});
+ const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M});
i${M} += 8;
vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_TILE-1}x${ABC[0:8]});
- v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc${ABC[0:8]}));
- v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc${ABC[0:8]}));
+ v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]}));
+ v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]}));
vacc${ABC[0:4]} = wasm_f32x4_convert_i32x4(vacc${ABC[0:4]});
vacc${ABC[4:8]} = wasm_f32x4_convert_i32x4(vacc${ABC[4:8]});
@@ -152,8 +159,8 @@
vacc${ABC[4:8]} = wasm_i32x4_sub(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point);
const v128_t vout${ABC[0:8]} = wasm_i16x8_narrow_i32x4(vacc${ABC[0:4]}, vacc${ABC[4:8]});
- v128_t vout${ABC[0:8]}${ABC[0:8]} = wasm_i8x16_narrow_i16x8(vout${ABC[0:8]}, vout${ABC[0:8]});
- vout${ABC[0:8]}${ABC[0:8]} = wasm_i8x16_min(vout${ABC[0:8]}${ABC[0:8]}, voutput_max);
+ v128_t vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[0:8]}, vout${ABC[0:8]});
+ vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_MIN}(vout${ABC[0:8]}${ABC[0:8]}, voutput_max);
$if CHANNEL_TILE > 8:
if XNN_LIKELY(channels >= 8) {
@@ -173,7 +180,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) vout${ABC[0:4]};
+ *output = (${XINT8_T}) vout${ABC[0:4]};
output += 1;
}
channels = 0;
@@ -191,7 +198,7 @@
output += 2;
}
if (channels & 1) {
- *output = (int8_t) vout${ABC[0:4]};
+ *output = (${XINT8_T}) vout${ABC[0:4]};
}
}${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
}
diff --git a/src/qu8-gavgpool/7p7x-minmax-neon-c8.c b/src/qu8-gavgpool/7p7x-minmax-neon-c8.c
deleted file mode 100644
index 1d026c7..0000000
--- a/src/qu8-gavgpool/7p7x-minmax-neon-c8.c
+++ /dev/null
@@ -1,292 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/gavgpool.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8(
- size_t rows,
- size_t channels,
- const uint8_t* input,
- size_t input_stride,
- const uint8_t* zero,
- int32_t* buffer,
- uint8_t* output,
- const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
- assert(rows > 7);
- assert(channels != 0);
-
- const uint8_t* i0 = input;
- const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
- const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
- const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
- const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
- const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
- const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t packed_channels = round_up_po2(channels, 8);
- const size_t input_increment = 7 * input_stride - packed_channels;
-  const int32x4_t vbias = vld1q_dup_s32(&params->neon.bias);
-
- int32_t* acc = buffer;
- for (size_t c = 0; c < channels; c += 8) {
- const uint8x8_t vi0 = vld1_u8(i0); i0 += 8;
- const uint8x8_t vi1 = vld1_u8(i1); i1 += 8;
- const uint8x8_t vi2 = vld1_u8(i2); i2 += 8;
- const uint8x8_t vi3 = vld1_u8(i3); i3 += 8;
- const uint8x8_t vi4 = vld1_u8(i4); i4 += 8;
- const uint8x8_t vi5 = vld1_u8(i5); i5 += 8;
- const uint8x8_t vi6 = vld1_u8(i6); i6 += 8;
-
- const uint16x8_t vsum01 = vaddl_u8(vi0, vi1);
- const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);
- const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);
-
- const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6);
- const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);
-
- const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));
-
- const int32x4_t vacc_lo = vaddw_s16(vbias, vget_low_s16(vsum));
- const int32x4_t vacc_hi = vaddw_s16(vbias, vget_high_s16(vsum));
-
- vst1q_s32(acc, vacc_lo); acc += 4;
- vst1q_s32(acc, vacc_hi); acc += 4;
- }
- for (rows -= 7; rows > 7; rows -= 7) {
- acc = buffer;
-
- i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
- i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
- i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
- i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
- i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
- i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
- i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
-
- for (size_t c = 0; c < channels; c += 8) {
- const uint8x8_t vi0 = vld1_u8(i0); i0 += 8;
- const uint8x8_t vi1 = vld1_u8(i1); i1 += 8;
- const uint8x8_t vi2 = vld1_u8(i2); i2 += 8;
- const uint8x8_t vi3 = vld1_u8(i3); i3 += 8;
- const uint8x8_t vi4 = vld1_u8(i4); i4 += 8;
- const uint8x8_t vi5 = vld1_u8(i5); i5 += 8;
- const uint8x8_t vi6 = vld1_u8(i6); i6 += 8;
- const int32x4_t vacc_lo = vld1q_s32(acc);
- const int32x4_t vacc_hi = vld1q_s32(acc + 4);
-
- const uint16x8_t vsum01 = vaddl_u8(vi0, vi1);
- const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);
- const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);
-
- const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6);
- const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);
-
- const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));
-
- vst1q_s32(acc, vaddw_s16(vacc_lo, vget_low_s16(vsum))); acc += 4;
- vst1q_s32(acc, vaddw_s16(vacc_hi, vget_high_s16(vsum))); acc += 4;
- }
- }
-
-#if XNN_ARCH_ARM64
-  const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
-#else
-  const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);
-#endif
-  const int64x2_t vleft_shift = vld1q_dup_s64(&params->neon.left_shift);
-  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
-  const uint8x8_t voutput_min = vld1_dup_u8(&params->neon.output_min);
-  const uint8x8_t voutput_max = vld1_dup_u8(&params->neon.output_max);
-
- i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
- i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
- if (rows < 2) {
- i1 = zero;
- }
- i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
- if (rows <= 2) {
- i2 = zero;
- }
- i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
- if (rows < 4) {
- i3 = zero;
- }
- i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
- if (rows <= 4) {
- i4 = zero;
- }
- i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
- if (rows < 6) {
- i5 = zero;
- }
- i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
- if (rows <= 6) {
- i6 = zero;
- }
-
- acc = buffer;
- while (channels >= 8) {
- const uint8x8_t vi0 = vld1_u8(i0); i0 += 8;
- const uint8x8_t vi1 = vld1_u8(i1); i1 += 8;
- const uint8x8_t vi2 = vld1_u8(i2); i2 += 8;
- const uint8x8_t vi3 = vld1_u8(i3); i3 += 8;
- const uint8x8_t vi4 = vld1_u8(i4); i4 += 8;
- const uint8x8_t vi5 = vld1_u8(i5); i5 += 8;
- const uint8x8_t vi6 = vld1_u8(i6); i6 += 8;
- int32x4_t vacc_lo = vld1q_s32(acc); acc += 4;
- int32x4_t vacc_hi = vld1q_s32(acc); acc += 4;
-
- const uint16x8_t vsum01 = vaddl_u8(vi0, vi1);
- const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);
- const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);
-
- const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6);
- const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);
-
- const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));
- vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));
- vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
-
- const int32x4_t vneg_mask_lo = vreinterpretq_s32_u32(vcltq_s32(vacc_lo, vmovq_n_s32(0)));
- const int32x4_t vneg_mask_hi = vreinterpretq_s32_u32(vcltq_s32(vacc_hi, vmovq_n_s32(0)));
-
-#if XNN_ARCH_ARM64
- const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vget_low_s32(vmultiplier));
- const int64x2_t vproduct23 = vmull_high_s32(vacc_lo, vmultiplier);
- const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
- const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
-
- const int64x2_t vadjusted_product01 = vaddw_s32(vproduct01, vget_low_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product23 = vaddw_high_s32(vproduct23, vneg_mask_lo);
- const int64x2_t vadjusted_product45 = vaddw_s32(vproduct45, vget_low_s32(vneg_mask_hi));
- const int64x2_t vadjusted_product67 = vaddw_high_s32(vproduct67, vneg_mask_hi);
-#else
- const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vmultiplier);
- const int64x2_t vproduct23 = vmull_s32(vget_high_s32(vacc_lo), vmultiplier);
- const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
- const int64x2_t vproduct67 = vmull_s32(vget_high_s32(vacc_hi), vmultiplier);
-
- const int64x2_t vadjusted_product01 = vaddw_s32(vproduct01, vget_low_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product23 = vaddw_s32(vproduct23, vget_high_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product45 = vaddw_s32(vproduct45, vget_low_s32(vneg_mask_hi));
- const int64x2_t vadjusted_product67 = vaddw_s32(vproduct67, vget_high_s32(vneg_mask_hi));
-#endif
-
- const int64x2_t vscaled_acc01 = vrshlq_s64(vadjusted_product01, vleft_shift);
- const int64x2_t vscaled_acc23 = vrshlq_s64(vadjusted_product23, vleft_shift);
- const int64x2_t vscaled_acc45 = vrshlq_s64(vadjusted_product45, vleft_shift);
- const int64x2_t vscaled_acc67 = vrshlq_s64(vadjusted_product67, vleft_shift);
-
-#if XNN_ARCH_ARM64
- vacc_lo = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc01), vreinterpretq_s32_s64(vscaled_acc23));
- vacc_hi = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc45), vreinterpretq_s32_s64(vscaled_acc67));
-
- const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), voutput_zero_point);
-#else
- vacc_lo = vcombine_s32(vmovn_s64(vscaled_acc01), vmovn_s64(vscaled_acc23));
- vacc_hi = vcombine_s32(vmovn_s64(vscaled_acc45), vmovn_s64(vscaled_acc67));
-
- const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), voutput_zero_point);
-#endif
-
- uint8x8_t vout = vqmovun_s16(vacc);
- vout = vmax_u8(vout, voutput_min);
- vout = vmin_u8(vout, voutput_max);
-
- vst1_u8(output, vout); output += 8;
-
- channels -= 8;
- }
- if (channels != 0) {
- const uint8x8_t vi0 = vld1_u8(i0);
- const uint8x8_t vi1 = vld1_u8(i1);
- const uint8x8_t vi2 = vld1_u8(i2);
- const uint8x8_t vi3 = vld1_u8(i3);
- const uint8x8_t vi4 = vld1_u8(i4);
- const uint8x8_t vi5 = vld1_u8(i5);
- const uint8x8_t vi6 = vld1_u8(i6);
- int32x4_t vacc_lo = vld1q_s32(acc); acc += 4;
- int32x4_t vacc_hi = vld1q_s32(acc);
-
- const uint16x8_t vsum01 = vaddl_u8(vi0, vi1);
- const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);
- const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);
-
- const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6);
- const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);
-
- const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));
- vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));
- vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
-
- const int32x4_t vneg_mask_lo = vreinterpretq_s32_u32(vcltq_s32(vacc_lo, vmovq_n_s32(0)));
- const int32x4_t vneg_mask_hi = vreinterpretq_s32_u32(vcltq_s32(vacc_hi, vmovq_n_s32(0)));
-
-#if XNN_ARCH_ARM64
- const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vget_low_s32(vmultiplier));
- const int64x2_t vproduct23 = vmull_high_s32(vacc_lo, vmultiplier);
- const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
- const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
-
- const int64x2_t vadjusted_product01 = vaddw_s32(vproduct01, vget_low_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product23 = vaddw_high_s32(vproduct23, vneg_mask_lo);
- const int64x2_t vadjusted_product45 = vaddw_s32(vproduct45, vget_low_s32(vneg_mask_hi));
- const int64x2_t vadjusted_product67 = vaddw_high_s32(vproduct67, vneg_mask_hi);
-#else
- const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vmultiplier);
- const int64x2_t vproduct23 = vmull_s32(vget_high_s32(vacc_lo), vmultiplier);
- const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
- const int64x2_t vproduct67 = vmull_s32(vget_high_s32(vacc_hi), vmultiplier);
-
- const int64x2_t vadjusted_product01 = vaddw_s32(vproduct01, vget_low_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product23 = vaddw_s32(vproduct23, vget_high_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product45 = vaddw_s32(vproduct45, vget_low_s32(vneg_mask_hi));
- const int64x2_t vadjusted_product67 = vaddw_s32(vproduct67, vget_high_s32(vneg_mask_hi));
-#endif
-
- const int64x2_t vscaled_acc01 = vrshlq_s64(vadjusted_product01, vleft_shift);
- const int64x2_t vscaled_acc23 = vrshlq_s64(vadjusted_product23, vleft_shift);
- const int64x2_t vscaled_acc45 = vrshlq_s64(vadjusted_product45, vleft_shift);
- const int64x2_t vscaled_acc67 = vrshlq_s64(vadjusted_product67, vleft_shift);
-
-#if XNN_ARCH_ARM64
- vacc_lo = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc01), vreinterpretq_s32_s64(vscaled_acc23));
- vacc_hi = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc45), vreinterpretq_s32_s64(vscaled_acc67));
-
- const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), voutput_zero_point);
-#else
- vacc_lo = vcombine_s32(vmovn_s64(vscaled_acc01), vmovn_s64(vscaled_acc23));
- vacc_hi = vcombine_s32(vmovn_s64(vscaled_acc45), vmovn_s64(vscaled_acc67));
-
- const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), voutput_zero_point);
-#endif
-
- uint8x8_t vout = vqmovun_s16(vacc);
- vout = vmax_u8(vout, voutput_min);
- vout = vmin_u8(vout, voutput_max);
-
- if (channels & 4) {
- vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout), 0); output += 4;
- vout = vext_u8(vout, vout, 4);
- }
- if (channels & 2) {
- vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout), 0); output += 2;
- vout = vext_u8(vout, vout, 2);
- }
- if (channels & 1) {
- vst1_lane_u8(output, vout, 0);
- }
- }
-}
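Note: the removed NEON kernels requantize each 32-bit accumulator with a widening multiply, a -1 adjustment for negative accumulators (added via the all-ones comparison mask), and vrshlq_s64, which rounds while shifting right because the left_shift parameter is negative. An illustrative scalar sketch of that sequence (not part of the library; names are placeholders):

#include <stdint.h>

// Illustrative scalar equivalent of the removed NEON requantization above.
// `shift` is the positive right-shift amount, i.e. -left_shift.
static inline int32_t requantize_rounding_shift(int32_t acc, int32_t multiplier, uint32_t shift) {
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  const int64_t adjusted = product - (int64_t) (acc < 0);   // mirrors adding the vneg_mask lanes
  const int64_t rounding = INT64_C(1) << (shift - 1);       // vrshlq_s64 rounds to nearest when shifting right
  return (int32_t) ((adjusted + rounding) >> shift);
}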
diff --git a/src/qu8-gavgpool/7p7x-minmax-scalar-c1.c b/src/qu8-gavgpool/7p7x-minmax-scalar-c1.c
deleted file mode 100644
index d754ed4..0000000
--- a/src/qu8-gavgpool/7p7x-minmax-scalar-c1.c
+++ /dev/null
@@ -1,163 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/gavgpool.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1(
- size_t rows,
- size_t channels,
- const uint8_t* input,
- size_t input_stride,
- const uint8_t* zero,
- int32_t* buffer,
- uint8_t* output,
- const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
- assert(rows > 7);
- assert(channels != 0);
-
- const uint8_t* i0 = input;
- const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
- const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
- const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
- const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
- const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
- const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - channels;
-
- // First pass.
- {
- const int32_t vbias = params->scalar.bias;
-
- int32_t* b = buffer;
- size_t c = channels;
- do {
- const uint32_t vi0 = (uint32_t) *i0++;
- const uint32_t vi1 = (uint32_t) *i1++;
- const uint32_t vi2 = (uint32_t) *i2++;
- const uint32_t vi3 = (uint32_t) *i3++;
- const uint32_t vi4 = (uint32_t) *i4++;
- const uint32_t vi5 = (uint32_t) *i5++;
- const uint32_t vi6 = (uint32_t) *i6++;
-
- const uint32_t vsum01 = vi0 + vi1;
- const uint32_t vsum23 = vi2 + vi3;
- const uint32_t vsum45 = vi4 + vi5;
-
- const uint32_t vsum016 = vsum01 + vi6;
- const uint32_t vsum2345 = vsum23 + vsum45;
-
- const uint32_t vsum = vsum016 + vsum2345;
- const int32_t vacc = vbias + (int32_t) vsum;
-
- *b++ = vacc;
- } while (--c != 0);
- }
- // Intermediate passes.
- for (rows -= 7; rows > 7; rows -= 7) {
- i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
- i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
- i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
- i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
- i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
- i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
- i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
-
- int32_t* b = buffer;
- size_t c = channels;
- do {
- const uint32_t vi0 = (uint32_t) *i0++;
- const uint32_t vi1 = (uint32_t) *i1++;
- const uint32_t vi2 = (uint32_t) *i2++;
- const uint32_t vi3 = (uint32_t) *i3++;
- const uint32_t vi4 = (uint32_t) *i4++;
- const uint32_t vi5 = (uint32_t) *i5++;
- const uint32_t vi6 = (uint32_t) *i6++;
-
- const uint32_t vsum01 = vi0 + vi1;
- const uint32_t vsum23 = vi2 + vi3;
- const uint32_t vsum45 = vi4 + vi5;
-
- const uint32_t vsum016 = vsum01 + vi6;
- const uint32_t vsum2345 = vsum23 + vsum45;
-
- const uint32_t vsum = vsum016 + vsum2345;
-
- *b++ += (int32_t) vsum;
- } while (--c != 0);
- }
-
- // Last pass.
- {
- const int32_t vmultiplier = params->scalar.multiplier;
- const int64_t vrounding = params->scalar.rounding;
- const uint32_t vshift = params->scalar.right_shift;
- const int32_t voutput_min = params->scalar.output_min_less_zero_point;
- const int32_t voutput_max = params->scalar.output_max_less_zero_point;
- const int32_t voutput_zero_point = params->scalar.output_zero_point;
-
- i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
- i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
- if (rows < 2) {
- i1 = zero;
- }
- i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
- if (rows <= 2) {
- i2 = zero;
- }
- i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
- if (rows < 4) {
- i3 = zero;
- }
- i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
- if (rows <= 4) {
- i4 = zero;
- }
- i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
- if (rows < 6) {
- i5 = zero;
- }
- i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
- if (rows <= 6) {
- i6 = zero;
- }
-
- int32_t* b = buffer;
- size_t c = channels;
- do {
- int32_t vacc = *b++;
- const uint32_t vi0 = (uint32_t) *i0++;
- const uint32_t vi1 = (uint32_t) *i1++;
- const uint32_t vi2 = (uint32_t) *i2++;
- const uint32_t vi3 = (uint32_t) *i3++;
- const uint32_t vi4 = (uint32_t) *i4++;
- const uint32_t vi5 = (uint32_t) *i5++;
- const uint32_t vi6 = (uint32_t) *i6++;
-
- const uint32_t vsum01 = vi0 + vi1;
- const uint32_t vsum23 = vi2 + vi3;
- const uint32_t vsum45 = vi4 + vi5;
-
- const uint32_t vsum016 = vsum01 + vi6;
- const uint32_t vsum2345 = vsum23 + vsum45;
-
- const uint32_t vsum = vsum016 + vsum2345;
- vacc += (int32_t) vsum;
-
- const int64_t vproduct = (int64_t) vacc * (int64_t) vmultiplier;
- const int64_t vadjusted_product = vproduct - (int64_t) (vacc < 0);
- int32_t vout = (int32_t) asr_s64(vadjusted_product + vrounding, vshift);
- vout = vout < voutput_min ? voutput_min : vout;
- vout = vout > voutput_max ? voutput_max : vout;
- vout += voutput_zero_point;
-
- *output++ = (uint8_t) vout;
- } while (--c != 0);
- }
-}
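Note: the scalar kernel clamps against precomputed bounds that already have the output zero point subtracted, and only then adds output_zero_point. An illustrative sketch of that ordering (not part of the library):

#include <stdint.h>

// clamp(x, min - zp, max - zp) + zp is equivalent to clamp(x + zp, min, max),
// which is why the params carry output_min/max_less_zero_point.
static inline uint8_t clamp_then_offset(int32_t x, int32_t output_min_less_zero_point,
                                         int32_t output_max_less_zero_point, int32_t output_zero_point) {
  x = x < output_min_less_zero_point ? output_min_less_zero_point : x;
  x = x > output_max_less_zero_point ? output_max_less_zero_point : x;
  return (uint8_t) (x + output_zero_point);  // already inside [output_min, output_max]
}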
diff --git a/src/qu8-gavgpool/7p7x-minmax-sse2-c8.c b/src/qu8-gavgpool/7p7x-minmax-sse2-c8.c
deleted file mode 100644
index 6320fcf..0000000
--- a/src/qu8-gavgpool/7p7x-minmax-sse2-c8.c
+++ /dev/null
@@ -1,305 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gavgpool.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qu8_gavgpool_minmax_ukernel_7p7x__sse2_c8(
- size_t rows,
- size_t channels,
- const uint8_t* input,
- size_t input_stride,
- const uint8_t* zero,
- int32_t* buffer,
- uint8_t* output,
- const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
- assert(rows > 7);
- assert(channels != 0);
-
- const uint8_t* i0 = input;
- const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
- const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
- const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
- const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
- const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
- const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t packed_channels = round_up_po2(channels, 8);
- const size_t input_increment = 7 * input_stride - packed_channels;
- const __m128i vbias = _mm_load_si128((const __m128i*) &params->sse2.bias);
- const __m128i vzero = _mm_setzero_si128();
-
- int32_t* acc = buffer;
- for (size_t c = 0; c < channels; c += 8) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
-
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
-
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
-
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
-
- const __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
- const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
-
- _mm_store_si128((__m128i*) acc, vacc_lo);
- _mm_store_si128((__m128i*) acc + 1, vacc_hi);
- acc += 8;
- }
- for (rows -= 7; rows > 7; rows -= 7) {
- acc = buffer;
- i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
- i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
- i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
- i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
- i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
- i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
- i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
-
- for (size_t c = 0; c < channels; c += 8) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
- __m128i vacc_lo = _mm_load_si128((const __m128i*) acc);
- __m128i vacc_hi = _mm_load_si128((const __m128i*) acc + 1);
-
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
-
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
-
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
-
- vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
- vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
-
- _mm_store_si128((__m128i*) acc, vacc_lo);
- _mm_store_si128((__m128i*) acc + 1, vacc_hi);
- acc += 8;
- }
- }
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
- const __m128i vright_shift = _mm_loadl_epi64((const __m128i*) params->sse2.right_shift);
-
- i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
- i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
- if (rows < 2) {
- i1 = zero;
- }
- i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
- if (rows <= 2) {
- i2 = zero;
- }
- i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
- if (rows < 4) {
- i3 = zero;
- }
- i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
- if (rows <= 4) {
- i4 = zero;
- }
- i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
- if (rows < 6) {
- i5 = zero;
- }
- i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
- if (rows <= 6) {
- i6 = zero;
- }
-
- acc = buffer;
- while (channels >= 8) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
- __m128i vacc_lo = _mm_load_si128((const __m128i*) acc);
- __m128i vacc_hi = _mm_load_si128((const __m128i*) acc + 1);
- acc += 8;
-
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
-
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
-
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
-
- vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
- vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
-
- const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
- const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
-
- const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
-
- const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
-
- const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
- const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
-
- const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
- const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
-
- const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
- const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
- const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
- const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
-
- const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
-
- __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
- vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point));
- vout = _mm_packus_epi16(vout, vout);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max));
- vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_min));
-
- _mm_storel_epi64((__m128i*) output, vout); output += 8;
-
- channels -= 8;
- }
- if (channels != 0) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0);
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1);
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2);
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3);
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4);
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5);
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6);
- __m128i vacc_lo = _mm_load_si128((const __m128i*) acc);
- __m128i vacc_hi = _mm_load_si128((const __m128i*) acc + 1);
-
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
-
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
-
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
-
- vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
- vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
-
- const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
- const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
-
- const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
-
- const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
-
- const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
- const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
-
- const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
- const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
-
- const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
- const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
- const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
- const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
-
- const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
-
- __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
- vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point));
- vout = _mm_packus_epi16(vout, vout);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max));
- vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_min));
-
- if (channels & 4) {
- *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout);
- output += 4;
- vout = _mm_srli_epi64(vout, 32);
- }
- if (channels & 2) {
- *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout, 0);
- output += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (channels & 1) {
- *((uint8_t*) output) = (uint8_t) _mm_cvtsi128_si32(vout);
- }
- }
-}
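Note: SSE2 has no signed 32x32->64-bit multiply, so the kernel above takes absolute values, multiplies them with _mm_mul_epu32, shifts, and restores the sign with the same xor/subtract identity. An illustrative scalar form of that identity (not part of the library; assumes arithmetic right shift of signed values):

#include <stdint.h>

// mask is all-ones (-1) for negative acc, zero otherwise — the scalar analogue
// of _mm_cmpgt_epi32(_mm_setzero_si128(), vacc).
static inline uint32_t abs_via_mask(int32_t acc) {
  const int32_t mask = acc >> 31;
  return ((uint32_t) (acc ^ mask)) - (uint32_t) mask;  // (x ^ mask) - mask == |x|
}
// After the unsigned multiply and shift, applying (x ^ mask) - mask again restores the sign.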
diff --git a/src/qu8-gavgpool/7x-minmax-neon-c8.c b/src/qu8-gavgpool/7x-minmax-neon-c8.c
deleted file mode 100644
index d9c6320..0000000
--- a/src/qu8-gavgpool/7x-minmax-neon-c8.c
+++ /dev/null
@@ -1,214 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/gavgpool.h>
-
-
-void xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8(
- size_t rows,
- size_t channels,
- const uint8_t* input,
- size_t input_stride,
- const uint8_t* zero,
- uint8_t* output,
- const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
- assert(rows != 0);
- assert(rows <= 7);
- assert(channels != 0);
-
- const uint8_t* i0 = input;
- const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
- if (rows < 2) {
- i1 = zero;
- }
- const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
- if (rows <= 2) {
- i2 = zero;
- }
- const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
- if (rows < 4) {
- i3 = zero;
- }
- const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
- if (rows <= 4) {
- i4 = zero;
- }
- const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
- if (rows < 6) {
- i5 = zero;
- }
- const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- if (rows <= 6) {
- i6 = zero;
- }
-
- const int32x4_t vbias = vld1q_dup_s32(&params->neon.bias);
-#if XNN_ARCH_ARM64
- const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
-#else
- const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);
-#endif
- const int64x2_t vleft_shift = vld1q_dup_s64(&params->neon.left_shift);
- const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
- const uint8x8_t voutput_min = vld1_dup_u8(&params->neon.output_min);
- const uint8x8_t voutput_max = vld1_dup_u8(&params->neon.output_max);
- while (channels >= 8) {
- const uint8x8_t vi0 = vld1_u8(i0); i0 += 8;
- const uint8x8_t vi1 = vld1_u8(i1); i1 += 8;
- const uint8x8_t vi2 = vld1_u8(i2); i2 += 8;
- const uint8x8_t vi3 = vld1_u8(i3); i3 += 8;
- const uint8x8_t vi4 = vld1_u8(i4); i4 += 8;
- const uint8x8_t vi5 = vld1_u8(i5); i5 += 8;
- const uint8x8_t vi6 = vld1_u8(i6); i6 += 8;
-
- const uint16x8_t vsum01 = vaddl_u8(vi0, vi1);
- const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);
- const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);
-
- const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6);
- const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);
-
- const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));
- int32x4_t vacc_lo = vaddw_s16(vbias, vget_low_s16(vsum));
- int32x4_t vacc_hi = vaddw_s16(vbias, vget_high_s16(vsum));
-
- const int32x4_t vneg_mask_lo = vreinterpretq_s32_u32(vcltq_s32(vacc_lo, vmovq_n_s32(0)));
- const int32x4_t vneg_mask_hi = vreinterpretq_s32_u32(vcltq_s32(vacc_hi, vmovq_n_s32(0)));
-
-#if XNN_ARCH_ARM64
- const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vget_low_s32(vmultiplier));
- const int64x2_t vproduct23 = vmull_high_s32(vacc_lo, vmultiplier);
- const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
- const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
-
- const int64x2_t vadjusted_product01 = vaddw_s32(vproduct01, vget_low_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product23 = vaddw_high_s32(vproduct23, vneg_mask_lo);
- const int64x2_t vadjusted_product45 = vaddw_s32(vproduct45, vget_low_s32(vneg_mask_hi));
- const int64x2_t vadjusted_product67 = vaddw_high_s32(vproduct67, vneg_mask_hi);
-#else
- const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vmultiplier);
- const int64x2_t vproduct23 = vmull_s32(vget_high_s32(vacc_lo), vmultiplier);
- const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
- const int64x2_t vproduct67 = vmull_s32(vget_high_s32(vacc_hi), vmultiplier);
-
- const int64x2_t vadjusted_product01 = vaddw_s32(vproduct01, vget_low_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product23 = vaddw_s32(vproduct23, vget_high_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product45 = vaddw_s32(vproduct45, vget_low_s32(vneg_mask_hi));
- const int64x2_t vadjusted_product67 = vaddw_s32(vproduct67, vget_high_s32(vneg_mask_hi));
-#endif
-
- const int64x2_t vscaled_acc01 = vrshlq_s64(vadjusted_product01, vleft_shift);
- const int64x2_t vscaled_acc23 = vrshlq_s64(vadjusted_product23, vleft_shift);
- const int64x2_t vscaled_acc45 = vrshlq_s64(vadjusted_product45, vleft_shift);
- const int64x2_t vscaled_acc67 = vrshlq_s64(vadjusted_product67, vleft_shift);
-
-#if XNN_ARCH_ARM64
- vacc_lo = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc01), vreinterpretq_s32_s64(vscaled_acc23));
- vacc_hi = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc45), vreinterpretq_s32_s64(vscaled_acc67));
-
- const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), voutput_zero_point);
-#else
- vacc_lo = vcombine_s32(vmovn_s64(vscaled_acc01), vmovn_s64(vscaled_acc23));
- vacc_hi = vcombine_s32(vmovn_s64(vscaled_acc45), vmovn_s64(vscaled_acc67));
-
- const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), voutput_zero_point);
-#endif
-
- uint8x8_t vout = vqmovun_s16(vacc);
- vout = vmax_u8(vout, voutput_min);
- vout = vmin_u8(vout, voutput_max);
-
- vst1_u8(output, vout); output += 8;
-
- channels -= 8;
- }
- if (channels != 0) {
- const uint8x8_t vi0 = vld1_u8(i0);
- const uint8x8_t vi1 = vld1_u8(i1);
- const uint8x8_t vi2 = vld1_u8(i2);
- const uint8x8_t vi3 = vld1_u8(i3);
- const uint8x8_t vi4 = vld1_u8(i4);
- const uint8x8_t vi5 = vld1_u8(i5);
- const uint8x8_t vi6 = vld1_u8(i6);
-
- const uint16x8_t vsum01 = vaddl_u8(vi0, vi1);
- const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);
- const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);
-
- const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6);
- const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);
-
- const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));
- int32x4_t vacc_lo = vaddw_s16(vbias, vget_low_s16(vsum));
- int32x4_t vacc_hi = vaddw_s16(vbias, vget_high_s16(vsum));
-
- const int32x4_t vneg_mask_lo = vreinterpretq_s32_u32(vcltq_s32(vacc_lo, vmovq_n_s32(0)));
- const int32x4_t vneg_mask_hi = vreinterpretq_s32_u32(vcltq_s32(vacc_hi, vmovq_n_s32(0)));
-
-#if XNN_ARCH_ARM64
- const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vget_low_s32(vmultiplier));
- const int64x2_t vproduct23 = vmull_high_s32(vacc_lo, vmultiplier);
- const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
- const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
-
- const int64x2_t vadjusted_product01 = vaddw_s32(vproduct01, vget_low_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product23 = vaddw_high_s32(vproduct23, vneg_mask_lo);
- const int64x2_t vadjusted_product45 = vaddw_s32(vproduct45, vget_low_s32(vneg_mask_hi));
- const int64x2_t vadjusted_product67 = vaddw_high_s32(vproduct67, vneg_mask_hi);
-#else
- const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vmultiplier);
- const int64x2_t vproduct23 = vmull_s32(vget_high_s32(vacc_lo), vmultiplier);
- const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
- const int64x2_t vproduct67 = vmull_s32(vget_high_s32(vacc_hi), vmultiplier);
-
- const int64x2_t vadjusted_product01 = vaddw_s32(vproduct01, vget_low_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product23 = vaddw_s32(vproduct23, vget_high_s32(vneg_mask_lo));
- const int64x2_t vadjusted_product45 = vaddw_s32(vproduct45, vget_low_s32(vneg_mask_hi));
- const int64x2_t vadjusted_product67 = vaddw_s32(vproduct67, vget_high_s32(vneg_mask_hi));
-#endif
-
- const int64x2_t vscaled_acc01 = vrshlq_s64(vadjusted_product01, vleft_shift);
- const int64x2_t vscaled_acc23 = vrshlq_s64(vadjusted_product23, vleft_shift);
- const int64x2_t vscaled_acc45 = vrshlq_s64(vadjusted_product45, vleft_shift);
- const int64x2_t vscaled_acc67 = vrshlq_s64(vadjusted_product67, vleft_shift);
-
-#if XNN_ARCH_ARM64
- vacc_lo = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc01), vreinterpretq_s32_s64(vscaled_acc23));
- vacc_hi = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc45), vreinterpretq_s32_s64(vscaled_acc67));
-
- const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), voutput_zero_point);
-#else
- vacc_lo = vcombine_s32(vmovn_s64(vscaled_acc01), vmovn_s64(vscaled_acc23));
- vacc_hi = vcombine_s32(vmovn_s64(vscaled_acc45), vmovn_s64(vscaled_acc67));
-
- const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), voutput_zero_point);
-#endif
-
- uint8x8_t vout = vqmovun_s16(vacc);
- vout = vmax_u8(vout, voutput_min);
- vout = vmin_u8(vout, voutput_max);
-
- if (channels & 4) {
- vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout), 0); output += 4;
- vout = vext_u8(vout, vout, 4);
- }
- if (channels & 2) {
- vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout), 0); output += 2;
- vout = vext_u8(vout, vout, 2);
- }
- if (channels & 1) {
- vst1_lane_u8(output, vout, 0);
- }
- }
-}
diff --git a/src/qu8-gavgpool/7x-minmax-scalar-c1.c b/src/qu8-gavgpool/7x-minmax-scalar-c1.c
deleted file mode 100644
index ac9094a..0000000
--- a/src/qu8-gavgpool/7x-minmax-scalar-c1.c
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/gavgpool.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1(
- size_t rows,
- size_t channels,
- const uint8_t* input,
- size_t input_stride,
- const uint8_t* zero,
- uint8_t* output,
- const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
- assert(rows != 0);
- assert(rows <= 7);
- assert(channels != 0);
-
- const uint8_t* i0 = input;
- const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
- if (rows < 2) {
- i1 = zero;
- }
- const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
- if (rows <= 2) {
- i2 = zero;
- }
- const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
- if (rows < 4) {
- i3 = zero;
- }
- const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
- if (rows <= 4) {
- i4 = zero;
- }
- const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
- if (rows < 6) {
- i5 = zero;
- }
- const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- if (rows <= 6) {
- i6 = zero;
- }
-
- const int32_t vbias = params->scalar.bias;
- const int32_t vmultiplier = params->scalar.multiplier;
- const int64_t vrounding = params->scalar.rounding;
- const uint32_t vshift = params->scalar.right_shift;
- const int32_t voutput_min = params->scalar.output_min_less_zero_point;
- const int32_t voutput_max = params->scalar.output_max_less_zero_point;
- const int32_t voutput_zero_point = params->scalar.output_zero_point;
- do {
- const uint32_t vi0 = (uint32_t) *i0++;
- const uint32_t vi1 = (uint32_t) *i1++;
- const uint32_t vi2 = (uint32_t) *i2++;
- const uint32_t vi3 = (uint32_t) *i3++;
- const uint32_t vi4 = (uint32_t) *i4++;
- const uint32_t vi5 = (uint32_t) *i5++;
- const uint32_t vi6 = (uint32_t) *i6++;
-
- const uint32_t vsum01 = vi0 + vi1;
- const uint32_t vsum23 = vi2 + vi3;
- const uint32_t vsum45 = vi4 + vi5;
-
- const uint32_t vsum016 = vsum01 + vi6;
- const uint32_t vsum2345 = vsum23 + vsum45;
-
- const uint32_t vsum = vsum016 + vsum2345;
- const int32_t vacc = vbias + (int32_t) vsum;
-
- const int64_t vproduct = (int64_t) vacc * (int64_t) vmultiplier;
- const int64_t vadjusted_product = vproduct - (int64_t) (vacc < 0);
- int32_t vout = (int32_t) asr_s64(vadjusted_product + vrounding, vshift);
- vout = vout < voutput_min ? voutput_min : vout;
- vout = vout > voutput_max ? voutput_max : vout;
- vout += voutput_zero_point;
-
- *output++ = (uint8_t) vout;
- } while (--channels != 0);
-}
diff --git a/src/qu8-gavgpool/7x-minmax-sse2-c8.c b/src/qu8-gavgpool/7x-minmax-sse2-c8.c
deleted file mode 100644
index 09a96f8..0000000
--- a/src/qu8-gavgpool/7x-minmax-sse2-c8.c
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gavgpool.h>
-
-
-void xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8(
- size_t rows,
- size_t channels,
- const uint8_t* input,
- size_t input_stride,
- const uint8_t* zero,
- uint8_t* output,
- const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
- assert(rows != 0);
- assert(rows <= 7);
- assert(channels != 0);
-
- const uint8_t* i0 = input;
- const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
- if (rows < 2) {
- i1 = zero;
- }
- const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
- if (rows <= 2) {
- i2 = zero;
- }
- const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
- if (rows < 4) {
- i3 = zero;
- }
- const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
- if (rows <= 4) {
- i4 = zero;
- }
- const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
- if (rows < 6) {
- i5 = zero;
- }
- const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- if (rows <= 6) {
- i6 = zero;
- }
-
- const __m128i vbias = _mm_load_si128((const __m128i*) &params->sse2.bias);
- const __m128i vzero = _mm_setzero_si128();
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
- const __m128i vright_shift = _mm_loadl_epi64((const __m128i*) params->sse2.right_shift);
-
- while (channels >= 8) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
-
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
-
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
-
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
-
- __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
- __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
-
- const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
- const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
-
- const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
-
- const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
-
- const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
- const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
-
- const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
- const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
-
- const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
- const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
- const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
- const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
-
- const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
-
- __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
- vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point));
- vout = _mm_packus_epi16(vout, vout);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max));
- vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_min));
-
- _mm_storel_epi64((__m128i*) output, vout); output += 8;
-
- channels -= 8;
- }
- if (channels != 0) {
- const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0);
- const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1);
- const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2);
- const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3);
- const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4);
- const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5);
- const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6);
-
- const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
- const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
- const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
- const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
- const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
- const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
- const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
-
- const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
- const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
- const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
-
- const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);
- const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
- const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);
-
- __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
- __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
-
- const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
- const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
-
- const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
-
- const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
-
- const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
- const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
-
- const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
- const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
-
- const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
- const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
- const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
- const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
-
- const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
- const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
-
- __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
- vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point));
- vout = _mm_packus_epi16(vout, vout);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max));
- vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_min));
-
- if (channels & 4) {
- *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout);
- output += 4;
- vout = _mm_srli_epi64(vout, 32);
- }
- if (channels & 2) {
- *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout, 0);
- output += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (channels & 1) {
- *((uint8_t*) output) = (uint8_t) _mm_cvtsi128_si32(vout);
- }
- }
-}
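Note: the generated replacements below switch from the fixed-point requantization above to the fp32 scheme of the QS8 templates: convert the accumulator to float, multiply by the scale, add a magic bias so the rounded integer lands in the mantissa, reinterpret as int32, and subtract magic_bias_less_output_zero_point. An illustrative scalar sketch, assuming the common 0x1.8p+23f magic constant (the actual values come from the params struct, and the kernels additionally use saturating subtraction and narrowing):

#include <stdint.h>
#include <string.h>

static inline int32_t fp32_requantize(int32_t acc, float scale, int32_t output_zero_point) {
  const float magic_bias = 0x1.8p+23f;                                       // bit pattern 0x4B400000
  const int32_t magic_bias_less_output_zero_point = INT32_C(0x4B400000) - output_zero_point;
  const float fpacc = (float) acc * scale + magic_bias;                      // rounded integer now sits in the mantissa
  int32_t bits;
  memcpy(&bits, &fpacc, sizeof(bits));                                       // scalar vreinterpretq_s32_f32
  return bits - magic_bias_less_output_zero_point;
}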
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
new file mode 100644
index 0000000..db34fc5
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
@@ -0,0 +1,315 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 16)) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 16)) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(b + 8);
+ int32x4_t vaccCDEF = vld1q_s32(b + 12);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+ const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+ const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
+ for (; channels >= 16; channels -= 16) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+ vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias));
+ vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ #endif // !XNN_ARCH_ARM64
+
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
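
Note on the accumulation pattern used throughout the unsigned kernels above: the signed int32 accumulator is reinterpreted as uint32 (vreinterpretq_u32_s32) only so that vaddw_u16 can widen-and-add the uint16 row sums, then reinterpreted back. Two's-complement addition produces the same bit pattern regardless of signedness, so the round-trip is exact even when init_bias makes the accumulator negative. A minimal scalar sketch of that equivalence (the values are illustrative, not taken from this diff):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  int32_t acc = -1024;    /* e.g. a negative init_bias folded into the accumulator */
  uint16_t vsum = 40000;  /* a 7-row sum of uint8 inputs always fits in uint16 */

  /* Reinterpret, add as unsigned, reinterpret back - mirrors
     vreinterpretq_u32_s32 + vaddw_u16 + vreinterpretq_s32_u32.
     The result here fits in int32, so the final cast is well defined. */
  uint32_t bits = (uint32_t) acc + (uint32_t) vsum;
  int32_t result = (int32_t) bits;

  printf("%d\n", result);  /* prints 38976, same as plain signed acc + vsum */
  return 0;
}
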
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
new file mode 100644
index 0000000..c9ec0a7
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
@@ -0,0 +1,437 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+ const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
+ const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ vst1q_s32(b, vaccGHIJ); b += 4;
+ vst1q_s32(b, vaccKLMN); b += 4;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(b + 8);
+ int32x4_t vaccCDEF = vld1q_s32(b + 12);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ int32x4_t vaccGHIJ = vld1q_s32(b + 16);
+ int32x4_t vaccKLMN = vld1q_s32(b + 20);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+ vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
+ vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ vst1q_s32(b, vaccGHIJ); b += 4;
+ vst1q_s32(b, vaccKLMN); b += 4;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+  const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
+  const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
+ for (; channels >= 24; channels -= 24) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+ vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
+ vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+ float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
+ float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+ vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
+ vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+ vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias));
+ vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias));
+ vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias));
+ vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
+ vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point);
+ vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ #endif // !XNN_ARCH_ARM64
+
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min));
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max));
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ vst1_u8(output, voutGHIJKLMN); output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
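
Note on the final-pass requantization in the kernels above: the int32 accumulator is converted to float, scaled, and rounded with the fp32 "magic bias" trick: adding a large bias rounds the product to the nearest integer and parks it in the low mantissa bits, after which reinterpreting the bits and subtracting (bias bits - output zero point) recovers the rounded, zero-point-shifted value. A minimal scalar sketch follows; the constants 12582912.0f (0x1.8p23) and 0x4B400000 are the values such a scheme implies, not values read from this diff, and the real kernels saturate via vqsubq_s32/vqmovn/vqmovun rather than explicit compares.

#include <stdint.h>
#include <string.h>

/* Scalar sketch of the fp32 magic-bias requantization (assumed constants). */
static uint8_t requantize_sketch(int32_t acc, float scale,
                                 int32_t output_zero_point,
                                 uint8_t output_min, uint8_t output_max) {
  const float vmagic_bias = 12582912.0f;  /* 0x1.8p23f, bit pattern 0x4B400000 */
  const int32_t vmagic_bias_less_output_zero_point =
      INT32_C(0x4B400000) - output_zero_point;

  /* Adding the bias rounds scale*acc to the nearest integer and leaves that
     integer in the low mantissa bits of the float sum. */
  float vfpacc = (float) acc * scale + vmagic_bias;

  int32_t bits;
  memcpy(&bits, &vfpacc, sizeof bits);  /* like vreinterpretq_s32_f32 */

  /* Subtracting the bias bits recovers the rounded value and folds in the
     output zero point in one step (the kernels use a saturating vqsubq_s32). */
  int32_t out = bits - vmagic_bias_less_output_zero_point;

  /* The kernels clamp via vqmovn/vqmovun plus vmax/vmin; the net effect on
     in-range values is the same as this explicit clamp. */
  if (out < (int32_t) output_min) out = output_min;
  if (out > (int32_t) output_max) out = output_max;
  return (uint8_t) out;
}
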
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
new file mode 100644
index 0000000..c4f4c28
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
@@ -0,0 +1,500 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 32; c -= 32) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+ const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
+ const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));
+ const int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV)));
+ const int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ vst1q_s32(b, vaccGHIJ); b += 4;
+ vst1q_s32(b, vaccKLMN); b += 4;
+ vst1q_s32(b, vaccOPQR); b += 4;
+ vst1q_s32(b, vaccSTUV); b += 4;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 32; c -= 32) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(b + 8);
+ int32x4_t vaccCDEF = vld1q_s32(b + 12);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ int32x4_t vaccGHIJ = vld1q_s32(b + 16);
+ int32x4_t vaccKLMN = vld1q_s32(b + 20);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ int32x4_t vaccOPQR = vld1q_s32(b + 24);
+ int32x4_t vaccSTUV = vld1q_s32(b + 28);
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+ vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
+ vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));
+ vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV)));
+ vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ vst1q_s32(b, vaccGHIJ); b += 4;
+ vst1q_s32(b, vaccKLMN); b += 4;
+ vst1q_s32(b, vaccOPQR); b += 4;
+ vst1q_s32(b, vaccSTUV); b += 4;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+  const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
+  const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
+ for (; channels >= 32; channels -= 32) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+ vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
+ vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));
+ vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV)));
+ vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+ float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
+ float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);
+ float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR);
+ float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+ vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
+ vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);
+ vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale);
+ vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+ vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias));
+ vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias));
+ vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias));
+ vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias));
+ vaccOPQR = vreinterpretq_s32_f32(vaddq_f32(vfpaccOPQR, vmagic_bias));
+ vaccSTUV = vreinterpretq_s32_f32(vaddq_f32(vfpaccSTUV, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
+ vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point);
+ vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point);
+ vaccOPQR = vqsubq_s32(vaccOPQR, vmagic_bias_less_output_zero_point);
+ vaccSTUV = vqsubq_s32(vaccSTUV, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
+ #endif // !XNN_ARCH_ARM64
+
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV));
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
new file mode 100644
index 0000000..78b89eb
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
@@ -0,0 +1,246 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+  const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min);
+  const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max);
+ for (; channels >= 8; channels -= 8) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif // !XNN_ARCH_ARM64
+
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
+
+ vst1_u8(output, vout01234567); output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0);
+ const uint8x8_t vi1x01234567 = vld1_u8(i1);
+ const uint8x8_t vi2x01234567 = vld1_u8(i2);
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3);
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5);
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6);
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
+
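+ // Write the remaining 1-7 channels: 4, 2, then 1 lanes as needed, rotating
+ // the vector with vext so the next channels move into lane 0.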
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0);
+ }
+ }
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
new file mode 100644
index 0000000..d9989f1
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
@@ -0,0 +1,310 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
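+ // Each pass advances the row pointers by round_up_po2(channels, 16) bytes;
+ // adding input_increment moves them 7 rows forward and back to channel 0.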
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
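+ // First pass: sum the first 7 rows, add the init bias, and spill the 32-bit
+ // accumulators to the scratch buffer; later passes accumulate 7 more rows at
+ // a time into the same buffer. doz(c, 16) subtracts 16 but saturates at zero,
+ // so a partial final group is still processed (the kernel is annotated
+ // XNN_OOB_READS and may read up to the rounded-up channel count).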
+ for (; c != 0; c = doz(c, 16)) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 16)) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(b + 8);
+ int32x4_t vaccCDEF = vld1q_s32(b + 12);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+ for (; channels >= 16; channels -= 16) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+
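+ // NEONv8 path: vcvtnq_s32_f32 (FCVTNS) converts with round-to-nearest-even,
+ // so no magic-bias trick is needed; the output zero point is added after
+ // narrowing to 16 bits.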
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+ vacc89AB = vcvtnq_s32_f32(vfpacc89AB);
+ vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ #endif // !XNN_ARCH_ARM64
+
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+ vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
new file mode 100644
index 0000000..d3a230f
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
@@ -0,0 +1,431 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+ const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
+ const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ vst1q_s32(b, vaccGHIJ); b += 4;
+ vst1q_s32(b, vaccKLMN); b += 4;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(b + 8);
+ int32x4_t vaccCDEF = vld1q_s32(b + 12);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ int32x4_t vaccGHIJ = vld1q_s32(b + 16);
+ int32x4_t vaccKLMN = vld1q_s32(b + 20);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+ vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
+ vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ vst1q_s32(b, vaccGHIJ); b += 4;
+ vst1q_s32(b, vaccKLMN); b += 4;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+ for (; channels >= 24; channels -= 24) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+ vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
+ vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+ float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
+ float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+ vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
+ vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+ vacc89AB = vcvtnq_s32_f32(vfpacc89AB);
+ vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
+ vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ);
+ vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ #endif // !XNN_ARCH_ARM64
+
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+ vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
+ vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point);
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min));
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max));
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ vst1_u8(output, voutGHIJKLMN); output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
new file mode 100644
index 0000000..5b2031d
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
@@ -0,0 +1,493 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 32; c -= 32) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+ const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
+ const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));
+ const int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV)));
+ const int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ vst1q_s32(b, vaccGHIJ); b += 4;
+ vst1q_s32(b, vaccKLMN); b += 4;
+ vst1q_s32(b, vaccOPQR); b += 4;
+ vst1q_s32(b, vaccSTUV); b += 4;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 32; c -= 32) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(b + 8);
+ int32x4_t vaccCDEF = vld1q_s32(b + 12);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ int32x4_t vaccGHIJ = vld1q_s32(b + 16);
+ int32x4_t vaccKLMN = vld1q_s32(b + 20);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ int32x4_t vaccOPQR = vld1q_s32(b + 24);
+ int32x4_t vaccSTUV = vld1q_s32(b + 28);
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+ vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
+ vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));
+ vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV)));
+ vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ vst1q_s32(b, vacc89AB); b += 4;
+ vst1q_s32(b, vaccCDEF); b += 4;
+ vst1q_s32(b, vaccGHIJ); b += 4;
+ vst1q_s32(b, vaccKLMN); b += 4;
+ vst1q_s32(b, vaccOPQR); b += 4;
+ vst1q_s32(b, vaccSTUV); b += 4;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+ for (; channels >= 32; channels -= 32) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+ vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
+ vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
+ vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
+ vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));
+ vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV)));
+ vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+ float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
+ float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);
+ float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR);
+ float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+ vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
+ vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);
+ vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale);
+ vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+ vacc89AB = vcvtnq_s32_f32(vfpacc89AB);
+ vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
+ vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ);
+ vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN);
+ vaccOPQR = vcvtnq_s32_f32(vfpaccOPQR);
+ vaccSTUV = vcvtnq_s32_f32(vfpaccSTUV);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
+ #endif // !XNN_ARCH_ARM64
+
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+ vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
+ vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point);
+ vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point);
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV));
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
new file mode 100644
index 0000000..d9d5c65
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
@@ -0,0 +1,242 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
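+ // First pass: accumulate the first 7 input rows (plus the init bias) into the int32 scratch buffer.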
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ }
+
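+ // Intermediate passes: add 7 more rows into the buffer per iteration until at most 7 rows remain.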
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(b);
+ int32x4_t vacc4567 = vld1q_s32(b + 4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ vst1q_s32(b, vacc0123); b += 4;
+ vst1q_s32(b, vacc4567); b += 4;
+ }
+ }
+
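+ // Final pass: handle the last 1-7 rows; pointers for rows past the end are redirected to the zero buffer.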
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+ const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neonv8.output_max);
+ for (; channels >= 8; channels -= 8) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
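+ // fp32 requantization: scale in float, round to nearest-even (vcvtnq_s32_f32, FCVTNS on NEON v8), then saturate through int16 down to uint8.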
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif // !XNN_ARCH_ARM64
+
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ #if XNN_ARCH_ARM64
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ #else // !XNN_ARCH_ARM64
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ #endif // !XNN_ARCH_ARM64
+
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
+
+ vst1_u8(output, vout01234567); output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0);
+ const uint8x8_t vi1x01234567 = vld1_u8(i1);
+ const uint8x8_t vi2x01234567 = vld1_u8(i2);
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3);
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4);
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5);
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6);
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
+ int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
+ vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
+
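+ // Store the 1-7 remaining channels with lane stores of 4, 2 and 1 bytes, rotating the vector between stores.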
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0);
+ }
+ }
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
new file mode 100644
index 0000000..8b0b708
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
@@ -0,0 +1,157 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+
+ const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
+ int32_t* b = buffer;
+ size_t c = channels;
+ do {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ *b++ = vacc;
+ } while (--c != 0);
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ do {
+ int32_t vacc = *b;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ *b++ = vacc;
+ } while (--c != 0);
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float vscale = params->fp32_scalar_fmagic.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
+ const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
+ const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
+ do {
+ int32_t vacc = *buffer++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
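+ // fmagic requantization: clamp in the float domain, then add the magic bias so the rounded value lands in the low mantissa bits; reinterpreting the bits and subtracting the precomputed magic_bias_less_output_zero_point yields the quantized output.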
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc) - vmagic_bias_less_output_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
new file mode 100644
index 0000000..28b98a0
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
@@ -0,0 +1,263 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+
+ const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
+ int32_t* b = buffer;
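+ // Channels are processed two at a time, with per-channel loads and adds interleaved; this ordering is intended to expose instruction-level parallelism.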
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ i0 += 2;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b += 2;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
+ int32_t vacc0 = b[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = b[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ i0 += 2;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b += 2;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float vscale = params->fp32_scalar_fmagic.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
+ const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
+ const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
+ for (; channels >= 2; channels -= 2) {
+ int32_t vacc0 = buffer[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = buffer[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ buffer += 2;
+ i0 += 2;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+
+ vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+ vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+
+ vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+ vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+
+ vfpacc0 += vmagic_bias;
+ vfpacc1 += vmagic_bias;
+
+ int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0) - vmagic_bias_less_output_zero_point;
+ int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1) - vmagic_bias_less_output_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output += 2;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ int32_t vacc = *buffer;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc) - vmagic_bias_less_output_zero_point;
+
+ *output = (uint8_t) vout;
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
new file mode 100644
index 0000000..a063456
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
@@ -0,0 +1,369 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+
+ const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ i0 += 4;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ int32_t vacc2 = vi0x2 + vinit_bias;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ int32_t vacc3 = vi0x3 + vinit_bias;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b[2] = vacc2;
+ b[3] = vacc3;
+ b += 4;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
+ int32_t vacc0 = b[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = b[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ int32_t vacc2 = b[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ int32_t vacc3 = b[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ i0 += 4;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ vacc2 += vi0x2;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ vacc3 += vi0x3;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b[2] = vacc2;
+ b[3] = vacc3;
+ b += 4;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float vscale = params->fp32_scalar_fmagic.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
+ const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
+ const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
+ for (; channels >= 4; channels -= 4) {
+ int32_t vacc0 = buffer[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = buffer[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ int32_t vacc2 = buffer[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ int32_t vacc3 = buffer[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ buffer += 4;
+ i0 += 4;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ vacc2 += vi0x2;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ vacc3 += vi0x3;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+ float vfpacc2 = (float) vacc2 * vscale;
+ float vfpacc3 = (float) vacc3 * vscale;
+
+ vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+ vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+ vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
+ vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
+
+ vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+ vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+ vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
+ vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
+
+ vfpacc0 += vmagic_bias;
+ vfpacc1 += vmagic_bias;
+ vfpacc2 += vmagic_bias;
+ vfpacc3 += vmagic_bias;
+
+ int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0) - vmagic_bias_less_output_zero_point;
+ int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1) - vmagic_bias_less_output_zero_point;
+ int32_t vout2 = (int32_t) fp32_to_bits(vfpacc2) - vmagic_bias_less_output_zero_point;
+ int32_t vout3 = (int32_t) fp32_to_bits(vfpacc3) - vmagic_bias_less_output_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output[2] = (uint8_t) vout2;
+ output[3] = (uint8_t) vout3;
+ output += 4;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ int32_t vacc = *buffer++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc) - vmagic_bias_less_output_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
new file mode 100644
index 0000000..ca19e5b
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
@@ -0,0 +1,158 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+
+ const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
+ int32_t* b = buffer;
+ size_t c = channels;
+ do {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ *b++ = vacc;
+ } while (--c != 0);
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ do {
+ int32_t vacc = *b;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ *b++ = vacc;
+ } while (--c != 0);
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float vscale = params->fp32_scalar_imagic.scale;
+ const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+ const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+ const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+ const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+ do {
+ int32_t vacc = *buffer++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
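+ // imagic requantization: add the magic bias, then clamp and subtract magic_bias_less_zero_point entirely in the integer domain of the reinterpreted bits.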
+ float vfpacc = (float) vacc * vscale;
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc);
+ vout = math_max_s32(vout, vmagic_min);
+ vout = math_min_s32(vout, vmagic_max);
+ vout -= vmagic_bias_less_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
new file mode 100644
index 0000000..1fba58a
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
@@ -0,0 +1,267 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+
+ const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ i0 += 2;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b += 2;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
+ int32_t vacc0 = b[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = b[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ i0 += 2;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b += 2;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float vscale = params->fp32_scalar_imagic.scale;
+ const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+ const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+ const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+ const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+ for (; channels >= 2; channels -= 2) {
+ int32_t vacc0 = buffer[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = buffer[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ buffer += 2;
+ i0 += 2;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+
+ vfpacc0 += vmagic_bias;
+ vfpacc1 += vmagic_bias;
+
+ int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0);
+ int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1);
+
+ vout0 = math_max_s32(vout0, vmagic_min);
+ vout1 = math_max_s32(vout1, vmagic_min);
+
+ vout0 = math_min_s32(vout0, vmagic_max);
+ vout1 = math_min_s32(vout1, vmagic_max);
+
+ vout0 -= vmagic_bias_less_zero_point;
+ vout1 -= vmagic_bias_less_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output += 2;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ int32_t vacc = *buffer;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc);
+ vout = math_max_s32(vout, vmagic_min);
+ vout = math_min_s32(vout, vmagic_max);
+ vout -= vmagic_bias_less_zero_point;
+
+ *output = (uint8_t) vout;
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
new file mode 100644
index 0000000..4b44df1
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
@@ -0,0 +1,375 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+
+ const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ i0 += 4;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ int32_t vacc2 = vi0x2 + vinit_bias;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ int32_t vacc3 = vi0x3 + vinit_bias;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b[2] = vacc2;
+ b[3] = vacc3;
+ b += 4;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
+ int32_t vacc0 = b[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = b[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ int32_t vacc2 = b[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ int32_t vacc3 = b[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ i0 += 4;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ vacc2 += vi0x2;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ vacc3 += vi0x3;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b[2] = vacc2;
+ b[3] = vacc3;
+ b += 4;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float vscale = params->fp32_scalar_imagic.scale;
+ const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+ const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+ const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+ const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+ for (; channels >= 4; channels -= 4) {
+ int32_t vacc0 = buffer[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = buffer[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ int32_t vacc2 = buffer[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ int32_t vacc3 = buffer[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ buffer += 4;
+ i0 += 4;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ vacc2 += vi0x2;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ vacc3 += vi0x3;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+ float vfpacc2 = (float) vacc2 * vscale;
+ float vfpacc3 = (float) vacc3 * vscale;
+
+ vfpacc0 += vmagic_bias;
+ vfpacc1 += vmagic_bias;
+ vfpacc2 += vmagic_bias;
+ vfpacc3 += vmagic_bias;
+
+ int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0);
+ int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1);
+ int32_t vout2 = (int32_t) fp32_to_bits(vfpacc2);
+ int32_t vout3 = (int32_t) fp32_to_bits(vfpacc3);
+
+ vout0 = math_max_s32(vout0, vmagic_min);
+ vout1 = math_max_s32(vout1, vmagic_min);
+ vout2 = math_max_s32(vout2, vmagic_min);
+ vout3 = math_max_s32(vout3, vmagic_min);
+
+ vout0 = math_min_s32(vout0, vmagic_max);
+ vout1 = math_min_s32(vout1, vmagic_max);
+ vout2 = math_min_s32(vout2, vmagic_max);
+ vout3 = math_min_s32(vout3, vmagic_max);
+
+ vout0 -= vmagic_bias_less_zero_point;
+ vout1 -= vmagic_bias_less_zero_point;
+ vout2 -= vmagic_bias_less_zero_point;
+ vout3 -= vmagic_bias_less_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output[2] = (uint8_t) vout2;
+ output[3] = (uint8_t) vout3;
+ output += 4;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ int32_t vacc = *buffer++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc);
+ vout = math_max_s32(vout, vmagic_min);
+ vout = math_min_s32(vout, vmagic_max);
+ vout -= vmagic_bias_less_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
new file mode 100644
index 0000000..8b8717a
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
@@ -0,0 +1,155 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+
+ const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
+ int32_t* b = buffer;
+ size_t c = channels;
+ do {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ *b++ = vacc;
+ } while (--c != 0);
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ do {
+ int32_t vacc = *b;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ *b++ = vacc;
+ } while (--c != 0);
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float vscale = params->fp32_scalar_lrintf.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
+ const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
+ do {
+ int32_t vacc = *buffer++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ const int32_t vrndacc = (int32_t) lrintf(vfpacc);
+ int32_t vout = vrndacc + voutput_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
new file mode 100644
index 0000000..867ccad
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
@@ -0,0 +1,261 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+
+ const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ i0 += 2;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b += 2;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
+ int32_t vacc0 = b[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = b[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ i0 += 2;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b += 2;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float vscale = params->fp32_scalar_lrintf.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
+ const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
+ for (; channels >= 2; channels -= 2) {
+ int32_t vacc0 = buffer[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = buffer[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ buffer += 2;
+ i0 += 2;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+
+ vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+ vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+
+ vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+ vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+
+ const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
+ const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
+
+ int32_t vout0 = vrndacc0 + voutput_zero_point;
+ int32_t vout1 = vrndacc1 + voutput_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output += 2;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ int32_t vacc = *buffer;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ const int32_t vrndacc = (int32_t) lrintf(vfpacc);
+ int32_t vout = vrndacc + voutput_zero_point;
+
+ *output = (uint8_t) vout;
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
new file mode 100644
index 0000000..a87d252
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
@@ -0,0 +1,367 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+
+ const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ i0 += 4;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ int32_t vacc2 = vi0x2 + vinit_bias;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ int32_t vacc3 = vi0x3 + vinit_bias;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b[2] = vacc2;
+ b[3] = vacc3;
+ b += 4;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
+ int32_t vacc0 = b[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = b[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ int32_t vacc2 = b[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ int32_t vacc3 = b[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ i0 += 4;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ vacc2 += vi0x2;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ vacc3 += vi0x3;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ b[0] = vacc0;
+ b[1] = vacc1;
+ b[2] = vacc2;
+ b[3] = vacc3;
+ b += 4;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const float vscale = params->fp32_scalar_lrintf.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
+ const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
+ for (; channels >= 4; channels -= 4) {
+ int32_t vacc0 = buffer[0];
+ const int32_t vi0x0 = (int32_t) i0[0];
+ int32_t vacc1 = buffer[1];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ int32_t vacc2 = buffer[2];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ int32_t vacc3 = buffer[3];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ buffer += 4;
+ i0 += 4;
+
+ vacc0 += vi0x0;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ vacc1 += vi0x1;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ vacc2 += vi0x2;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ vacc3 += vi0x3;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+ float vfpacc2 = (float) vacc2 * vscale;
+ float vfpacc3 = (float) vacc3 * vscale;
+
+ vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+ vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+ vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
+ vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
+
+ vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+ vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+ vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
+ vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
+
+ const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
+ const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
+ const int32_t vrndacc2 = (int32_t) lrintf(vfpacc2);
+ const int32_t vrndacc3 = (int32_t) lrintf(vfpacc3);
+
+ int32_t vout0 = vrndacc0 + voutput_zero_point;
+ int32_t vout1 = vrndacc1 + voutput_zero_point;
+ int32_t vout2 = vrndacc2 + voutput_zero_point;
+ int32_t vout3 = vrndacc3 + voutput_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output[2] = (uint8_t) vout2;
+ output[3] = (uint8_t) vout3;
+ output += 4;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ int32_t vacc = *buffer++;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ const int32_t vrndacc = (int32_t) lrintf(vfpacc);
+ int32_t vout = vrndacc + voutput_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
new file mode 100644
index 0000000..d86d99b
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
@@ -0,0 +1,424 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-sse2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
+ const __m128i vzero = _mm_setzero_si128();
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 16)) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
+ i0 += 16;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero);
+ const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
+ i1 += 16;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero);
+ const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
+ i2 += 16;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero);
+ const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
+ i3 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero);
+ const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
+ i4 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero);
+ const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
+ i5 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero);
+ const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
+ i6 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ _mm_store_si128((__m128i*) (b + 8), vacc89AB);
+ _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
+ b += 16;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 16)) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
+ i0 += 16;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero);
+ const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
+ i1 += 16;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero);
+ const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
+ i2 += 16;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero);
+ const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
+ i3 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero);
+ const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
+ i4 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero);
+ const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
+ i5 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero);
+ const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
+ i6 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ _mm_store_si128((__m128i*) (b + 8), vacc89AB);
+ _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
+ b += 16;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ for (; channels >= 16; channels -= 16) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
+ i0 += 16;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero);
+ const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
+ i1 += 16;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero);
+ const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
+ i2 += 16;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero);
+ const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
+ i3 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero);
+ const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
+ i4 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero);
+ const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
+ i5 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero);
+ const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
+ i6 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
+ buffer += 16;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+ __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
+ __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+ vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
+ vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+ vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
+ vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+ vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
+ vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+
+ __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+
+ vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if XNN_LIKELY(channels >= 8) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
new file mode 100644
index 0000000..9d4dfb8
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
@@ -0,0 +1,618 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-sse2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
+ const __m128i vzero = _mm_setzero_si128();
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
+ const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
+ i0 += 24;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero);
+ const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
+ const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, vzero);
+ const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
+ i1 += 24;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero);
+ const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
+ const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, vzero);
+ const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
+ i2 += 24;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero);
+ const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
+ __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, vzero);
+ const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
+ i3 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero);
+ const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, vzero);
+ const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
+ i4 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero);
+ const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, vzero);
+ const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
+ i5 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero);
+ const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, vzero);
+ const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
+ i6 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vzero);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
+ vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ _mm_store_si128((__m128i*) (b + 8), vacc89AB);
+ _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
+ _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
+ _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
+ b += 24;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
+ const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
+ i0 += 24;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero);
+ const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
+ const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, vzero);
+ const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
+ i1 += 24;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero);
+ const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
+ const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, vzero);
+ const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
+ i2 += 24;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero);
+ const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
+ __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, vzero);
+ const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
+ i3 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero);
+ const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, vzero);
+ const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
+ i4 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero);
+ const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, vzero);
+ const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
+ i5 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero);
+ const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, vzero);
+ const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
+ i6 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vzero);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16)));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ _mm_store_si128((__m128i*) (b + 8), vacc89AB);
+ _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
+ _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
+ _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
+ b += 24;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ for (; channels >= 24; channels -= 24) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
+ const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
+ i0 += 24;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero);
+ const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
+ const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, vzero);
+ const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
+ i1 += 24;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero);
+ const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
+ const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, vzero);
+ const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
+ i2 += 24;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero);
+ const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
+ __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, vzero);
+ const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
+ i3 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero);
+ const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, vzero);
+ const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
+ i4 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero);
+ const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, vzero);
+ const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
+ i5 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero);
+ const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, vzero);
+ const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
+ i6 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vzero);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16)));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20)));
+ buffer += 24;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+ __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
+ __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
+ __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ);
+ __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+ vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
+ vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
+ vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale);
+ vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+ vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
+ vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
+ vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point);
+ vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+ vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
+ vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
+ vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ);
+ vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+ __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
+
+
+ __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+ __m128i voutGHIJKLMNGHIJKLMN = _mm_packus_epi16(voutGHIJKLMN, voutGHIJKLMN);
+
+ vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNGHIJKLMN = _mm_max_epu8(voutGHIJKLMNGHIJKLMN, voutput_min);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
+ output += 24;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if XNN_LIKELY(channels >= 8) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
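For reference, the generated 7p7x kernels in this change all follow the same three-phase structure: the first pass seeds a per-channel int32 buffer with seven input rows plus the init bias, each middle pass folds seven more rows into the buffer, and the final pass (one to seven rows, with the `zero` row substituted for missing inputs) applies the fp32 requantization. A minimal scalar sketch of that flow follows; it assumes round-to-nearest float conversion, and the helper name and its parameter list are illustrative rather than part of the XNNPACK API.

#include <math.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical per-channel restatement of what the vector kernels compute.
static void qu8_gavgpool_7p7x_reference(
    size_t rows,                  // > 7, as the kernels assert
    size_t channels,
    const uint8_t* input,         // rows x channels, row stride in bytes
    size_t input_stride,
    int32_t init_bias,            // params->fp32_sse*.init_bias
    float scale,                  // params->fp32_sse*.scale
    int32_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max,
    int32_t* buffer,              // channels int32 accumulators
    uint8_t* output)
{
  // First pass: rows 0..6 plus the init bias seed the accumulators.
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = init_bias;
    for (size_t r = 0; r < 7; r++) {
      acc += (int32_t) input[r * input_stride + c];
    }
    buffer[c] = acc;
  }
  // Middle passes: fold in seven more rows while more than seven remain.
  size_t row = 7;
  for (; rows - row > 7; row += 7) {
    for (size_t c = 0; c < channels; c++) {
      int32_t acc = buffer[c];
      for (size_t k = 0; k < 7; k++) {
        acc += (int32_t) input[(row + k) * input_stride + c];
      }
      buffer[c] = acc;
    }
  }
  // Last pass: 1..7 remaining rows, then the fp32 requantization:
  // scale, clamp to output_max - zero_point, round, add zero_point,
  // clamp to output_min.
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = buffer[c];
    for (size_t r = row; r < rows; r++) {
      acc += (int32_t) input[r * input_stride + c];
    }
    float fpacc = (float) acc * scale;
    const float fpmax = (float) ((int32_t) output_max - output_zero_point);
    if (fpacc > fpmax) {
      fpacc = fpmax;
    }
    int32_t out = (int32_t) lrintf(fpacc) + output_zero_point;
    if (out < (int32_t) output_min) {
      out = (int32_t) output_min;
    }
    output[c] = (uint8_t) out;
  }
}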
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
new file mode 100644
index 0000000..6986c1d
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
@@ -0,0 +1,331 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-sse2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
+ const __m128i vzero = _mm_setzero_si128();
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ for (; channels >= 8; channels -= 8) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ }
+ }
+ }
+}
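The SSE2 variants widen unsigned bytes with _mm_unpacklo_epi8(v, vzero) and the 16-bit row sums with _mm_unpacklo_epi16(v, vzero); plain zero-extension is valid because a sum of seven uint8 values is at most 7 * 255 = 1785, which is non-negative and fits comfortably in 16 bits. The QS8 amalgam hunks earlier in this change instead interleave with a computed sign mask. A small sketch of the two idioms, for comparison (helper names are illustrative):

#include <emmintrin.h>

// Zero-extend the low four uint16 lanes to int32 (QU8 case: the sums are
// non-negative, so interleaving with zero is exact).
static inline __m128i widen_lo_u16(__m128i vacc, __m128i vzero) {
  return _mm_unpacklo_epi16(vacc, vzero);
}

// Sign-extend the low four int16 lanes to int32 (QS8 case: interleave with a
// mask that is all-ones for negative lanes, as in the amalgam hunks above).
static inline __m128i widen_lo_s16(__m128i vacc) {
  const __m128i vsgnacc = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc);
  return _mm_unpacklo_epi16(vacc, vsgnacc);
}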
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
new file mode 100644
index 0000000..8f91fdd
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
@@ -0,0 +1,350 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-sse4.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 16)) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
+ i0 += 16;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
+ i1 += 16;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
+ i2 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
+ i3 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
+ i4 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
+ i5 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
+ i6 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ _mm_store_si128((__m128i*) (b + 8), vacc89AB);
+ _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
+ b += 16;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 16)) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
+ i0 += 16;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
+ i1 += 16;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
+ i2 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
+ i3 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
+ i4 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
+ i5 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
+ i6 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ _mm_store_si128((__m128i*) (b + 8), vacc89AB);
+ _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
+ b += 16;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
+ for (; channels >= 16; channels -= 16) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
+ i0 += 16;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
+ i1 += 16;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
+ i2 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
+ i3 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
+ i4 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
+ i5 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
+ i6 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
+ buffer += 16;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+ __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
+ __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+ vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
+ vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+ vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
+ vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+ vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
+ vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+ __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+
+ vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if XNN_LIKELY(channels >= 8) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
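The SSE4.1 variants replace the unpack-with-zero load idiom with _mm_cvtepu8_epi16 (pmovzxbw), which zero-extends eight bytes directly, and use _mm_cvtepu16_epi32 plus _mm_unpackhi_epi16 for the 16-to-32-bit split; the sub-8-channel tail likewise switches from _mm_cvtsi128_si32 plus shifts to _mm_extract_epi16 / _mm_extract_epi8. A minimal sketch of the SSE4.1 load-and-widen step (helper name is illustrative):

#include <smmintrin.h>
#include <stdint.h>

// Load 8 uint8 values and zero-extend them to 8 uint16 lanes in one step.
static inline __m128i load_u8x8_to_u16(const uint8_t* p) {
  return _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) p));
}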
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
new file mode 100644
index 0000000..8d5a9fe
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
@@ -0,0 +1,495 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-sse4.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
+ const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
+ i0 += 24;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
+ const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
+ i1 += 24;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
+ __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
+ i2 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
+ i3 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
+ i4 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
+ i5 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
+ i6 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
+ vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ _mm_store_si128((__m128i*) (b + 8), vacc89AB);
+ _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
+ _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
+ _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
+ b += 24;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
+ const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
+ i0 += 24;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
+ const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
+ i1 += 24;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
+ __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
+ i2 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
+ i3 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
+ i4 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
+ i5 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
+ i6 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16)));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ _mm_store_si128((__m128i*) (b + 8), vacc89AB);
+ _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
+ _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
+ _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
+ b += 24;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
+ for (; channels >= 24; channels -= 24) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
+ const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
+ i0 += 24;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
+ const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
+ i1 += 24;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
+ __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
+ i2 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
+ i3 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
+ i4 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
+ i5 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
+ i6 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16)));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20)));
+ buffer += 24;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+ __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
+ __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
+ __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ);
+ __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+ vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
+ vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
+ vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale);
+ vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+ vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
+ vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
+ vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point);
+ vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+ vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
+ vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
+ vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ);
+ vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+ __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
+
+ __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+ __m128i voutGHIJKLMNGHIJKLMN = _mm_packus_epi16(voutGHIJKLMN, voutGHIJKLMN);
+
+ vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNGHIJKLMN = _mm_max_epu8(voutGHIJKLMNGHIJKLMN, voutput_min);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
+ output += 24;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
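+      // Write a full group of 8 channels if one remains; otherwise store the 1-7 leftover channels in
+      // 4-, 2-, and 1-byte pieces, shifting the packed vector down after each partial store.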
+ if XNN_LIKELY(channels >= 8) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
new file mode 100644
index 0000000..f7fae60
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
@@ -0,0 +1,278 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-sse4.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
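+  // First pass: sum 7 rows per channel into the 32-bit buffer, adding the init_bias from the params once.
+  // doz() is a saturating (difference-or-zero) decrement, so a trailing group of fewer than 8 channels still
+  // runs one full iteration; the XNN_OOB_READS annotation permits the resulting out-of-bounds loads.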
+ for (; c != 0; c = doz(c, 8)) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
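+    // Widen the unsigned 16-bit row sums to 32 bits: _mm_cvtepu16_epi32 zero-extends the low half, and
+    // unpacking the high half against zero keeps it unsigned as well.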
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+ }
+
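+  // Intermediate passes: accumulate 7 more rows at a time on top of the 32-bit sums already in buffer.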
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
+
+ _mm_store_si128((__m128i*) b, vacc0123);
+ _mm_store_si128((__m128i*) (b + 4), vacc4567);
+ b += 8;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
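+  // Final pass: add the last 1-7 rows, then requantize in fp32: scale the accumulator, clamp to
+  // output_max - output_zero_point, round-convert to int32, add the zero point with signed saturation,
+  // pack to unsigned 8 bits, and apply the output_min clamp.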
+ for (; channels >= 8; channels -= 8) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
+ buffer += 8;
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ }
+ }
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
new file mode 100644
index 0000000..0825b54
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
@@ -0,0 +1,351 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+
+ const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 16)) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ i0 += 16;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ i1 += 16;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ i2 += 16;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ i3 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ i4 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ i5 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ i6 += 16;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ wasm_v128_store(b + 8, vacc89AB);
+ wasm_v128_store(b + 12, vaccCDEF);
+ b += 16;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 16)) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ i0 += 16;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ i1 += 16;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ i2 += 16;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ i3 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ i4 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ i5 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ i6 += 16;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+ v128_t vacc89AB = wasm_v128_load(b + 8);
+ v128_t vaccCDEF = wasm_v128_load(b + 12);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ wasm_v128_store(b + 8, vacc89AB);
+ wasm_v128_store(b + 12, vaccCDEF);
+ b += 16;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
+ const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
+ const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
+ const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
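+  // Final pass (fp32 requantization via the magic-bias trick): adding a large "magic" bias constant leaves the
+  // rounded integer in the low mantissa bits, the integer max() against magic_min doubles as the output_min
+  // clamp, and subtracting magic_bias_less_output_zero_point produces the zero-point-adjusted result.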
+ for (; channels >= 16; channels -= 16) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ i0 += 16;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ i1 += 16;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ i2 += 16;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ i3 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ i4 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ i5 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ i6 += 16;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ v128_t vacc89AB = wasm_v128_load(buffer + 8);
+ v128_t vaccCDEF = wasm_v128_load(buffer + 12);
+ buffer += 16;
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+ vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
+ vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+ vacc89AB = wasm_f32x4_mul(vacc89AB, vscale);
+ vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+ vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias);
+ vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+ vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min);
+ vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point);
+
+ v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF);
+
+ v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);
+
+ vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);
+
+ wasm_v128_store(output, vout0123456789ABCDEF);
+ output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ buffer += 8;
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+ if XNN_LIKELY(channels >= 8) {
+ *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
+ vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
new file mode 100644
index 0000000..4d444ed
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
@@ -0,0 +1,493 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16);
+ i0 += 24;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16);
+ i1 += 24;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16);
+ i2 += 24;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16);
+ i3 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16);
+ i4 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16);
+ i5 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16);
+ i6 += 24;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+ const v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN));
+ const v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ wasm_v128_store(b + 8, vacc89AB);
+ wasm_v128_store(b + 12, vaccCDEF);
+ wasm_v128_store(b + 16, vaccGHIJ);
+ wasm_v128_store(b + 20, vaccKLMN);
+ b += 24;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16);
+ i0 += 24;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16);
+ i1 += 24;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16);
+ i2 += 24;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16);
+ i3 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16);
+ i4 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16);
+ i5 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16);
+ i6 += 24;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+ v128_t vacc89AB = wasm_v128_load(b + 8);
+ v128_t vaccCDEF = wasm_v128_load(b + 12);
+ v128_t vaccGHIJ = wasm_v128_load(b + 16);
+ v128_t vaccKLMN = wasm_v128_load(b + 20);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+ vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN));
+ vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ wasm_v128_store(b + 8, vacc89AB);
+ wasm_v128_store(b + 12, vaccCDEF);
+ wasm_v128_store(b + 16, vaccGHIJ);
+ wasm_v128_store(b + 20, vaccKLMN);
+ b += 24;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
+ const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
+ const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
+ const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
+ for (; channels >= 24; channels -= 24) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16);
+ i0 += 24;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16);
+ i1 += 24;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16);
+ i2 += 24;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16);
+ i3 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16);
+ i4 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16);
+ i5 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16);
+ i6 += 24;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ v128_t vacc89AB = wasm_v128_load(buffer + 8);
+ v128_t vaccCDEF = wasm_v128_load(buffer + 12);
+ v128_t vaccGHIJ = wasm_v128_load(buffer + 16);
+ v128_t vaccKLMN = wasm_v128_load(buffer + 20);
+ buffer += 24;
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+ vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN));
+ vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+ vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
+ vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF);
+ vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ);
+ vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+ vacc89AB = wasm_f32x4_mul(vacc89AB, vscale);
+ vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale);
+ vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale);
+ vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+ vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias);
+ vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias);
+ vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias);
+ vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+ vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min);
+ vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min);
+ vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min);
+ vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point);
+ vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point);
+ vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point);
+
+ v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF);
+ v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN);
+
+ v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);
+ v128_t voutGHIJKLMNGHIJKLMN = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutGHIJKLMN);
+
+ vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNGHIJKLMN = wasm_u8x16_min(voutGHIJKLMNGHIJKLMN, voutput_max);
+
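+    // Store 24 output bytes as one full 16-byte store plus the low 8 bytes of the second packed vector.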
+ wasm_v128_store(output, vout0123456789ABCDEF);
+ *((double*) (output + 16)) = wasm_f64x2_extract_lane(voutGHIJKLMNGHIJKLMN, 0);
+ output += 24;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ buffer += 8;
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+ if XNN_LIKELY(channels >= 8) {
+ *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
+ vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
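
For reference, the 7p7x multipass kernels above batch the reduction in groups of up to seven rows: each group is summed in 16-bit lanes (at most 7 * 255 = 1785, so the u8-to-u16 widening adds cannot overflow), the first pass seeds the 32-bit buffer with init_bias, and subsequent passes accumulate into that buffer until the final pass requantizes. A minimal scalar sketch of one accumulation pass follows; the function and parameter names are illustrative, not part of the XNNPACK API.

// Scalar sketch of one 7p7x accumulation pass (illustrative only).
#include <stddef.h>
#include <stdint.h>

static void gavgpool_accumulate_pass_ref(
    size_t pass_rows,         // at most 7 rows per pass
    size_t channels,
    const uint8_t* input,     // pass_rows x channels, tightly packed
    int32_t* buffer)          // per-channel 32-bit accumulators (seeded with init_bias)
{
  for (size_t c = 0; c < channels; c++) {
    uint16_t sum = 0;         // cannot overflow: at most 7 * 255 = 1785
    for (size_t r = 0; r < pass_rows; r++) {
      sum += input[r * channels + c];
    }
    buffer[c] += (int32_t) sum;  // widen to 32 bits, as the SIMD kernels do
  }
}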
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
new file mode 100644
index 0000000..74363c4
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
@@ -0,0 +1,557 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 32; c -= 32) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16);
+ const v128_t vxi0xOPQRSTUV = wasm_u16x8_load8x8(i0 + 24);
+ i0 += 32;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16);
+ const v128_t vxi1xOPQRSTUV = wasm_u16x8_load8x8(i1 + 24);
+ i1 += 32;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16);
+ v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV);
+ const v128_t vxi2xOPQRSTUV = wasm_u16x8_load8x8(i2 + 24);
+ i2 += 32;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV);
+ const v128_t vxi3xOPQRSTUV = wasm_u16x8_load8x8(i3 + 24);
+ i3 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV);
+ const v128_t vxi4xOPQRSTUV = wasm_u16x8_load8x8(i4 + 24);
+ i4 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV);
+ const v128_t vxi5xOPQRSTUV = wasm_u16x8_load8x8(i5 + 24);
+ i5 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV);
+ const v128_t vxi6xOPQRSTUV = wasm_u16x8_load8x8(i6 + 24);
+ i6 += 32;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV);
+
+ const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+ const v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN));
+ const v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN));
+ const v128_t vaccOPQR = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccOPQRSTUV));
+ const v128_t vaccSTUV = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccOPQRSTUV));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ wasm_v128_store(b + 8, vacc89AB);
+ wasm_v128_store(b + 12, vaccCDEF);
+ wasm_v128_store(b + 16, vaccGHIJ);
+ wasm_v128_store(b + 20, vaccKLMN);
+ wasm_v128_store(b + 24, vaccOPQR);
+ wasm_v128_store(b + 28, vaccSTUV);
+ b += 32;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c >= 32; c -= 32) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16);
+ const v128_t vxi0xOPQRSTUV = wasm_u16x8_load8x8(i0 + 24);
+ i0 += 32;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16);
+ const v128_t vxi1xOPQRSTUV = wasm_u16x8_load8x8(i1 + 24);
+ i1 += 32;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16);
+ v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV);
+ const v128_t vxi2xOPQRSTUV = wasm_u16x8_load8x8(i2 + 24);
+ i2 += 32;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV);
+ const v128_t vxi3xOPQRSTUV = wasm_u16x8_load8x8(i3 + 24);
+ i3 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV);
+ const v128_t vxi4xOPQRSTUV = wasm_u16x8_load8x8(i4 + 24);
+ i4 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV);
+ const v128_t vxi5xOPQRSTUV = wasm_u16x8_load8x8(i5 + 24);
+ i5 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV);
+ const v128_t vxi6xOPQRSTUV = wasm_u16x8_load8x8(i6 + 24);
+ i6 += 32;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV);
+
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+ v128_t vacc89AB = wasm_v128_load(b + 8);
+ v128_t vaccCDEF = wasm_v128_load(b + 12);
+ v128_t vaccGHIJ = wasm_v128_load(b + 16);
+ v128_t vaccKLMN = wasm_v128_load(b + 20);
+ v128_t vaccOPQR = wasm_v128_load(b + 24);
+ v128_t vaccSTUV = wasm_v128_load(b + 28);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+ vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN));
+ vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN));
+ vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_u32x4_extend_low_u16x8(vaccOPQRSTUV));
+ vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_u32x4_extend_high_u16x8(vaccOPQRSTUV));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ wasm_v128_store(b + 8, vacc89AB);
+ wasm_v128_store(b + 12, vaccCDEF);
+ wasm_v128_store(b + 16, vaccGHIJ);
+ wasm_v128_store(b + 20, vaccKLMN);
+ wasm_v128_store(b + 24, vaccOPQR);
+ wasm_v128_store(b + 28, vaccSTUV);
+ b += 32;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
+ const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
+ const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
+ const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
+ for (; channels >= 32; channels -= 32) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16);
+ const v128_t vxi0xOPQRSTUV = wasm_u16x8_load8x8(i0 + 24);
+ i0 += 32;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16);
+ const v128_t vxi1xOPQRSTUV = wasm_u16x8_load8x8(i1 + 24);
+ i1 += 32;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16);
+ v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV);
+ const v128_t vxi2xOPQRSTUV = wasm_u16x8_load8x8(i2 + 24);
+ i2 += 32;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV);
+ const v128_t vxi3xOPQRSTUV = wasm_u16x8_load8x8(i3 + 24);
+ i3 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV);
+ const v128_t vxi4xOPQRSTUV = wasm_u16x8_load8x8(i4 + 24);
+ i4 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV);
+ const v128_t vxi5xOPQRSTUV = wasm_u16x8_load8x8(i5 + 24);
+ i5 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV);
+ const v128_t vxi6xOPQRSTUV = wasm_u16x8_load8x8(i6 + 24);
+ i6 += 32;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV);
+
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ v128_t vacc89AB = wasm_v128_load(buffer + 8);
+ v128_t vaccCDEF = wasm_v128_load(buffer + 12);
+ v128_t vaccGHIJ = wasm_v128_load(buffer + 16);
+ v128_t vaccKLMN = wasm_v128_load(buffer + 20);
+ v128_t vaccOPQR = wasm_v128_load(buffer + 24);
+ v128_t vaccSTUV = wasm_v128_load(buffer + 28);
+ buffer += 32;
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+ vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN));
+ vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN));
+ vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_u32x4_extend_low_u16x8(vaccOPQRSTUV));
+ vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_u32x4_extend_high_u16x8(vaccOPQRSTUV));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+ vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
+ vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF);
+ vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ);
+ vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN);
+ vaccOPQR = wasm_f32x4_convert_i32x4(vaccOPQR);
+ vaccSTUV = wasm_f32x4_convert_i32x4(vaccSTUV);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+ vacc89AB = wasm_f32x4_mul(vacc89AB, vscale);
+ vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale);
+ vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale);
+ vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale);
+ vaccOPQR = wasm_f32x4_mul(vaccOPQR, vscale);
+ vaccSTUV = wasm_f32x4_mul(vaccSTUV, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+ vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias);
+ vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias);
+ vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias);
+ vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias);
+ vaccOPQR = wasm_f32x4_add(vaccOPQR, vmagic_bias);
+ vaccSTUV = wasm_f32x4_add(vaccSTUV, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+ vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min);
+ vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min);
+ vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min);
+ vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min);
+ vaccOPQR = wasm_i32x4_max(vaccOPQR, vmagic_min);
+ vaccSTUV = wasm_i32x4_max(vaccSTUV, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point);
+ vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point);
+ vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point);
+ vaccOPQR = wasm_i32x4_sub(vaccOPQR, vmagic_bias_less_output_zero_point);
+ vaccSTUV = wasm_i32x4_sub(vaccSTUV, vmagic_bias_less_output_zero_point);
+
+ v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF);
+ v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN);
+ v128_t voutOPQRSTUV = wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV);
+
+ v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);
+ v128_t voutGHIJKLMNOPQRSTUV = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV);
+
+ vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNOPQRSTUV = wasm_u8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max);
+
+ wasm_v128_store(output, vout0123456789ABCDEF);
+ wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV);
+ output += 32;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ buffer += 8;
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+ if XNN_LIKELY(channels >= 8) {
+ *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
+ vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
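
The final pass in these fp32 kernels requantizes with the "magic bias" trick: the scaled accumulator is added to 1.5 * 2^23 so the low mantissa bits hold the rounded integer, the float bits are reinterpreted as int32, and magic_bias_less_output_zero_point is subtracted to recover the quantized value with the output zero point applied. Below is a hedged scalar sketch of that path, assuming the scaled value stays in the range where the trick is exact; the clamps that the kernels fold into magic_min and output_max are written as plain comparisons, and the helper name is illustrative.

// Scalar sketch of the fp32 "magic bias" requantization (illustrative only).
#include <stdint.h>
#include <string.h>

static uint8_t requantize_magic_ref(int32_t acc, float scale, uint8_t zero_point)
{
  const float magic_bias = 12582912.0f;                        // 1.5 * 2^23, bits 0x4B400000
  const int32_t magic_bias_less_zero_point =
      (int32_t) UINT32_C(0x4B400000) - (int32_t) zero_point;

  float fpacc = (float) acc * scale;
  fpacc += magic_bias;                  // low mantissa bits now hold round(acc * scale)

  int32_t bits;
  memcpy(&bits, &fpacc, sizeof(bits));  // reinterpret float bits as int32
  int32_t out = bits - magic_bias_less_zero_point;

  // Explicit clamps; the kernels fold the lower bound into the magic_min max
  // and the upper bound into the output_max min.
  if (out < 0) out = 0;
  if (out > 255) out = 255;
  return (uint8_t) out;
}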
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
new file mode 100644
index 0000000..a8aa3d9
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
@@ -0,0 +1,279 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ int32_t* buffer,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+
+ const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ b += 8;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+
+ int32_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_v128_load(b);
+ v128_t vacc4567 = wasm_v128_load(b + 4);
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ wasm_v128_store(b, vacc0123);
+ wasm_v128_store(b + 4, vacc4567);
+ b += 8;
+ }
+ }
+
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
+ const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
+ const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
+ const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
+ for (; channels >= 8; channels -= 8) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ buffer += 8;
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+ *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+ output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_v128_load(buffer);
+ v128_t vacc4567 = wasm_v128_load(buffer + 4);
+ buffer += 8;
+
+ vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+ if (channels & 4) {
+ *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
+ vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ }
+ }
+ }
+}
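
The channel tail above stores the remaining (fewer than 8) outputs by peeling 4-, 2- and 1-byte pieces off the packed result while shifting the rest down, relying on WASM's little-endian lane layout. A scalar equivalent of that store sequence, illustrative only:

// Scalar sketch of the partial-lane tail store (illustrative only).
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void store_tail_ref(uint8_t* output, uint64_t packed, size_t channels /* < 8 */)
{
  if (channels & 4) {
    uint32_t lo = (uint32_t) packed;
    memcpy(output, &lo, 4);
    packed >>= 32;                 // matches wasm_u64x2_shr(v, 32)
    output += 4;
  }
  if (channels & 2) {
    uint16_t lo = (uint16_t) packed;
    memcpy(output, &lo, 2);
    packed >>= 16;
    output += 2;
  }
  if (channels & 1) {
    *output = (uint8_t) packed;
  }
}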
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c16.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c16.c
new file mode 100644
index 0000000..6dac4e9
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c16.c
@@ -0,0 +1,199 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+ const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+ const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
+ for (; channels >= 16; channels -= 16) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+ vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias));
+ vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ #endif // !XNN_ARCH_ARM64
+
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
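
The unipass NEON kernels above sum the seven input rows with widening adds: vaddl_u8/vaddw_u8 keep the running sum in 16-bit lanes (again bounded by 7 * 255 = 1785), and vaddw_u16 then widens onto the 32-bit init bias. A minimal sketch of that pattern for the low four lanes, assuming a NEON-capable toolchain; the helper name is illustrative.

// NEON sketch of the 7-row widening-add reduction (illustrative only).
#include <arm_neon.h>
#include <stdint.h>

static int32x4_t sum7_low_ref(const uint8_t* rows[7], int32_t init_bias)
{
  uint16x8_t vsum = vaddl_u8(vld1_u8(rows[0]), vld1_u8(rows[1]));  // u8 + u8 -> u16
  vsum = vaddw_u8(vsum, vld1_u8(rows[2]));
  vsum = vaddw_u8(vsum, vld1_u8(rows[3]));
  vsum = vaddw_u8(vsum, vld1_u8(rows[4]));
  vsum = vaddw_u8(vsum, vld1_u8(rows[5]));
  vsum = vaddw_u8(vsum, vld1_u8(rows[6]));
  // Widen the low four 16-bit sums onto the 32-bit init bias.
  const int32x4_t vbias = vdupq_n_s32(init_bias);
  return vreinterpretq_s32_u32(
      vaddw_u16(vreinterpretq_u32_s32(vbias), vget_low_u16(vsum)));
}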
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c24.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c24.c
new file mode 100644
index 0000000..29a45cb
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c24.c
@@ -0,0 +1,229 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+ const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+ const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
+ for (; channels >= 24; channels -= 24) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+ int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
+ int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+ float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
+ float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+ vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
+ vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+ vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias));
+ vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias));
+ vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias));
+ vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
+ vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point);
+ vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ #endif // !XNN_ARCH_ARM64
+
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min));
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max));
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ vst1_u8(output, voutGHIJKLMN); output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
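
Note that the NEON variants clamp with both output_min (vmaxq_u8/vmax_u8) and output_max (vminq_u8/vmin_u8) after the saturating narrow, whereas the wasmsimd variants only apply output_max because their lower bound is already enforced by the magic_min step. A tiny scalar sketch of the NEON-side clamp, illustrative only:

// Scalar sketch of saturating narrow plus min/max clamp (illustrative only).
#include <stdint.h>

static uint8_t clamp_u8_ref(int16_t v, uint8_t output_min, uint8_t output_max)
{
  // Saturating narrow int16 -> uint8 (analogue of vqmovun_s16).
  uint8_t out = v < 0 ? 0 : (v > 255 ? 255 : (uint8_t) v);
  if (out < output_min) out = output_min;   // vmaxq_u8 / vmax_u8
  if (out > output_max) out = output_max;   // vminq_u8 / vmin_u8
  return out;
}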
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c32.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c32.c
new file mode 100644
index 0000000..315674d
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c32.c
@@ -0,0 +1,254 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+ const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+ const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
+ for (; channels >= 32; channels -= 32) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+ int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
+ int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));
+ int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV)));
+ int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+ float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
+ float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);
+ float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR);
+ float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+ vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
+ vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);
+ vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale);
+ vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+ vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias));
+ vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias));
+ vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias));
+ vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias));
+ vaccOPQR = vreinterpretq_s32_f32(vaddq_f32(vfpaccOPQR, vmagic_bias));
+ vaccSTUV = vreinterpretq_s32_f32(vaddq_f32(vfpaccSTUV, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
+ vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point);
+ vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point);
+ vaccOPQR = vqsubq_s32(vaccOPQR, vmagic_bias_less_output_zero_point);
+ vaccSTUV = vqsubq_s32(vaccSTUV, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
+ #endif // !XNN_ARCH_ARM64
+
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV));
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
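
For reference, the fp32_neon variants requantize through the float "magic bias" bit trick rather than a rounding conversion: the biased integer sum is scaled in float, a magic constant is added so the rounded value lands in the low mantissa bits, and a saturating subtract of a pre-combined constant recovers the result while folding in the output zero point. A reduced sketch of that step for one 4-lane vector (names are illustrative only, not part of the generated files):

#include <arm_neon.h>

// One-vector sketch of the magic-bias requantization used by the kernels above.
static inline int32x4_t requantize_magic_bias(
    int32x4_t vacc,                                 // biased integer sum
    float32x4_t vscale,                             // averaging scale from params
    float32x4_t vmagic_bias,                        // magic constant from params
    int32x4_t vmagic_bias_less_output_zero_point)   // pre-combined constant from params
{
  const float32x4_t vfpacc = vmulq_f32(vcvtq_f32_s32(vacc), vscale);
  // Reinterpret the float bits after adding the magic bias; the rounded value
  // sits in the low mantissa bits, so a saturating subtract recovers it and
  // folds in the output zero point at the same time.
  const int32x4_t vbits = vreinterpretq_s32_f32(vaddq_f32(vfpacc, vmagic_bias));
  return vqsubq_s32(vbits, vmagic_bias_less_output_zero_point);
}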
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c8.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c8.c
new file mode 100644
index 0000000..c368d91
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-neon-c8.c
@@ -0,0 +1,168 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+ const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+ const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+ const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min);
+ const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max);
+ for (; channels >= 8; channels -= 8) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif // !XNN_ARCH_ARM64
+
+
+ #if XNN_ARCH_ARM64
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ #else // !XNN_ARCH_ARM64
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ #endif // !XNN_ARCH_ARM64
+
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
+
+ vst1_u8(output, vout01234567); output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
+ vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
+
+ vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
+
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0);
+ }
+ }
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c16.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c16.c
new file mode 100644
index 0000000..d1cd69a
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c16.c
@@ -0,0 +1,194 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+ for (; channels >= 16; channels -= 16) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+ vacc89AB = vcvtnq_s32_f32(vfpacc89AB);
+ vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ #endif // !XNN_ARCH_ARM64
+
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+ vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c24.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c24.c
new file mode 100644
index 0000000..8131f37
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c24.c
@@ -0,0 +1,223 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+ for (; channels >= 24; channels -= 24) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+ int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
+ int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+ float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
+ float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+ vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
+ vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+ vacc89AB = vcvtnq_s32_f32(vfpacc89AB);
+ vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
+ vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ);
+ vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ #endif // !XNN_ARCH_ARM64
+
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+ vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
+ vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point);
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min));
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max));
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ vst1_u8(output, voutGHIJKLMN); output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c32.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c32.c
new file mode 100644
index 0000000..39baba5
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c32.c
@@ -0,0 +1,247 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+ for (; channels >= 32; channels -= 32) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+ const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
+ const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
+ const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
+ const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
+ const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
+ const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
+ const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
+ const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
+ const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
+ const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8;
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+ vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
+ vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
+ vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+ int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
+ int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
+ int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
+ int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));
+ int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV)));
+ int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+ float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
+ float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
+ float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
+ float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);
+ float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR);
+ float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+ vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
+ vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
+ vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
+ vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);
+ vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale);
+ vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+ vacc89AB = vcvtnq_s32_f32(vfpacc89AB);
+ vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
+ vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ);
+ vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN);
+ vaccOPQR = vcvtnq_s32_f32(vfpaccOPQR);
+ vaccSTUV = vcvtnq_s32_f32(vfpaccSTUV);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
+ int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
+ int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
+ int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
+ int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
+ #endif // !XNN_ARCH_ARM64
+
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+ vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
+ vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point);
+ vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point);
+
+ #if XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
+ uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV);
+ #else // !XNN_ARCH_ARM64
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+ uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV));
+ #endif // !XNN_ARCH_ARM64
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1_u8(output, vout01234567); output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0); output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c8.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c8.c
new file mode 100644
index 0000000..11e4af2
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-neonv8-c8.c
@@ -0,0 +1,164 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+ const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neonv8.output_max);
+ for (; channels >= 8; channels -= 8) {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else // !XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif // !XNN_ARCH_ARM64
+
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ #if XNN_ARCH_ARM64
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ #else // !XNN_ARCH_ARM64
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ #endif // !XNN_ARCH_ARM64
+
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
+
+ vst1_u8(output, vout01234567); output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
+ const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
+ const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
+ uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
+
+ const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
+ const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
+ const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
+ const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
+ vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
+ vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
+
+ int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
+ int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
+
+ float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
+ float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
+
+ vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
+ vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
+
+ vacc0123 = vcvtnq_s32_f32(vfpacc0123);
+ vacc4567 = vcvtnq_s32_f32(vfpacc4567);
+
+ #if XNN_ARCH_ARM64
+ int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
+ #else
+ int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
+ #endif
+ vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
+
+ if (channels & 4) {
+ vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (channels & 2) {
+ vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_u8(output, vout01234567, 0);
+ }
+ }
+ }
+}
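
The fp32_neonv8 variants above differ from the plain NEON ones only in the rounding step: instead of the magic-bias bit trick they use the ARMv8 round-to-nearest-even conversion, then add the output zero point with a saturating int16 add before narrowing. A reduced sketch for one 8-channel group (names are illustrative only, not part of the generated files):

#include <arm_neon.h>

// One-group sketch of the NEONv8 requantization used by the kernels above.
static inline uint8x8_t requantize_neonv8(
    int32x4_t vacc_lo, int32x4_t vacc_hi,   // biased integer sums
    float32x4_t vscale,                     // averaging scale from params
    int16x8_t voutput_zero_point)
{
  // Round-to-nearest-even conversion back to int32 after scaling in float.
  const int32x4_t vlo = vcvtnq_s32_f32(vmulq_f32(vcvtq_f32_s32(vacc_lo), vscale));
  const int32x4_t vhi = vcvtnq_s32_f32(vmulq_f32(vcvtq_f32_s32(vacc_hi), vscale));
  int16x8_t vacc16 = vcombine_s16(vqmovn_s32(vlo), vqmovn_s32(vhi));
  vacc16 = vqaddq_s16(vacc16, voutput_zero_point);
  return vqmovun_s16(vacc16);  // saturating narrow to unsigned 8-bit
}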
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c1.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c1.c
new file mode 100644
index 0000000..c9dbcb4
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c1.c
@@ -0,0 +1,90 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
+ const float vscale = params->fp32_scalar_fmagic.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
+ const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
+ const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
+ do {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc) - vmagic_bias_less_output_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+}
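
The scalar "fmagic" variant clamps in float before applying the same magic-bias bit trick, which removes the need for a separate integer clamp. A stand-alone sketch of the per-channel requantization above, using memcpy in place of fp32_to_bits (the helper name and signature are illustrative only):

#include <stdint.h>
#include <string.h>

// Per-channel sketch of the scalar fmagic requantization used above.
static inline uint8_t requantize_fmagic(int32_t vacc, float vscale,
                                        float voutput_min_less_zero_point,
                                        float voutput_max_less_zero_point,
                                        float vmagic_bias,
                                        int32_t vmagic_bias_less_output_zero_point)
{
  float vfpacc = (float) vacc * vscale;
  if (vfpacc < voutput_min_less_zero_point) vfpacc = voutput_min_less_zero_point;  // math_max_f32
  if (vfpacc > voutput_max_less_zero_point) vfpacc = voutput_max_less_zero_point;  // math_min_f32
  vfpacc += vmagic_bias;
  int32_t vbits;
  memcpy(&vbits, &vfpacc, sizeof(vbits));  // fp32_to_bits()
  return (uint8_t) (vbits - vmagic_bias_less_output_zero_point);
}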
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c2.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c2.c
new file mode 100644
index 0000000..b58636c
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c2.c
@@ -0,0 +1,149 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
+ const float vscale = params->fp32_scalar_fmagic.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
+ const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
+ const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
+ for (; channels >= 2; channels -= 2) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ i0 += 2;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+
+ vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+ vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+
+ vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+ vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+
+ vfpacc0 += vmagic_bias;
+ vfpacc1 += vmagic_bias;
+
+ int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0) - vmagic_bias_less_output_zero_point;
+ int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1) - vmagic_bias_less_output_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output += 2;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc) - vmagic_bias_less_output_zero_point;
+
+ *output = (uint8_t) vout;
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c4.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c4.c
new file mode 100644
index 0000000..c39ed06
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-fmagic-c4.c
@@ -0,0 +1,191 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
+ const float vscale = params->fp32_scalar_fmagic.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
+ const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
+ const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
+ for (; channels >= 4; channels -= 4) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ i0 += 4;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ int32_t vacc2 = vi0x2 + vinit_bias;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ int32_t vacc3 = vi0x3 + vinit_bias;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+ float vfpacc2 = (float) vacc2 * vscale;
+ float vfpacc3 = (float) vacc3 * vscale;
+
+ vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+ vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+ vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
+ vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
+
+ vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+ vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+ vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
+ vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
+
+ vfpacc0 += vmagic_bias;
+ vfpacc1 += vmagic_bias;
+ vfpacc2 += vmagic_bias;
+ vfpacc3 += vmagic_bias;
+
+ int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0) - vmagic_bias_less_output_zero_point;
+ int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1) - vmagic_bias_less_output_zero_point;
+ int32_t vout2 = (int32_t) fp32_to_bits(vfpacc2) - vmagic_bias_less_output_zero_point;
+ int32_t vout3 = (int32_t) fp32_to_bits(vfpacc3) - vmagic_bias_less_output_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output[2] = (uint8_t) vout2;
+ output[3] = (uint8_t) vout3;
+ output += 4;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc) - vmagic_bias_less_output_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c
new file mode 100644
index 0000000..1d90d2a
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
+ const float vscale = params->fp32_scalar_imagic.scale;
+ const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+ const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+ const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+ const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+ do {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc);
+ vout = math_max_s32(vout, vmagic_min);
+ vout = math_min_s32(vout, vmagic_max);
+ vout -= vmagic_bias_less_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+}
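
The imagic kernels differ from the fmagic ones only in where the clamp happens: the magic bias is added first, and the min/max clamp is applied to the reinterpreted integer bits against pre-biased bounds. Because the biased values stay in [2^23, 2^24), where float spacing is exactly 1 and bit patterns are monotone, the integer clamp is equivalent to the float clamp. A standalone sketch with the quantization parameters assumed for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Local stand-in for fp32_to_bits(), reinterpreting a float as int32.
static int32_t float_bits(float f) {
  int32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return bits;
}

int main(void) {
  const float vmagic_bias = 0x1.8p+23f;
  const int32_t output_zero_point = 128;  // assumed example value
  const int32_t qmin = 0, qmax = 255;     // uint8 output range

  // Clamp bounds pre-biased into the integer domain, plus the final correction.
  const int32_t vmagic_min = float_bits((float) (qmin - output_zero_point) + vmagic_bias);
  const int32_t vmagic_max = float_bits((float) (qmax - output_zero_point) + vmagic_bias);
  const int32_t vmagic_bias_less_zero_point = float_bits(vmagic_bias) - output_zero_point;

  float vfpacc = 300.25f;                 // deliberately above the representable maximum
  vfpacc += vmagic_bias;
  int32_t vout = float_bits(vfpacc);
  vout = vout < vmagic_min ? vmagic_min : vout;
  vout = vout > vmagic_max ? vmagic_max : vout;
  vout -= vmagic_bias_less_zero_point;
  printf("%d\n", vout);                   // clamps to 255
  return 0;
}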
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c2.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c2.c
new file mode 100644
index 0000000..b79838a
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c2.c
@@ -0,0 +1,153 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
+ const float vscale = params->fp32_scalar_imagic.scale;
+ const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+ const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+ const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+ const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+ for (; channels >= 2; channels -= 2) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ i0 += 2;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+
+ vfpacc0 += vmagic_bias;
+ vfpacc1 += vmagic_bias;
+
+ int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0);
+ int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1);
+
+ vout0 = math_max_s32(vout0, vmagic_min);
+ vout1 = math_max_s32(vout1, vmagic_min);
+
+ vout0 = math_min_s32(vout0, vmagic_max);
+ vout1 = math_min_s32(vout1, vmagic_max);
+
+ vout0 -= vmagic_bias_less_zero_point;
+ vout1 -= vmagic_bias_less_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output += 2;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc);
+ vout = math_max_s32(vout, vmagic_min);
+ vout = math_min_s32(vout, vmagic_max);
+ vout -= vmagic_bias_less_zero_point;
+
+ *output = (uint8_t) vout;
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c4.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c4.c
new file mode 100644
index 0000000..ae81e15
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c4.c
@@ -0,0 +1,197 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <fp16.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
+ const float vscale = params->fp32_scalar_imagic.scale;
+ const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+ const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+ const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+ const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+ for (; channels >= 4; channels -= 4) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ i0 += 4;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ int32_t vacc2 = vi0x2 + vinit_bias;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ int32_t vacc3 = vi0x3 + vinit_bias;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+ float vfpacc2 = (float) vacc2 * vscale;
+ float vfpacc3 = (float) vacc3 * vscale;
+
+ vfpacc0 += vmagic_bias;
+ vfpacc1 += vmagic_bias;
+ vfpacc2 += vmagic_bias;
+ vfpacc3 += vmagic_bias;
+
+ int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0);
+ int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1);
+ int32_t vout2 = (int32_t) fp32_to_bits(vfpacc2);
+ int32_t vout3 = (int32_t) fp32_to_bits(vfpacc3);
+
+ vout0 = math_max_s32(vout0, vmagic_min);
+ vout1 = math_max_s32(vout1, vmagic_min);
+ vout2 = math_max_s32(vout2, vmagic_min);
+ vout3 = math_max_s32(vout3, vmagic_min);
+
+ vout0 = math_min_s32(vout0, vmagic_max);
+ vout1 = math_min_s32(vout1, vmagic_max);
+ vout2 = math_min_s32(vout2, vmagic_max);
+ vout3 = math_min_s32(vout3, vmagic_max);
+
+ vout0 -= vmagic_bias_less_zero_point;
+ vout1 -= vmagic_bias_less_zero_point;
+ vout2 -= vmagic_bias_less_zero_point;
+ vout3 -= vmagic_bias_less_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output[2] = (uint8_t) vout2;
+ output[3] = (uint8_t) vout3;
+ output += 4;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc += vmagic_bias;
+ int32_t vout = (int32_t) fp32_to_bits(vfpacc);
+ vout = math_max_s32(vout, vmagic_min);
+ vout = math_min_s32(vout, vmagic_max);
+ vout -= vmagic_bias_less_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c1.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c1.c
new file mode 100644
index 0000000..b91e3a6
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c1.c
@@ -0,0 +1,88 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
+ const float vscale = params->fp32_scalar_lrintf.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
+ const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
+ do {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ const int32_t vrndacc = (int32_t) lrintf(vfpacc);
+ int32_t vout = vrndacc + voutput_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+}
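
The lrintf kernels take the portable route: clamp the scaled accumulator in floating point, round with lrintf() under the default round-to-nearest mode, then add the output zero point back before narrowing to uint8. A minimal sketch with assumed parameter values (the kernels use the math_max_f32/math_min_f32 helpers; fmaxf/fminf are used here to keep it self-contained):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const float vscale = 1.0f / 7.0f;        // averaging 7 rows; assumed example value
  const int32_t voutput_zero_point = 128;  // assumed example value
  const float voutput_min_less_zero_point = -128.0f;
  const float voutput_max_less_zero_point = 127.0f;

  const int32_t vacc = 500;                // accumulated sum after the init bias
  float vfpacc = (float) vacc * vscale;    // ~71.43
  vfpacc = fmaxf(vfpacc, voutput_min_less_zero_point);
  vfpacc = fminf(vfpacc, voutput_max_less_zero_point);
  const int32_t vrndacc = (int32_t) lrintf(vfpacc);
  const uint8_t vout = (uint8_t) (vrndacc + voutput_zero_point);
  printf("%d\n", (int) vout);              // 71 + 128 = 199
  return 0;
}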
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c2.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c2.c
new file mode 100644
index 0000000..de54d5c
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c2.c
@@ -0,0 +1,147 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
+ const float vscale = params->fp32_scalar_lrintf.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
+ const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
+ for (; channels >= 2; channels -= 2) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ i0 += 2;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ i1 += 2;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ i2 += 2;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ i3 += 2;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ i4 += 2;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ i5 += 2;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ i6 += 2;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+
+ vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+ vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+
+ vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+ vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+
+ const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
+ const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
+
+ int32_t vout0 = vrndacc0 + voutput_zero_point;
+ int32_t vout1 = vrndacc1 + voutput_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output += 2;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0;
+ const int32_t vi1 = (int32_t) *i1;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ const int32_t vrndacc = (int32_t) lrintf(vfpacc);
+ int32_t vout = vrndacc + voutput_zero_point;
+
+ *output = (uint8_t) vout;
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c4.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c4.c
new file mode 100644
index 0000000..ae99f23
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-scalar-lrintf-c4.c
@@ -0,0 +1,189 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-scalar.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
+ const float vscale = params->fp32_scalar_lrintf.scale;
+ const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
+ const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
+ const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
+ for (; channels >= 4; channels -= 4) {
+ const int32_t vi0x0 = (int32_t) i0[0];
+ const int32_t vi0x1 = (int32_t) i0[1];
+ const int32_t vi0x2 = (int32_t) i0[2];
+ const int32_t vi0x3 = (int32_t) i0[3];
+ i0 += 4;
+
+ int32_t vacc0 = vi0x0 + vinit_bias;
+ const int32_t vi1x0 = (int32_t) i1[0];
+ int32_t vacc1 = vi0x1 + vinit_bias;
+ const int32_t vi1x1 = (int32_t) i1[1];
+ int32_t vacc2 = vi0x2 + vinit_bias;
+ const int32_t vi1x2 = (int32_t) i1[2];
+ int32_t vacc3 = vi0x3 + vinit_bias;
+ const int32_t vi1x3 = (int32_t) i1[3];
+ i1 += 4;
+
+ vacc0 += vi1x0;
+ const int32_t vi2x0 = (int32_t) i2[0];
+ vacc1 += vi1x1;
+ const int32_t vi2x1 = (int32_t) i2[1];
+ vacc2 += vi1x2;
+ const int32_t vi2x2 = (int32_t) i2[2];
+ vacc3 += vi1x3;
+ const int32_t vi2x3 = (int32_t) i2[3];
+ i2 += 4;
+ vacc0 += vi2x0;
+ const int32_t vi3x0 = (int32_t) i3[0];
+ vacc1 += vi2x1;
+ const int32_t vi3x1 = (int32_t) i3[1];
+ vacc2 += vi2x2;
+ const int32_t vi3x2 = (int32_t) i3[2];
+ vacc3 += vi2x3;
+ const int32_t vi3x3 = (int32_t) i3[3];
+ i3 += 4;
+ vacc0 += vi3x0;
+ const int32_t vi4x0 = (int32_t) i4[0];
+ vacc1 += vi3x1;
+ const int32_t vi4x1 = (int32_t) i4[1];
+ vacc2 += vi3x2;
+ const int32_t vi4x2 = (int32_t) i4[2];
+ vacc3 += vi3x3;
+ const int32_t vi4x3 = (int32_t) i4[3];
+ i4 += 4;
+ vacc0 += vi4x0;
+ const int32_t vi5x0 = (int32_t) i5[0];
+ vacc1 += vi4x1;
+ const int32_t vi5x1 = (int32_t) i5[1];
+ vacc2 += vi4x2;
+ const int32_t vi5x2 = (int32_t) i5[2];
+ vacc3 += vi4x3;
+ const int32_t vi5x3 = (int32_t) i5[3];
+ i5 += 4;
+ vacc0 += vi5x0;
+ const int32_t vi6x0 = (int32_t) i6[0];
+ vacc1 += vi5x1;
+ const int32_t vi6x1 = (int32_t) i6[1];
+ vacc2 += vi5x2;
+ const int32_t vi6x2 = (int32_t) i6[2];
+ vacc3 += vi5x3;
+ const int32_t vi6x3 = (int32_t) i6[3];
+ i6 += 4;
+
+ vacc0 += vi6x0;
+ vacc1 += vi6x1;
+ vacc2 += vi6x2;
+ vacc3 += vi6x3;
+
+ float vfpacc0 = (float) vacc0 * vscale;
+ float vfpacc1 = (float) vacc1 * vscale;
+ float vfpacc2 = (float) vacc2 * vscale;
+ float vfpacc3 = (float) vacc3 * vscale;
+
+ vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
+ vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
+ vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
+ vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
+
+ vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
+ vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
+ vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
+ vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
+
+ const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
+ const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
+ const int32_t vrndacc2 = (int32_t) lrintf(vfpacc2);
+ const int32_t vrndacc3 = (int32_t) lrintf(vfpacc3);
+
+ int32_t vout0 = vrndacc0 + voutput_zero_point;
+ int32_t vout1 = vrndacc1 + voutput_zero_point;
+ int32_t vout2 = vrndacc2 + voutput_zero_point;
+ int32_t vout3 = vrndacc3 + voutput_zero_point;
+
+ output[0] = (uint8_t) vout0;
+ output[1] = (uint8_t) vout1;
+ output[2] = (uint8_t) vout2;
+ output[3] = (uint8_t) vout3;
+ output += 4;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ int32_t vacc = vinit_bias;
+ const int32_t vi0 = (int32_t) *i0++;
+ const int32_t vi1 = (int32_t) *i1++;
+
+ vacc += vi0;
+ const int32_t vi2 = (int32_t) *i2++;
+ vacc += vi1;
+ const int32_t vi3 = (int32_t) *i3++;
+ vacc += vi2;
+ const int32_t vi4 = (int32_t) *i4++;
+ vacc += vi3;
+ const int32_t vi5 = (int32_t) *i5++;
+ vacc += vi4;
+ const int32_t vi6 = (int32_t) *i6++;
+
+ vacc += vi5;
+ vacc += vi6;
+
+ float vfpacc = (float) vacc * vscale;
+ vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
+ vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
+ const int32_t vrndacc = (int32_t) lrintf(vfpacc);
+ int32_t vout = vrndacc + voutput_zero_point;
+
+ *output++ = (uint8_t) vout;
+ } while (--channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c16.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c16.c
new file mode 100644
index 0000000..e3f75ed
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c16.c
@@ -0,0 +1,248 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-sse2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i vzero = _mm_setzero_si128();
+ for (; channels >= 16; channels -= 16) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
+ i0 += 16;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero);
+ const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
+ i1 += 16;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero);
+ const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
+ i2 += 16;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero);
+ const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
+ i3 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero);
+ const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
+ i4 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero);
+ const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
+ i5 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero);
+ const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
+ i6 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+ __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
+ __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+ vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
+ vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+ vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
+ vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+ vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
+ vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+
+ __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+
+ vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if XNN_LIKELY(channels >= 8) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
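
In the SSE2 kernels, seven uint8 rows sum to at most 7 * 255 = 1785, which fits in an unsigned 16-bit lane, so the 16-bit per-channel sums are zero-extended to 32 bits by unpacking against a zero register before the init bias is added. A small standalone sketch of that widening step, with the bias value assumed for illustration:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const __m128i vzero = _mm_setzero_si128();
  // Assumed bias: -input_zero_point * rows, e.g. -128 * 7.
  const __m128i vinit_bias = _mm_set1_epi32(-896);

  // Eight 16-bit per-channel sums, as produced by the row-accumulation loop.
  const __m128i vacc01234567 = _mm_setr_epi16(1785, 700, 0, 42, 900, 1234, 7, 1785);

  // Zero-extend to two vectors of four 32-bit sums, then add the init bias.
  const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc01234567, vzero), vinit_bias);
  const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc01234567, vzero), vinit_bias);

  int32_t lanes[8];
  _mm_storeu_si128((__m128i*) lanes, vacc0123);
  _mm_storeu_si128((__m128i*) (lanes + 4), vacc4567);
  for (int i = 0; i < 8; i++) {
    printf("%d ", lanes[i]);               // first lane: 1785 - 896 = 889
  }
  printf("\n");
  return 0;
}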
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c24.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c24.c
new file mode 100644
index 0000000..e1b9224
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c24.c
@@ -0,0 +1,284 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-sse2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i vzero = _mm_setzero_si128();
+ for (; channels >= 24; channels -= 24) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
+ const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
+ i0 += 24;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero);
+ const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
+ const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, vzero);
+ const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
+ i1 += 24;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero);
+ const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
+ const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, vzero);
+ const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
+ i2 += 24;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero);
+ const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
+ __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, vzero);
+ const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
+ i3 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero);
+ const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, vzero);
+ const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
+ i4 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero);
+ const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, vzero);
+ const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
+ i5 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero);
+ const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, vzero);
+ const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
+ i6 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vzero);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
+ vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+ __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
+ __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
+ __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ);
+ __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+ vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
+ vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
+ vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale);
+ vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+ vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
+ vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
+ vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point);
+ vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+ vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
+ vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
+ vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ);
+ vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+ __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
+
+
+ __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+ __m128i voutGHIJKLMNGHIJKLMN = _mm_packus_epi16(voutGHIJKLMN, voutGHIJKLMN);
+
+ vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNGHIJKLMN = _mm_max_epu8(voutGHIJKLMNGHIJKLMN, voutput_min);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
+ output += 24;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
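+      // Store full groups of 8 remainder channels directly; a final group of 1-7 channels
+      // is written 4, 2, and 1 bytes at a time, shifting the packed bytes down as they are
+      // consumed.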
+ if XNN_LIKELY(channels >= 8) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c
new file mode 100644
index 0000000..aa6471a
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c
@@ -0,0 +1,207 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-sse2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
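+  // Row pointers beyond the actual `rows` count are redirected to the caller-provided zero
+  // buffer, so the fixed 7-row accumulation below is unaffected by missing rows.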
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i vzero = _mm_setzero_si128();
+ for (; channels >= 8; channels -= 8) {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+
+ const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
+ i0 += 8;
+
+ const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
+ i1 += 8;
+
+ const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
+ const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
+ i2 += 8;
+
+ const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
+ const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
+ i3 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
+ const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
+ i4 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
+ const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
+ i5 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
+ const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ }
+ }
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c16.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c16.c
new file mode 100644
index 0000000..48e171f
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c16.c
@@ -0,0 +1,212 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-sse4.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
+ for (; channels >= 16; channels -= 16) {
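+    // SSE4.1 zero-extends 8 input bytes directly to 16-bit lanes with _mm_cvtepu8_epi16,
+    // avoiding the separate unpack-with-zero step used in the SSE2 kernel.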
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
+ i0 += 16;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
+ i1 += 16;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
+ i2 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
+ i3 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
+ i4 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
+ i5 += 16;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
+ i6 += 16;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+ __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
+ __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+ vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
+ vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+ vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
+ vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+ vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
+ vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+ __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+
+ vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if XNN_LIKELY(channels >= 8) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c24.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c24.c
new file mode 100644
index 0000000..32cecfe
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c24.c
@@ -0,0 +1,241 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-sse4.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
+ for (; channels >= 24; channels -= 24) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
+ const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
+ i0 += 24;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
+ const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
+ i1 += 24;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
+ __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
+ i2 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
+ const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
+ i3 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
+ const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
+ i4 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
+ const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
+ i5 += 24;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
+ const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
+ i6 += 24;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+ __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
+ __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
+ __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN);
+ __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+ vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
+ vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
+ vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+ __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
+ __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
+ __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ);
+ __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+ vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
+ vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
+ vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale);
+ vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+ vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
+ vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
+ vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point);
+ vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+ vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
+ vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
+ vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ);
+ vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+ __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
+
+ __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+ __m128i voutGHIJKLMNGHIJKLMN = _mm_packus_epi16(voutGHIJKLMN, voutGHIJKLMN);
+
+ vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNGHIJKLMN = _mm_max_epu8(voutGHIJKLMNGHIJKLMN, voutput_min);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
+ output += 24;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if XNN_LIKELY(channels >= 8) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c
new file mode 100644
index 0000000..6f11c42
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c
@@ -0,0 +1,178 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-sse4.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
+ for (; channels >= 8; channels -= 8) {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ const __m128i vzero = _mm_setzero_si128();
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
+ i0 += 8;
+ const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
+ i1 += 8;
+
+ __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
+ const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
+ i2 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
+ const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
+ const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
+ const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
+ const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
+ i6 += 8;
+
+ vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
+
+ __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
+ __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());
+
+ vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
+ vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
+
+ __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
+ __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
+
+ vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
+ vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
+
+ vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
+ vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
+
+ vacc0123 = _mm_cvtps_epi32(vfpacc0123);
+ vacc4567 = _mm_cvtps_epi32(vfpacc4567);
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+ vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+ if (channels & 4) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ }
+ }
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c16.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c16.c
new file mode 100644
index 0000000..d7469a2
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c16.c
@@ -0,0 +1,212 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
+ const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
+ const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
+ const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
+ const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
+ for (; channels >= 16; channels -= 16) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ i0 += 16;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ i1 += 16;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ i2 += 16;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ i3 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ i4 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ i5 += 16;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ i6 += 16;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+
+ v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+ vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
+ vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+ vacc89AB = wasm_f32x4_mul(vacc89AB, vscale);
+ vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale);
+
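+    // "Magic bias" float-to-int conversion: adding the large bias leaves the rounded integer
+    // in the low mantissa bits, the i32x4 max applies the lower output bound, and the final
+    // integer subtract removes the bias while folding in the output zero point.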
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+ vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias);
+ vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+ vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min);
+ vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point);
+
+ v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF);
+
+ v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);
+
+ vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);
+
+ wasm_v128_store(output, vout0123456789ABCDEF);
+ output += 16;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
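+      // Partial stores: 8 bytes via a 64-bit lane extract, 4 bytes via a 32-bit lane extract,
+      // then the last 2/1 bytes from a scalar copy of the low 32 bits.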
+ if XNN_LIKELY(channels >= 8) {
+ *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
+ vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c24.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c24.c
new file mode 100644
index 0000000..c1b0cb0
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c24.c
@@ -0,0 +1,241 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
+ const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
+ const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
+ const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
+ const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
+ for (; channels >= 24; channels -= 24) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16);
+ i0 += 24;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16);
+ i1 += 24;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16);
+ i2 += 24;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16);
+ i3 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16);
+ i4 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16);
+ i5 += 24;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16);
+ i6 += 24;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
+
+ v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+ v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN));
+ v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+ vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
+ vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF);
+ vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ);
+ vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+ vacc89AB = wasm_f32x4_mul(vacc89AB, vscale);
+ vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale);
+ vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale);
+ vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+ vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias);
+ vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias);
+ vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias);
+ vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+ vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min);
+ vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min);
+ vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min);
+ vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point);
+ vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point);
+ vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point);
+
+ v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF);
+ v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN);
+
+ v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);
+ v128_t voutGHIJKLMNGHIJKLMN = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutGHIJKLMN);
+
+ vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNGHIJKLMN = wasm_u8x16_min(voutGHIJKLMNGHIJKLMN, voutput_max);
+
+ wasm_v128_store(output, vout0123456789ABCDEF);
+ *((double*) (output + 16)) = wasm_f64x2_extract_lane(voutGHIJKLMNGHIJKLMN, 0);
+ output += 24;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+ if XNN_LIKELY(channels >= 8) {
+ *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
+ vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c32.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c32.c
new file mode 100644
index 0000000..fb64a55
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c32.c
@@ -0,0 +1,267 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
+ const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
+ const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
+ const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
+ const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
+ for (; channels >= 32; channels -= 32) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
+ const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16);
+ const v128_t vxi0xOPQRSTUV = wasm_u16x8_load8x8(i0 + 24);
+ i0 += 32;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
+ const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16);
+ const v128_t vxi1xOPQRSTUV = wasm_u16x8_load8x8(i1 + 24);
+ i1 += 32;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
+ const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
+ v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
+ const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16);
+ v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV);
+ const v128_t vxi2xOPQRSTUV = wasm_u16x8_load8x8(i2 + 24);
+ i2 += 32;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
+ const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN);
+ const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV);
+ const v128_t vxi3xOPQRSTUV = wasm_u16x8_load8x8(i3 + 24);
+ i3 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
+ const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN);
+ const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV);
+ const v128_t vxi4xOPQRSTUV = wasm_u16x8_load8x8(i4 + 24);
+ i4 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
+ const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN);
+ const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV);
+ const v128_t vxi5xOPQRSTUV = wasm_u16x8_load8x8(i5 + 24);
+ i5 += 32;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
+ const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN);
+ const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV);
+ const v128_t vxi6xOPQRSTUV = wasm_u16x8_load8x8(i6 + 24);
+ i6 += 32;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+ vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF);
+ vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN);
+ vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV);
+
+ v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+ v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF));
+ v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF));
+ v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN));
+ v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN));
+ v128_t vaccOPQR = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccOPQRSTUV));
+ v128_t vaccSTUV = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccOPQRSTUV));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+ vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
+ vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF);
+ vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ);
+ vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN);
+ vaccOPQR = wasm_f32x4_convert_i32x4(vaccOPQR);
+ vaccSTUV = wasm_f32x4_convert_i32x4(vaccSTUV);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+ vacc89AB = wasm_f32x4_mul(vacc89AB, vscale);
+ vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale);
+ vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale);
+ vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale);
+ vaccOPQR = wasm_f32x4_mul(vaccOPQR, vscale);
+ vaccSTUV = wasm_f32x4_mul(vaccSTUV, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+ vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias);
+ vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias);
+ vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias);
+ vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias);
+ vaccOPQR = wasm_f32x4_add(vaccOPQR, vmagic_bias);
+ vaccSTUV = wasm_f32x4_add(vaccSTUV, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+ vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min);
+ vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min);
+ vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min);
+ vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min);
+ vaccOPQR = wasm_i32x4_max(vaccOPQR, vmagic_min);
+ vaccSTUV = wasm_i32x4_max(vaccSTUV, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+ vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point);
+ vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point);
+ vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point);
+ vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point);
+ vaccOPQR = wasm_i32x4_sub(vaccOPQR, vmagic_bias_less_output_zero_point);
+ vaccSTUV = wasm_i32x4_sub(vaccSTUV, vmagic_bias_less_output_zero_point);
+
+ v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF);
+ v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN);
+ v128_t voutOPQRSTUV = wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV);
+
+ v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);
+ v128_t voutGHIJKLMNOPQRSTUV = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV);
+
+ vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNOPQRSTUV = wasm_u8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max);
+
+ wasm_v128_store(output, vout0123456789ABCDEF);
+ wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV);
+ output += 32;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+ if XNN_LIKELY(channels >= 8) {
+ *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+ output += 8;
+ channels -= 8;
+ } else {
+ if (channels & 4) {
+ *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
+ vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ output += 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
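
For reference (not part of the patch): a per-channel scalar model of what these unipass QU8 GAVGPOOL microkernels compute, closest to the fp32 lrintf variant. The expressions suggested for init_bias and scale are assumptions; the kernels only consume the pre-folded values from xnn_qu8_avgpool_minmax_params.

#include <math.h>
#include <stddef.h>
#include <stdint.h>

// Reference for a unipass (rows <= 7) QU8 global average pool, one channel at a time.
static uint8_t qu8_gavgpool_reference(
    const uint8_t* input, size_t rows, size_t input_stride, size_t channel,
    int32_t init_bias,         // assumed: -(int32_t) input_zero_point * (int32_t) rows
    float scale,               // assumed: input_scale / (output_scale * rows)
    int32_t output_zero_point,
    uint8_t output_min, uint8_t output_max)
{
  int32_t acc = init_bias;
  for (size_t r = 0; r < rows; r++) {
    acc += (int32_t) input[r * input_stride + channel];  // widen u8 -> i32 and accumulate
  }
  long out = lrintf((float) acc * scale) + output_zero_point;  // round to nearest, re-zero
  if (out < (long) output_min) out = output_min;
  if (out > (long) output_max) out = output_max;
  return (uint8_t) out;
}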
diff --git a/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c8.c b/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c8.c
new file mode 100644
index 0000000..0a3c75d
--- /dev/null
+++ b/src/qu8-gavgpool/gen/7x-minmax-fp32-wasmsimd-c8.c
@@ -0,0 +1,178 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8(
+ size_t rows,
+ size_t channels,
+ const uint8_t* input,
+ size_t input_stride,
+ const uint8_t* zero,
+ uint8_t* output,
+ const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const uint8_t* i0 = input;
+ const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = zero;
+ }
+ const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = zero;
+ }
+ const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = zero;
+ }
+ const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = zero;
+ }
+ const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = zero;
+ }
+ const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = zero;
+ }
+
+ const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
+ const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
+ const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
+ const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
+ const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
+ for (; channels >= 8; channels -= 8) {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+ *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+ output += 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
+ i0 += 8;
+ const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
+ i1 += 8;
+
+ v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
+ const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
+ i2 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
+ const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
+ i3 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
+ const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
+ i4 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
+ const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
+ i5 += 8;
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
+ const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
+ i6 += 8;
+
+ vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
+
+ v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
+ v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
+
+ vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
+ vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
+
+ vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
+ vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
+ vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
+
+ vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
+ vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
+
+ const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
+ v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+ vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+ if (channels & 4) {
+ *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
+ vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+ output += 4;
+ }
+ uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
+ if (channels & 2) {
+ *((uint16_t*) output) = (uint16_t) vout0123;
+ vout0123 >>= 16;
+ output += 2;
+ }
+ if (channels & 1) {
+ *output = (uint8_t) vout0123;
+ }
+ }
+ }
+}
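
For reference (not part of the patch): the remainder path above always computes a full 8 output bytes (the extra input loads are covered by XNN_OOB_READS) and then stores only the remaining 1..7 channels. A scalar equivalent of that 4/2/1 store cascade, with names chosen here for illustration:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Store the low `channels` (1..7) bytes of an 8-byte result, mirroring the
// channels & 4 / & 2 / & 1 cascade in the kernel's remainder path.
static void store_partial(uint8_t* output, const uint8_t result[8], size_t channels)
{
  const uint8_t* src = result;
  if (channels & 4) {
    memcpy(output, src, 4);   // kernel: f32x4 lane-0 store, then u64x2 shift by 32
    src += 4;
    output += 4;
  }
  if (channels & 2) {
    memcpy(output, src, 2);   // kernel: 16-bit store from the extracted i32 lane
    src += 2;
    output += 2;
  }
  if (channels & 1) {
    *output = *src;           // kernel: final single-byte store
  }
}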
diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
index cfee5a1..c50a5ea 100644
--- a/src/xnnpack/gavgpool.h
+++ b/src/xnnpack/gavgpool.h
@@ -55,6 +55,7 @@
DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1)
DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1)
+
#define DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
size_t rows, \
@@ -82,37 +83,6 @@
DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8)
-#define DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
- XNN_INTERNAL void fn_name( \
- size_t rows, \
- size_t channels, \
- const uint8_t* input, \
- size_t input_stride, \
- const uint8_t* zero, \
- int32_t* buffer, \
- uint8_t* output, \
- const union xnn_qu8_avgpool_minmax_params* params);
-
-DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8)
-DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_ukernel_7p7x__sse2_c8)
-DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1)
-
-
-#define DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \
- XNN_INTERNAL void fn_name( \
- size_t rows, \
- size_t channels, \
- const uint8_t* input, \
- size_t input_stride, \
- const uint8_t* zero, \
- uint8_t* output, \
- const union xnn_qu8_avgpool_minmax_params* params);
-
-DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8)
-DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8)
-DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1)
-
-
#define DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
size_t rows, \
@@ -206,6 +176,99 @@
DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4)
+#define DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t rows, \
+ size_t channels, \
+ const uint8_t* input, \
+ size_t input_stride, \
+ const uint8_t* zero, \
+ int32_t* buffer, \
+ uint8_t* output, \
+ const union xnn_qu8_avgpool_minmax_params* params);
+
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2)
+DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4)
+
+
+#define DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t rows, \
+ size_t channels, \
+ const uint8_t* input, \
+ size_t input_stride, \
+ const uint8_t* zero, \
+ uint8_t* output, \
+ const union xnn_qu8_avgpool_minmax_params* params);
+
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4)
+
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2)
+DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4)
+
+
#define DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
size_t elements, \
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index e21dea4..a746f51 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -175,6 +175,24 @@
uint8_t output_min, \
uint8_t output_max);
+DECLARE_INIT_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params)
+DECLARE_INIT_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params)
+DECLARE_INIT_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params)
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ DECLARE_INIT_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_init_qu8_avgpool_minmax_fp32_neon_params)
+ DECLARE_INIT_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_init_qu8_avgpool_minmax_fp32_neonv8_params)
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ DECLARE_INIT_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_init_qu8_avgpool_minmax_fp32_sse2_params)
+ DECLARE_INIT_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_init_qu8_avgpool_minmax_fp32_sse4_params)
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ DECLARE_INIT_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params)
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
DECLARE_INIT_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_init_qu8_avgpool_minmax_scalar_params)
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
@@ -192,6 +210,24 @@
int32_t bias, \
float scale);
+DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_fp32_scalar_fmagic_params)
+DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params)
+DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_fp32_scalar_lrintf_params)
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_fp32_neon_params)
+ DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_fp32_neonv8_params)
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_fp32_sse2_params)
+ DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_fp32_sse4_params)
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params)
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_scalar_params)
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index b5a5e90..7f0c4b3 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1476,6 +1476,74 @@
union xnn_qu8_avgpool_minmax_params {
struct {
+ int32_t init_bias;
+ float scale;
+ float output_min_less_zero_point;
+ float output_max_less_zero_point;
+ float magic_bias;
+ int32_t magic_bias_less_output_zero_point;
+ } fp32_scalar_fmagic;
+ struct {
+ int32_t init_bias;
+ float scale;
+ float magic_bias;
+ int32_t magic_min;
+ int32_t magic_max;
+ int32_t magic_bias_less_zero_point;
+ } fp32_scalar_imagic;
+ struct {
+ int32_t init_bias;
+ float scale;
+ float output_min_less_zero_point;
+ float output_max_less_zero_point;
+ int32_t output_zero_point;
+ } fp32_scalar_lrintf;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ struct {
+ int32_t init_bias;
+ float scale;
+ float magic_bias;
+ int32_t magic_bias_less_output_zero_point;
+ uint8_t output_min;
+ uint8_t output_max;
+ } fp32_neon;
+ struct {
+ int32_t init_bias;
+ float scale;
+ int16_t output_zero_point;
+ uint8_t output_min;
+ uint8_t output_max;
+ } fp32_neonv8;
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) int32_t init_bias[4];
+ XNN_ALIGN(16) float scale[4];
+ XNN_ALIGN(16) float output_max_less_zero_point[4];
+ XNN_ALIGN(16) int16_t output_zero_point[8];
+ XNN_ALIGN(16) uint8_t output_min[16];
+ } fp32_sse2;
+ struct {
+ XNN_ALIGN(16) int32_t init_bias[4];
+ XNN_ALIGN(16) float scale[4];
+ XNN_ALIGN(16) float output_max_less_zero_point[4];
+ XNN_ALIGN(16) int16_t output_zero_point[8];
+ XNN_ALIGN(16) uint8_t output_min[16];
+ } fp32_sse4;
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ struct {
+ XNN_ALIGN(8) int32_t init_bias[2];
+ XNN_ALIGN(8) float scale[2];
+ XNN_ALIGN(8) float magic_bias[2];
+ XNN_ALIGN(8) int32_t magic_min[2];
+ XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
+ XNN_ALIGN(8) uint8_t output_max[8];
+ } fp32_wasmsimd;
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
+ // Legacy parameters used by QU8 AVGPOOL microkernels
+ struct {
int32_t bias;
int32_t multiplier;
int64_t rounding;