Support FP32 requantization in AVX2 QS8 microkernels
PiperOrigin-RevId: 375822588
diff --git a/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index de48d33..9741b73 100644
--- a/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -148,13 +148,13 @@
}
}
}
- const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+ const int32x4_t vmultiplier = vld1q_dup_s32(&params->gemmlowp_neon.multiplier);
vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
- const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->gemmlowp_neon.right_shift);
const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
@@ -166,7 +166,7 @@
vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
- const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->gemmlowp_neon.output_zero_point);
#if XNN_ARCH_ARM64
const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
@@ -178,8 +178,8 @@
int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
#endif
- const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
- const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+ const int8x16_t voutput_min = vld1q_dup_s8(&params->gemmlowp_neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(&params->gemmlowp_neon.output_max);
vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);