Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 1 | // Copyright 2020 Google LLC |
| 2 | // |
| 3 | // This source code is licensed under the BSD-style license found in the |
| 4 | // LICENSE file in the root directory of this source tree. |
| 5 | |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 6 | $assert DATATYPE in ["QS8", "QU8"] |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 7 | $assert CHANNEL_TILE % 8 == 0 |
| 8 | $assert CHANNEL_TILE >= 8 |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 9 | $assert ROW_TILE >= 3 |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 10 | $assert REQUANTIZATION in ["FP32", "RNDNU"] |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 11 | $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 12 | #include <assert.h> |
| 13 | |
| 14 | #include <arm_neon.h> |
| 15 | |
| 16 | #include <xnnpack/gavgpool.h> |
Marat Dukhan | 139337c | 2022-01-12 14:41:11 -0800 | [diff] [blame] | 17 | $if ARMV8: |
| 18 | #include <xnnpack/intrinsics-polyfill.h> |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 19 | |
| 20 | |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 21 | $PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("neonv8" if ARMV8 else "neon") |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 22 | $XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] |
| 23 | $XINT8X8_T = {"QS8": "int8x8_t", "QU8": "uint8x8_t"}[DATATYPE] |
| 24 | $XINT8X16_T = {"QS8": "int8x16_t", "QU8": "uint8x16_t"}[DATATYPE] |
| 25 | $XINT16X8_T = {"QS8": "int16x8_t", "QU8": "uint16x8_t"}[DATATYPE] |
| 26 | $VLD1_X8 = {"QS8": "vld1_s8", "QU8": "vld1_u8"}[DATATYPE] |
| 27 | $VLD1_DUP_X8 = {"QS8": "vld1_dup_s8", "QU8": "vld1_dup_u8"}[DATATYPE] |
| 28 | $VLD1Q_DUP_X8 = {"QS8": "vld1q_dup_s8", "QU8": "vld1q_dup_u8"}[DATATYPE] |
| 29 | $VST1_X8 = {"QS8": "vst1_s8", "QU8": "vst1_u8"}[DATATYPE] |
| 30 | $VST1Q_X8 = {"QS8": "vst1q_s8", "QU8": "vst1q_u8"}[DATATYPE] |
| 31 | $VST1_LANE_X8 = {"QS8": "vst1_lane_s8", "QU8": "vst1_lane_u8"}[DATATYPE] |
| 32 | $VADDL_X8 = {"QS8": "vaddl_s8", "QU8": "vaddl_u8"}[DATATYPE] |
| 33 | $VADDW_X8 = {"QS8": "vaddw_s8", "QU8": "vaddw_u8"}[DATATYPE] |
| 34 | $VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE] |
| 35 | $VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE] |
| 36 | $VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE] |
| 37 | $VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE] |
| 38 | $VEXT_X8 = {"QS8": "vext_s8", "QU8": "vext_u8"}[DATATYPE] |
| 39 | $VQMOVXN_S16 = {"QS8": "vqmovn_s16", "QU8": "vqmovun_s16"}[DATATYPE] |
| 40 | $VQMOVXN_HIGH_S16 = {"QS8": "vqmovn_high_s16", "QU8": "vqmovun_high_s16"}[DATATYPE] |
| 41 | $VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE] |
| 42 | $VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE] |
| 43 | $VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE] |
| 44 | $VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE] |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 45 | $ISA = "neonv8" if ARMV8 else "neon" |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 46 | void xnn_${DATATYPE.lower()}_gavgpool_minmax_${REQUANTIZATION.lower()}_ukernel_${ROW_TILE}x__${ISA}_c${CHANNEL_TILE}( |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 47 | size_t rows, |
| 48 | size_t channels, |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 49 | const ${XINT8_T}* input, |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 50 | size_t input_stride, |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 51 | const ${XINT8_T}* zero, |
| 52 | ${XINT8_T}* output, |
| 53 | const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 54 | { |
| 55 | assert(rows != 0); |
| 56 | assert(rows <= ${ROW_TILE}); |
| 57 | assert(channels != 0); |
| 58 | |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 59 | const ${XINT8_T}* i0 = input; |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 60 | $for M in range(1, ROW_TILE): |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 61 | const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 62 | $if M % 2 == 1: |
| 63 | if XNN_UNPREDICTABLE(rows < ${M+1}) { |
| 64 | i${M} = zero; |
| 65 | } |
| 66 | $else: |
| 67 | if XNN_UNPREDICTABLE(rows <= ${M}) { |
| 68 | i${M} = zero; |
| 69 | } |
| 70 | |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 71 | const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->${PARAMS_STRUCT}.init_bias); |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 72 | $if REQUANTIZATION == "FP32": |
| 73 | const float32x4_t vscale = vld1q_dup_f32(¶ms->${PARAMS_STRUCT}.scale); |
| 74 | $if ARMV8: |
| 75 | const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point); |
| 76 | $else: |
| 77 | const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); |
| 78 | const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); |
| 79 | $elif REQUANTIZATION == "RNDNU": |
| 80 | const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); |
| 81 | const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); |
| 82 | const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); |
| 83 | const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 84 | $if CHANNEL_TILE > 8: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 85 | const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(¶ms->${PARAMS_STRUCT}.output_min); |
| 86 | const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(¶ms->${PARAMS_STRUCT}.output_max); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 87 | $else: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 88 | const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(¶ms->${PARAMS_STRUCT}.output_min); |
| 89 | const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(¶ms->${PARAMS_STRUCT}.output_max); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 90 | for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 91 | $for M in range(2): |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 92 | $for C in range(0, CHANNEL_TILE, 8): |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 93 | const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8; |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 94 | |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 95 | $for C in range(0, CHANNEL_TILE, 8): |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 96 | const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8; |
| 97 | ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]}); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 98 | |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 99 | $for M in range(2, ROW_TILE): |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 100 | $for C in range(0, CHANNEL_TILE, 8): |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 101 | $if M + 1 != ROW_TILE: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 102 | const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8; |
| 103 | vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 104 | |
| 105 | $for C in range(0, CHANNEL_TILE, 8): |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 106 | $if DATATYPE == "QS8": |
| 107 | int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[C:C+8]})); |
| 108 | int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[C:C+8]})); |
| 109 | $else: |
| 110 | int32x4_t vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[C:C+8]}))); |
| 111 | int32x4_t vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[C:C+8]}))); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 112 | |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 113 | $if REQUANTIZATION == "FP32": |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 114 | $for C in range(0, CHANNEL_TILE, 4): |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 115 | float32x4_t vfpacc${ABC[C:C+4]} = vcvtq_f32_s32(vacc${ABC[C:C+4]}); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 116 | |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 117 | $for C in range(0, CHANNEL_TILE, 4): |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 118 | vfpacc${ABC[C:C+4]} = vmulq_f32(vfpacc${ABC[C:C+4]}, vscale); |
| 119 | |
| 120 | $if ARMV8: |
| 121 | $for C in range(0, CHANNEL_TILE, 4): |
| 122 | vacc${ABC[C:C+4]} = vcvtnq_s32_f32(vfpacc${ABC[C:C+4]}); |
| 123 | $else: |
| 124 | $for C in range(0, CHANNEL_TILE, 4): |
| 125 | vacc${ABC[C:C+4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[C:C+4]}, vmagic_bias)); |
| 126 | |
| 127 | $for C in range(0, CHANNEL_TILE, 4): |
| 128 | vacc${ABC[C:C+4]} = vqsubq_s32(vacc${ABC[C:C+4]}, vmagic_bias_less_output_zero_point); |
| 129 | $elif REQUANTIZATION == "RNDNU": |
| 130 | $for C in range(0, CHANNEL_TILE, 4): |
| 131 | vacc${ABC[C:C+4]} = vqshlq_s32(vacc${ABC[C:C+4]}, vleft_pre_shift); |
| 132 | |
| 133 | $for C in range(0, CHANNEL_TILE, 4): |
| 134 | vacc${ABC[C:C+4]} = vqdmulhq_s32(vacc${ABC[C:C+4]}, vmultiplier); |
| 135 | |
| 136 | $for C in range(0, CHANNEL_TILE, 4): |
| 137 | vacc${ABC[C:C+4]} = vrshlq_s32(vacc${ABC[C:C+4]}, vleft_post_shift); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 138 | |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 139 | #if XNN_ARCH_ARM64 |
| 140 | $for C in range(0, CHANNEL_TILE, 8): |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 141 | int16x8_t vacc${ABC[C:C+8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc${ABC[C+4:C+8]}); |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 142 | #else // !XNN_ARCH_ARM64 |
| 143 | $for C in range(0, CHANNEL_TILE, 8): |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 144 | int16x8_t vacc${ABC[C:C+8]} = vcombine_s16(vqmovn_s32(vacc${ABC[C:C+4]}), vqmovn_s32(vacc${ABC[C+4:C+8]})); |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 145 | #endif // !XNN_ARCH_ARM64 |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 146 | |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 147 | $if REQUANTIZATION != "FP32" or ARMV8: |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 148 | $for C in range(0, CHANNEL_TILE, 8): |
| 149 | vacc${ABC[C:C+8]} = vqaddq_s16(vacc${ABC[C:C+8]}, voutput_zero_point); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 150 | |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 151 | #if XNN_ARCH_ARM64 |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 152 | $for C in range(0, CHANNEL_TILE, 16): |
| 153 | $if C + 8 < CHANNEL_TILE: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 154 | ${XINT8X16_T} vout${ABC[C:C+16]} = ${VQMOVXN_HIGH_S16}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), vacc${ABC[C+8:C+16]}); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 155 | $else: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 156 | ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]}); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 157 | #else // !XNN_ARCH_ARM64 |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 158 | $for C in range(0, CHANNEL_TILE, 16): |
| 159 | $if C + 8 < CHANNEL_TILE: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 160 | ${XINT8X16_T} vout${ABC[C:C+16]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), ${VQMOVXN_S16}(vacc${ABC[C+8:C+16]})); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 161 | $else: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 162 | ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]}); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 163 | #endif // !XNN_ARCH_ARM64 |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 164 | |
| 165 | $for C in range(0, CHANNEL_TILE, 16): |
| 166 | $if C + 8 < CHANNEL_TILE: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 167 | vout${ABC[C:C+16]} = ${VMAXQ_X8}(vout${ABC[C:C+16]}, voutput_min); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 168 | $elif CHANNEL_TILE > 8: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 169 | vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_min)); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 170 | $else: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 171 | vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, voutput_min); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 172 | |
| 173 | $for C in range(0, CHANNEL_TILE, 16): |
| 174 | $if C + 8 < CHANNEL_TILE: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 175 | vout${ABC[C:C+16]} = ${VMINQ_X8}(vout${ABC[C:C+16]}, voutput_max); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 176 | $elif CHANNEL_TILE > 8: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 177 | vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_max)); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 178 | $else: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 179 | vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, voutput_max); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 180 | |
| 181 | $for C in range(0, CHANNEL_TILE, 16): |
| 182 | $if C + 8 < CHANNEL_TILE: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 183 | ${VST1Q_X8}(output, vout${ABC[C:C+16]}); output += 16; |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 184 | $else: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 185 | ${VST1_X8}(output, vout${ABC[C:C+8]}); output += 8; |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 186 | } |
| 187 | if XNN_UNLIKELY(channels != 0) { |
| 188 | ${"do " if CHANNEL_TILE > 8 else ""}{ |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 189 | $for M in range(3): |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 190 | const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); i${M} += 8; |
| 191 | ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]}); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 192 | |
Marat Dukhan | 9e258d6 | 2022-01-12 10:50:51 -0800 | [diff] [blame] | 193 | $for M in range(2, ROW_TILE): |
| 194 | $if M + 1 != ROW_TILE: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 195 | const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8; |
| 196 | vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]}); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 197 | |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 198 | $if DATATYPE == "QS8": |
| 199 | int32x4_t vacc${ABC[0:4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[0:8]})); |
| 200 | int32x4_t vacc${ABC[4:8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[0:8]})); |
| 201 | $else: |
| 202 | int32x4_t vacc${ABC[0:4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[0:8]}))); |
| 203 | int32x4_t vacc${ABC[4:8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[0:8]}))); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 204 | |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 205 | $if REQUANTIZATION == "FP32": |
| 206 | float32x4_t vfpacc${ABC[0:4]} = vcvtq_f32_s32(vacc${ABC[0:4]}); |
| 207 | float32x4_t vfpacc${ABC[4:8]} = vcvtq_f32_s32(vacc${ABC[4:8]}); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 208 | |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 209 | vfpacc${ABC[0:4]} = vmulq_f32(vfpacc${ABC[0:4]}, vscale); |
| 210 | vfpacc${ABC[4:8]} = vmulq_f32(vfpacc${ABC[4:8]}, vscale); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 211 | |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 212 | $if ARMV8: |
| 213 | vacc${ABC[0:4]} = vcvtnq_s32_f32(vfpacc${ABC[0:4]}); |
| 214 | vacc${ABC[4:8]} = vcvtnq_s32_f32(vfpacc${ABC[4:8]}); |
| 215 | $else: |
| 216 | vacc${ABC[0:4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[0:4]}, vmagic_bias)); |
| 217 | vacc${ABC[4:8]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[4:8]}, vmagic_bias)); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 218 | |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 219 | vacc${ABC[0:4]} = vqsubq_s32(vacc${ABC[0:4]}, vmagic_bias_less_output_zero_point); |
| 220 | vacc${ABC[4:8]} = vqsubq_s32(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point); |
| 221 | $elif REQUANTIZATION == "RNDNU": |
| 222 | vacc${ABC[0:4]} = vqshlq_s32(vacc${ABC[0:4]}, vleft_pre_shift); |
| 223 | vacc${ABC[4:8]} = vqshlq_s32(vacc${ABC[4:8]}, vleft_pre_shift); |
| 224 | |
| 225 | vacc${ABC[0:4]} = vqdmulhq_s32(vacc${ABC[0:4]}, vmultiplier); |
| 226 | vacc${ABC[4:8]} = vqdmulhq_s32(vacc${ABC[4:8]}, vmultiplier); |
| 227 | |
| 228 | vacc${ABC[0:4]} = vrshlq_s32(vacc${ABC[0:4]}, vleft_post_shift); |
| 229 | vacc${ABC[4:8]} = vrshlq_s32(vacc${ABC[4:8]}, vleft_post_shift); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 230 | |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 231 | #if XNN_ARCH_ARM64 |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 232 | int16x8_t vacc${ABC[0:8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[0:4]}), vacc${ABC[4:8]}); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 233 | #else |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 234 | int16x8_t vacc${ABC[0:8]} = vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]})); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 235 | #endif |
Marat Dukhan | 8575504 | 2022-01-13 01:46:05 -0800 | [diff] [blame] | 236 | $if REQUANTIZATION != "FP32" or ARMV8: |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 237 | vacc${ABC[0:8]} = vqaddq_s16(vacc${ABC[0:8]}, voutput_zero_point); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 238 | |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 239 | ${XINT8X8_T} vout${ABC[0:8]} = ${VQMOVXN_S16}(vacc${ABC[0:8]}); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 240 | $if CHANNEL_TILE > 8: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 241 | vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min)); |
| 242 | vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max)); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 243 | |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 244 | if XNN_LIKELY(channels >= 8) { |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 245 | ${VST1_X8}(output, vout${ABC[0:8]}); output += 8; |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 246 | channels -= 8; |
| 247 | } else { |
| 248 | if (channels & 4) { |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 249 | vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4; |
| 250 | vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 251 | } |
| 252 | if (channels & 2) { |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 253 | vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2; |
| 254 | vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 255 | } |
| 256 | if (channels & 1) { |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 257 | ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0); output += 1; |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 258 | } |
| 259 | channels = 0; |
| 260 | } |
| 261 | $else: |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 262 | vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, voutput_min); |
| 263 | vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, voutput_max); |
Marat Dukhan | 53f4106 | 2022-01-11 19:44:57 -0800 | [diff] [blame] | 264 | |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 265 | if (channels & 4) { |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 266 | vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4; |
| 267 | vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 268 | } |
| 269 | if (channels & 2) { |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 270 | vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2; |
| 271 | vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 272 | } |
| 273 | if (channels & 1) { |
Marat Dukhan | d1f53e4 | 2022-01-12 22:34:51 -0800 | [diff] [blame] | 274 | ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0); |
Marat Dukhan | 281262d | 2020-08-10 13:23:21 -0700 | [diff] [blame] | 275 | } |
| 276 | }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} |
| 277 | } |
| 278 | } |