blob: bfe60d69e2ebb7073680ee006850df10bab99aaa [file] [log] [blame]
Marat Dukhan4c4eb002019-12-08 21:27:49 -08001// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert ELEMENTS_TILE % 8 == 0
7$assert ELEMENTS_TILE >= 8
8$SIMD_TILE = ELEMENTS_TILE // 8
9$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
10#include <assert.h>
11
12#include <immintrin.h>
13
14#include <xnnpack/raddstoreexpminusmax.h>
15
16
17static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
18
19void xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
20 size_t elements,
21 const float* input,
22 float* output,
23 float* sum,
24 float max)
25{
26 assert(elements % sizeof(float) == 0);
27
28 const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
29 // The smallest x for which expf(x) is normalized.
30 const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);
31 const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
32 const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E43p-1f);
33 const __m256 vminus_ln2_lo = _mm256_set1_ps(0x1.05C61p-29f);
34
35 const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
36 const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
37 const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
38 const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
39 const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
40
41 const __m256 vi_max = _mm256_set1_ps(max);
42
43 $for K in range(ACCUMULATORS):
44 __m256 vacc${K} = _mm256_setzero_ps();
45 for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) {
46 // Load ${ELEMENTS_TILE} (${SIMD_TILE}x8) inputs at a time.
47 const __m256 vi0 = _mm256_loadu_ps(input);
48 $for N in range(1, SIMD_TILE):
49 const __m256 vi${N} = _mm256_loadu_ps(input + ${N * 8});
50 input += ${ELEMENTS_TILE};
51
52 // Subtract maximum input x := i - i_max. This implies x <= 0.
53 $for N in range(SIMD_TILE):
54 const __m256 vx${N} = _mm256_sub_ps(vi${N}, vi_max);
55
56 // Compute reduced argument elements := round(x / log(2)).
57 $for N in range(SIMD_TILE):
58 __m256 vn${N} = _mm256_fmadd_ps(vx${N}, vlog2e, vmagic_bias);
59
60 // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
61 // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
62 $for N in range(SIMD_TILE):
63 const __m256 vs${N} = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn${N}), 23));
64
65 // Subtract the large number back to get final elements := round(x / log(2)).
66 $for N in range(SIMD_TILE):
67 vn${N} = _mm256_sub_ps(vn${N}, vmagic_bias);
68
69 // Compute reduced argument t := x - elements * log(2).
70 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
71 $for N in range(SIMD_TILE):
72 __m256 vt${N} = _mm256_fmadd_ps(vn${N}, vminus_ln2_hi, vx${N});
73
74 $for N in range(SIMD_TILE):
75 vt${N} = _mm256_fmadd_ps(vn${N}, vminus_ln2_lo, vt${N});
76
77 // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
78 $for N in range(SIMD_TILE):
79 __m256 vp${N} = _mm256_fmadd_ps(vc5, vt${N}, vc4);
80
81 $for N in range(SIMD_TILE):
82 vp${N} = _mm256_fmadd_ps(vp${N}, vt${N}, vc3);
83
84 $for N in range(SIMD_TILE):
85 vp${N} = _mm256_fmadd_ps(vp${N}, vt${N}, vc2);
86
87 $for N in range(SIMD_TILE):
88 vp${N} = _mm256_fmadd_ps(vp${N}, vt${N}, vc1);
89
90 // Reconstruct the final f value:
91 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
92 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
93 // = s + (t * s) * p
94 $for N in range(SIMD_TILE):
95 vt${N} = _mm256_mul_ps(vt${N}, vs${N});
96
97 $for N in range(SIMD_TILE):
98 __m256 vf${N} = _mm256_fmadd_ps(vt${N}, vp${N}, vs${N});
99
100 // For inputs below zero cutoff, replace output with +0.0f.
101 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
102 $for N in range(SIMD_TILE):
103 vf${N} = _mm256_andnot_ps(_mm256_cmp_ps(vx${N}, vdenorm_cutoff, _CMP_LT_OS), vf${N});
104
105 // Store ${ELEMENTS_TILE} (${SIMD_TILE}x8) outputs at a time.
106 _mm256_storeu_ps(output, vf0);
107 $for N in range(1, SIMD_TILE):
108 _mm256_storeu_ps(output + ${N * 8}, vf${N});
109 output += ${ELEMENTS_TILE};
110
111 // Accumulate computed exponents.
112 $for N in range(SIMD_TILE):
113 vacc${N % ACCUMULATORS} = _mm256_add_ps(vacc${N % ACCUMULATORS}, vf${N});
114 }
115 $if ACCUMULATORS > 1:
116 // Add up all accumulators to vacc0
117 $ACC_SLICE = 1
118 $while ACC_SLICE < ACCUMULATORS:
119 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
120 $if A + ACC_SLICE < ACCUMULATORS:
121 vacc${A} = _mm256_add_ps(vacc${A}, vacc${A + ACC_SLICE});
122 $ACC_SLICE *= 2
123
124 __m256 vacc = vacc0;
125 for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
126 // Load 8 inputs at a time.
127 const __m256 vi = _mm256_loadu_ps(input);
128 input += 8;
129
130 // Subtract maximum input x := i - i_max. This implies x <= 0.
131 const __m256 vx = _mm256_sub_ps(vi, vi_max);
132
133 // Compute reduced argument elements := round(x / log(2)).
134 __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias);
135
136 // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
137 // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
138 const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
139
140 // Subtract the large number back to get final elements := round(x / log(2)).
141 vn = _mm256_sub_ps(vn, vmagic_bias);
142
143 // Compute reduced argument t := x - elements * log(2).
144 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
145 __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2_hi, vx);
146 vt = _mm256_fmadd_ps(vn, vminus_ln2_lo, vt);
147
148 // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
149 __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
150 vp = _mm256_fmadd_ps(vp, vt, vc3);
151 vp = _mm256_fmadd_ps(vp, vt, vc2);
152 vp = _mm256_fmadd_ps(vp, vt, vc1);
153
154 // Reconstruct the final f value:
155 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
156 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
157 // = s + (t * s) * p
158 vt = _mm256_mul_ps(vt, vs);
159 __m256 vf = _mm256_fmadd_ps(vt, vp, vs);
160
161 // For inputs below zero cutoff, replace output with +0.0f.
162 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
163 vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);
164
165 // Store 8 outputs at a time.
166 _mm256_storeu_ps(output, vf);
167 output += 8;
168
169 // Accumulate computed exponents.
170 vacc = _mm256_add_ps(vacc, vf);
171 }
172 if (elements != 0) {
173 assert(elements >= 1 * sizeof(float));
174 assert(elements <= 7 * sizeof(float));
175 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - elements));
176
177 // Load up to 7 inputs at a time.
178 const __m256 vi = _mm256_maskload_ps(input, vmask);
179
180 // Subtract maximum input x := i - i_max. This implies x <= 0.
181 const __m256 vx = _mm256_sub_ps(vi, vi_max);
182
183 // Compute reduced argument elements := round(x / log(2)).
184 __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias);
185
186 // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
187 // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
188 const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
189
190 // Subtract the large number back to get final elements := round(x / log(2)).
191 vn = _mm256_sub_ps(vn, vmagic_bias);
192
193 // Compute reduced argument t := x - elements * log(2).
194 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
195 __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2_hi, vx);
196 vt = _mm256_fmadd_ps(vn, vminus_ln2_lo, vt);
197
198 // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
199 __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
200 vp = _mm256_fmadd_ps(vp, vt, vc3);
201 vp = _mm256_fmadd_ps(vp, vt, vc2);
202 vp = _mm256_fmadd_ps(vp, vt, vc1);
203
204 // Reconstruct the final f value:
205 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
206 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
207 // = s + (t * s) * p
208 vt = _mm256_mul_ps(vt, vs);
209 __m256 vf = _mm256_fmadd_ps(vt, vp, vs);
210
211 // For inputs below zero cutoff, replace output with +0.0f.
212 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
213 vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);
214
215 // Store up to 7 outputs at a time.
216 _mm256_maskstore_ps(output, vmask, vf);
217
218 // Accumulate computed exponents. And addend with mask to leave unmasked 32-bit lanes unchanged.
219 vacc = _mm256_add_ps(vacc, _mm256_and_ps(vf, _mm256_castsi256_ps(vmask)));
220 }
221 // Reduce 8 elements in the SIMD register
222 __m128 vacc_lo = _mm_add_ps(_mm256_castps256_ps128(vacc), _mm256_extractf128_ps(vacc, 1));
223 vacc_lo = _mm_add_ps(vacc_lo, _mm_movehl_ps(vacc_lo, vacc_lo));
224 vacc_lo = _mm_add_ss(vacc_lo, _mm_movehdup_ps(vacc_lo));
225 _mm_store_ss(sum, vacc_lo);
226 _mm256_zeroupper();
227}