blob: f02b343a869d8ac37a19fa2e628790986c18bba3 [file] [log] [blame]
Marat Dukhanf46f6752020-01-21 11:03:49 -08001// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert ELEMENTS_TILE >= 1
7$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
8#include <assert.h>
9
10#include <xnnpack/common.h>
11#include <xnnpack/raddstoreexpminusmax.h>
12
13#include <fp16/bitcasts.h>
14
15
16void xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
17 size_t elements,
18 const float* input,
19 float* output,
20 float* sum,
21 float vi_max)
22{
23 assert(elements % sizeof(float) == 0);
24
25 const float vmagic_bias = 0x1.8000FEp23f;
26 // The smallest x for which expf(x) is normalized.
27 const float vdenorm_cutoff = -0x1.5D589Ep6f;
28 const float vlog2e = 0x1.715476p+0f;
29 // Last 7 bits are zeroes
30 const float vminus_ln2_hi = -0x1.62E400p-1f;
31 const float vminus_ln2_lo = -0x1.7F7D1Cp-20f;
32
33 const float vc1 = 0x1.FFFFF6p-1f;
34 const float vc2 = 0x1.FFFDC6p-2f;
35 const float vc3 = 0x1.555A80p-3f;
36 const float vc4 = 0x1.573A1Ap-5f;
37 const float vc5 = 0x1.0F9F9Cp-7f;
38
39 $if ELEMENTS_TILE > 1:
40 $for K in range(ACCUMULATORS):
41 float vacc${K} = 0.0f;
42 for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) {
43 // Load ${ELEMENTS_TILE} inputs at a time.
44 $for N in range(ELEMENTS_TILE):
45 const float vi${N} = input[${N}];
46 input += ${ELEMENTS_TILE};
47
48 // Subtract maximum input x := i - i_max. This implies x <= 0.
49 $for N in range(ELEMENTS_TILE):
50 const float vx${N} = vi${N} - vi_max;
51
52 // Compute reduced argument n := round(x / log(2)).
53 // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result
54 // to an integer, then subtracing the large number back. The trick with adding large number is valid only within
55 // certain bounds (|x| <= 2**22), but thats ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
56 // anyway. We fixup the result for such inputs at the very end of the algorithm.
57 $for N in range(ELEMENTS_TILE):
58 float vn${N} = vx${N} * vlog2e + vmagic_bias;
59
60 // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
61 // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
62 $for N in range(ELEMENTS_TILE):
63 const float vs${N} = fp32_from_bits(fp32_to_bits(vn${N}) << 23);
64
65 // Subtract the large number back to get final n := round(x / log(2)).
66 $for N in range(ELEMENTS_TILE):
67 vn${N} -= vmagic_bias;
68
69 // Compute reduced argument t := x - n * log(2).
70 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
71 $for N in range(ELEMENTS_TILE):
72 float vt${N} = vn${N} * vminus_ln2_hi + vx${N};
73
74 $for N in range(ELEMENTS_TILE):
75 vt${N} = vn${N} * vminus_ln2_lo + vt${N};
76
77 // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
78 $for N in range(ELEMENTS_TILE):
79 float vp${N} = vc5 * vt${N} + vc4;
80
81 $for N in range(ELEMENTS_TILE):
82 vp${N} = vp${N} * vt${N} + vc3;
83
84 $for N in range(ELEMENTS_TILE):
85 vp${N} = vp${N} * vt${N} + vc2;
86
87 $for N in range(ELEMENTS_TILE):
88 vp${N} = vp${N} * vt${N} + vc1;
89
90 // Reconstruct the final f value:
91 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
92 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
93 // = s + (t * s) * p
94 $for N in range(ELEMENTS_TILE):
95 vt${N} *= vs${N};
96
97 $for N in range(ELEMENTS_TILE):
98 float vf${N} = vt${N} * vp${N} + vs${N};
99
100 // For inputs below denormal cutoff, replace output with +0.0f.
101 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
102 $for N in range(ELEMENTS_TILE):
103 if XNN_UNPREDICTABLE(vx${N} < vdenorm_cutoff) {
104 vf${N} = 0.0f;
105 }
106
107 // Store ${ELEMENTS_TILE} outputs at a time.
108 $for N in range(ELEMENTS_TILE):
109 output[${N}] = vf${N};
110 output += ${ELEMENTS_TILE};
111
112 // Accumulate computed exponents.
113 $for N in range(ELEMENTS_TILE):
114 vacc${N % ACCUMULATORS} += vf${N};
115 }
116 $if ACCUMULATORS > 1:
117 // Add up all accumulators to vacc0
118 $ACC_SLICE = 1
119 $while ACC_SLICE < ACCUMULATORS:
120 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
121 $if A + ACC_SLICE < ACCUMULATORS:
122 vacc${A} += vacc${A + ACC_SLICE};
123 $ACC_SLICE *= 2
124
125 float vacc = vacc0;
126 $else:
127 float vacc = 0.0f;
128 for (; elements >= sizeof(float); elements -= sizeof(float)) {
129 // Load 1 input at a time.
130 const float vi = *input++;
131
132 // Subtract maximum input x := i - i_max. This implies x <= 0.
133 const float vx = vi - vi_max;
134
135 // Compute reduced argument n := round(x / log(2)).
136 // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result
137 // to an integer, then subtracing the large number back. The trick with adding large number is valid only within
138 // certain bounds (|x| <= 2**22), but thats ok, because inputs outside of [-87.336540, 0.0] underflow expf(x)
139 // anyway. We fixup the result for such inputs at the very end of the algorithm.
140 float vn = vx * vlog2e + vmagic_bias;
141
142 // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
143 // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
144 const float vs = fp32_from_bits(fp32_to_bits(vn) << 23);
145
146 // Subtract the large number back to get final n := round(x / log(2)).
147 vn -= vmagic_bias;
148
149 // Compute reduced argument t := x - n * log(2).
150 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
151 float vt = vn * vminus_ln2_hi + vx;
152 vt = vn * vminus_ln2_lo + vt;
153
154 // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
155 float vp = vc5 * vt + vc4;
156 vp = vp * vt + vc3;
157 vp = vp * vt + vc2;
158 vp = vp * vt + vc1;
159
160 // Reconstruct the final f value:
161 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
162 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
163 // = s + (t * s) * p
164 vt *= vs;
165 float vf = vt * vp + vs;
166
167 // For inputs below denormal cutoff, replace output with +0.0f.
168 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
169 if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
170 vf = 0.0f;
171 }
172
173 // Store 1 output at a time.
174 *output++ = vf;
175
176 // Accumulate computed exponents.
177 vacc += vf;
178 }
179 *sum = vacc;
180}