// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <ios>
#include <vector>

#include <gtest/gtest.h>

#include <fp16.h>

#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
#include <xnnpack/math-stubs.h>

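// Number of single-precision values passed to the kernel under test per call;
// the buffers below are 64-byte aligned (one AVX512 register width).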
constexpr int kBlockSize = 1024;

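// Every exp kernel below gets the same six special-case tests:
// * negative_zero / positive_zero: exp(±0.0f) must be exactly 1.0f.
// * negative_saturation: from 0xC2CFF1B5 (about -103.972f, where the true
//   exp(x) drops below 2^-150 and rounds to zero) down to -infinity
//   (0xFF800000), the output must be exactly +0.0f.
// * positive_overflow: from 0x42B17218 (about +88.723f, just past the largest
//   input whose exp is still a finite float) up to +infinity (0x7F800000),
//   the output must be exactly +infinity.
// * positive_nan / negative_nan: NaN inputs of either sign must produce NaN.
// The sweeping tests enumerate every bit pattern in the range in blocks of
// kBlockSize, clamping at the end of the range so all lanes remain valid.
// The kernel descriptions in the comments below decode the name suffixes per
// XNNPACK's usual convention (rr2, lutNN, pN, perm, scalef).

// NEON/FMA kernel: two-step range reduction (rr2), 64-entry lookup table
// (lut64), and a degree-2 polynomial (p2).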
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_saturation) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_overflow) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

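// NEON/FMA kernel: two-step range reduction (rr2) and a degree-5 polynomial
// (p5) in place of a lookup table.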
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  TEST(EXP__NEONFMA_RR2_P5, negative_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_P5, positive_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_P5, negative_saturation) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_P5, positive_overflow) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_P5, positive_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_P5, negative_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

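// AVX512F kernel: 16-entry in-register lookup table indexed with a vector
// permute (lut16 + perm), plus a degree-3 polynomial (p3).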
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

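// Same lut16 + p3 scheme as above, but the final scaling by 2^n is done with
// the AVX512 scalef (VSCALEFPS) instruction, per the _scalef suffix.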
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

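// AVX512F kernel: 32-entry lookup table spread across two registers and
// indexed with a two-source permute (lut32 + perm2), plus a degree-2
// polynomial (p2).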
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

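// Same lut32 + p2 + perm2 scheme as above, with the final 2^n scaling done
// via the AVX512 scalef (VSCALEFPS) instruction.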
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

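// AVX512F kernel with two-step range reduction (rr2) and a degree-5
// polynomial (p5); no lookup table.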
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_P5, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

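// Degree-5 polynomial kernel as above, reconstructing the result with the
// AVX512 scalef (VSCALEFPS) instruction.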
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

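// AVX2 kernel: 8-entry in-register lookup table indexed with a vector
// permute (lut8 + perm), plus a degree-3 polynomial (p3).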
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_saturation) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_overflow) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

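// AVX2 kernel identical in structure to the previous one, but with a
// degree-4 polynomial (p4).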
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_saturation) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_overflow) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

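// AVX2 kernel with two-step range reduction (rr2) and a degree-5 polynomial
// (p5); no lookup table.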
1098#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX2_RR2_P5, negative_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_P5, positive_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

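  // 0xC2CFF1B5 is approximately -103.972084 ~= ln(2^-150): below this point
  // even the nearest denormal result rounds to zero, so every input from here
  // down to -infinity must produce +0.0f.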
  TEST(EXP__AVX2_RR2_P5, negative_saturation) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

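  // 0x42B17218 is approximately 88.722839 ~= ln(FLT_MAX): from this input up
  // to +infinity, the correctly-rounded exp(x) overflows to +infinity.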
  TEST(EXP__AVX2_RR2_P5, positive_overflow) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_P5, positive_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_P5, negative_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
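  // The AVX variant of the RR2 P5 kernel is exercised on the same special
  // cases: signed zeros, underflow to zero, overflow to infinity, and NaN
  // propagation.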
  TEST(EXP__AVX_RR2_P5, negative_zero) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX_RR2_P5, positive_zero) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX_RR2_P5, negative_saturation) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX_RR2_P5, positive_overflow) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX_RR2_P5, positive_nan) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX_RR2_P5, negative_nan) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
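  // The SSE2 kernels below carry no TEST_REQUIRES guard: SSE2 is assumed to be
  // part of the baseline instruction set on the x86 and x86-64 targets covered
  // by this #if block. "lut64_p2" denotes a 64-entry lookup table combined
  // with a degree-2 polynomial.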
  TEST(EXP__SSE2_RR2_LUT64_P2, negative_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, positive_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, negative_saturation) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, positive_overflow) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, positive_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, negative_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
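  // Same special-case sweep for the SSE2 RR2 P5 kernel, which replaces the
  // lookup table with a single degree-5 polynomial.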
  TEST(EXP__SSE2_RR2_P5, negative_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_P5, positive_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_P5, negative_saturation) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_P5, positive_overflow) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_P5, positive_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_P5, negative_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
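
// A minimal sketch (not part of the original suite) of how a finite-range
// block could additionally be cross-checked against the C library, using
// double-precision std::exp as the reference (FLT_EPSILON from <cfloat>):
//
//   for (uint32_t i = 0; i < kBlockSize; i++) {
//     const double reference = std::exp(static_cast<double>(inputs[i]));
//     if (std::isfinite(reference)) {
//       EXPECT_NEAR(static_cast<double>(outputs[i]), reference,
//                   2.0 * reference * FLT_EPSILON);  // a few-ULP tolerance
//     }
//   }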