blob: 8e78456fd84b97347c22f46de0f94f918a0d885e [file] [log] [blame]
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
9#pragma once
10
11#if defined(__cplusplus) && (__cplusplus >= 201103L)
12 #include <cstdint>
13 #include <cstddef>
14 #include <cassert>
15 #include <cmath>
16#else
17 #include <stdint.h>
18 #include <stddef.h>
19 #include <assert.h>
20 #include <math.h>
21#endif
22
23#include <fp16.h>
24
25#include <xnnpack/common.h>
Marat Dukhan2b9efd82020-06-08 01:09:31 -070026#include <xnnpack/math.h>
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -070027#include <xnnpack/params.h>
28
29
30static inline union xnn_q8_gemm_params xnn_init_scalar_q8_gemm_params(
31 uint8_t input_zero_point,
32 uint8_t kernel_zero_point,
33 float scale,
34 uint8_t output_zero_point,
35 uint8_t output_min,
36 uint8_t output_max)
37{
38 // Compute requantization parameters
39 const uint32_t scale_bits = fp32_to_bits(scale);
40
41 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
42 const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
43 assert(multiplier >= INT32_C(0x40000000));
44 assert(multiplier <= INT32_C(0x7FFFFF80));
45
46 // Shift is in [0, 31] range.
47 const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
48 assert(shift >= 0);
49 assert(shift < 32);
50
51 const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
52 const uint32_t remainder_threshold = remainder_mask >> 1;
53
54 union xnn_q8_gemm_params params;
55 params.scalar.input_zero_point = (int32_t) (uint32_t) input_zero_point;
56 params.scalar.kernel_zero_point = (int32_t) (uint32_t) kernel_zero_point;
57 params.scalar.multiplier = multiplier;
58 params.scalar.remainder_mask = (int32_t) remainder_mask;
59 params.scalar.remainder_threshold = (int32_t) remainder_threshold;
60 params.scalar.shift = (uint32_t) shift;
61 params.scalar.output_min_less_zero_point =
62 (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
63 params.scalar.output_max_less_zero_point =
64 (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
65 params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
66 return params;
67}
68
69static inline union xnn_q8_gemm_params xnn_init_q8_gemm_params(
70 uint8_t input_zero_point,
71 uint8_t kernel_zero_point,
72 float scale,
73 uint8_t output_zero_point,
74 uint8_t output_min,
75 uint8_t output_max)
76{
77 // Compute requantization parameters.
78 const uint32_t scale_bits = fp32_to_bits(scale);
79
80 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
81 const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
82 assert(multiplier >= INT32_C(0x40000000));
83 assert(multiplier <= INT32_C(0x7FFFFF80));
84
85 // Shift is in [0, 31] range.
86 const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
87 assert(shift >= 0);
88 assert(shift < 32);
89
90 union xnn_q8_gemm_params params;
91 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
92 const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
93 const uint32_t remainder_threshold = remainder_mask >> 1;
94 for (uint32_t i = 0; i < 8; i++) {
95 params.sse2.input_zero_point[i] = (int16_t) (uint16_t) input_zero_point;
96 params.sse2.kernel_zero_point[i] = (int16_t) (uint16_t) kernel_zero_point;
97 }
98 params.sse2.multiplier[0] = multiplier;
99 params.sse2.multiplier[1] = multiplier;
100 params.sse2.multiplier[2] = multiplier;
101 params.sse2.multiplier[3] = multiplier;
102 params.sse2.rounding[0] = UINT64_C(0x40000000);
103 params.sse2.rounding[1] = UINT64_C(0x40000000);
104 params.sse2.remainder_mask[0] = (int32_t) remainder_mask;
105 params.sse2.remainder_mask[1] = (int32_t) remainder_mask;
106 params.sse2.remainder_mask[2] = (int32_t) remainder_mask;
107 params.sse2.remainder_mask[3] = (int32_t) remainder_mask;
108 params.sse2.remainder_threshold[0] = (int32_t) remainder_threshold;
109 params.sse2.remainder_threshold[1] = (int32_t) remainder_threshold;
110 params.sse2.remainder_threshold[2] = (int32_t) remainder_threshold;
111 params.sse2.remainder_threshold[3] = (int32_t) remainder_threshold;
112 params.sse2.shift[0] = (uint64_t) (uint32_t) shift;
113 params.sse2.shift[1] = (uint64_t) (uint32_t) shift;
114 for (uint32_t i = 0; i < 8; i++) {
115 params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
116 }
117 for (uint32_t i = 0; i < 16; i++) {
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700118 params.sse2.output_min[i] = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700119 params.sse2.output_max[i] = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700120 }
121 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
122 params.neon.input_zero_point = (int16_t) (uint16_t) input_zero_point;
123 params.neon.kernel_zero_point = (int16_t) (uint16_t) kernel_zero_point;
124 params.neon.multiplier = multiplier;
125 params.neon.right_shift = -shift;
126 params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700127 params.neon.output_min = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700128 params.neon.output_max = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700129 #else
130 const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
131 const uint32_t remainder_threshold = remainder_mask >> 1;
132 params.scalar.input_zero_point = (int32_t) (uint32_t) input_zero_point;
133 params.scalar.kernel_zero_point = (int32_t) (uint32_t) kernel_zero_point;
134 params.scalar.multiplier = multiplier;
135 params.scalar.remainder_mask = (int32_t) remainder_mask;
136 params.scalar.remainder_threshold = (int32_t) remainder_threshold;
137 params.scalar.shift = (uint32_t) shift;
138 params.scalar.output_min_less_zero_point =
139 (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
140 params.scalar.output_max_less_zero_point =
141 (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
142 params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
143 #endif
144 return params;
145}
146
147static inline union xnn_q8_avgpool_params xnn_init_q8_avgpool_params(
148 int32_t bias,
149 float scale,
150 uint8_t output_zero_point,
151 uint8_t output_min,
152 uint8_t output_max)
153{
154 // Compute requantization parameters.
155 assert(scale >= 0x1.0p-32f);
156 assert(scale < 256.0f);
157 const uint32_t scale_bits = fp32_to_bits(scale);
158
159 // Multiplier is in [0x00800000, 0x00FFFFFF] range.
160 const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
161 assert(multiplier >= INT32_C(0x00800000));
162 assert(multiplier <= INT32_C(0x00FFFFFF));
163
164 // Shift is in [16, 55] range.
165 const int32_t shift = 127 + 23 - (scale_bits >> 23);
166 assert(shift >= 16);
167 assert(shift < 64);
168
169 union xnn_q8_avgpool_params params;
170 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
171 const uint32_t right_shift = (uint32_t) shift;
172 const uint64_t rounding = UINT64_C(1) << (right_shift - 1);
173 params.sse2.bias[0] = bias;
174 params.sse2.bias[1] = bias;
175 params.sse2.bias[2] = bias;
176 params.sse2.bias[3] = bias;
177 params.sse2.multiplier[0] = (uint32_t) multiplier;
178 params.sse2.multiplier[1] = (uint32_t) multiplier;
179 params.sse2.multiplier[2] = (uint32_t) multiplier;
180 params.sse2.multiplier[3] = (uint32_t) multiplier;
181 params.sse2.rounding[0] = rounding;
182 params.sse2.rounding[1] = rounding;
183 params.sse2.right_shift[0] = (uint64_t) right_shift;
184 params.sse2.right_shift[1] = (uint64_t) right_shift;
185 for (uint32_t i = 0; i < 8; i++) {
186 params.sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
187 }
188 for (uint32_t i = 0; i < 16; i++) {
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700189 params.sse2.output_min[i] = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700190 params.sse2.output_max[i] = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700191 }
192 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
193 params.neon.bias = bias;
194 params.neon.multiplier = multiplier;
195 params.neon.left_shift = (int64_t) -shift;
196 params.neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700197 params.neon.output_min = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700198 params.neon.output_max = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700199 #else
200 const uint32_t right_shift = (uint32_t) shift;
201 const int64_t rounding = INT64_C(1) << (right_shift - 1);
202 params.scalar.bias = bias;
203 params.scalar.multiplier = multiplier;
204 params.scalar.rounding = rounding;
205 params.scalar.right_shift = right_shift;
206 params.scalar.output_min_less_zero_point =
207 (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
208 params.scalar.output_max_less_zero_point =
209 (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
210 params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
211 #endif
212 return params;
213}
214
215static inline union xnn_q8_avgpool_params xnn_init_scalar_q8_avgpool_params(
216 int32_t bias,
217 float scale,
218 uint8_t output_zero_point,
219 uint8_t output_min,
220 uint8_t output_max)
221{
222 // Compute requantization parameters.
223 assert(scale >= 0x1.0p-32f);
224 assert(scale < 256.0f);
225 const uint32_t scale_bits = fp32_to_bits(scale);
226
227 // Multiplier is in [0x00800000, 0x00FFFFFF] range.
228 const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
229 assert(multiplier >= INT32_C(0x00800000));
230 assert(multiplier <= INT32_C(0x00FFFFFF));
231
232 // Shift is in [16, 55] range.
233 const int32_t shift = 127 + 23 - (scale_bits >> 23);
234 assert(shift >= 16);
235 assert(shift < 64);
236
237 union xnn_q8_avgpool_params params;
238 const uint32_t right_shift = (uint32_t) shift;
239 const int64_t rounding = INT64_C(1) << (right_shift - 1);
240 params.scalar.bias = bias;
241 params.scalar.rounding = rounding;
242 params.scalar.multiplier = multiplier;
243 params.scalar.right_shift = right_shift;
244 params.scalar.output_min_less_zero_point =
245 (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
246 params.scalar.output_max_less_zero_point =
247 (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
248 params.scalar.output_zero_point = (int32_t) (uint32_t) output_zero_point;
249 return params;
250}
251
Marat Dukhan8452ff52020-04-08 20:44:58 -0700252static inline void xnn_update_f32_scaleminmax_params(
253 union xnn_f32_scaleminmax_params* params,
254 float scale)
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700255{
256 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
257 for (uint32_t i = 0; i < 4; i++) {
Marat Dukhan8452ff52020-04-08 20:44:58 -0700258 params->sse2.scale[i] = scale;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700259 }
260 #else
Marat Dukhan8452ff52020-04-08 20:44:58 -0700261 params->scalar.scale = scale;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700262 #endif
263}
264
Marat Dukhan8452ff52020-04-08 20:44:58 -0700265static inline union xnn_f32_scaleminmax_params xnn_init_f32_scaleminmax_params(
266 float scale,
267 float min,
268 float max)
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700269{
Marat Dukhan8452ff52020-04-08 20:44:58 -0700270 union xnn_f32_scaleminmax_params params;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700271 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
272 for (uint32_t i = 0; i < 4; i++) {
Marat Dukhan8452ff52020-04-08 20:44:58 -0700273 params.sse2.scale[i] = scale;
274 params.sse2.min[i] = min;
275 params.sse2.max[i] = max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700276 }
277 #else
Marat Dukhan8452ff52020-04-08 20:44:58 -0700278 params.scalar.scale = scale;
279 params.scalar.min = min;
280 params.scalar.max = max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700281 #endif
282 return params;
283}
284
285static inline union xnn_f32_gavgpool_params xnn_init_f32_gavgpool_params(
286 float multiplier,
287 float output_min,
288 float output_max,
289 uint32_t width)
290{
291 union xnn_f32_gavgpool_params params;
292 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
293 for (uint32_t i = 0; i < 4; i++) {
294 params.sse.multiplier[i] = multiplier;
295 params.sse.output_min[i] = output_min;
296 params.sse.output_max[i] = output_max;
297 }
298
299 const uint32_t w = (width - 1) & 3;
300 params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
301 params.sse.mask[1] = -(uint32_t) (w >= 1);
302 params.sse.mask[2] = -(uint32_t) (w >= 2);
303 params.sse.mask[3] = -(uint32_t) (w >= 3);
304 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
305 params.neon.multiplier = multiplier;
306 params.neon.output_min = output_min;
307 params.neon.output_max = output_max;
308
309 const uint32_t w = (width - 1) & 3;
310 params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
311 params.neon.mask[1] = -(uint32_t) (w >= 1);
312 params.neon.mask[2] = -(uint32_t) (w >= 2);
313 params.neon.mask[3] = -(uint32_t) (w >= 3);
314 #else
315 params.scalar.multiplier = multiplier;
316 params.scalar.output_min = output_min;
317 params.scalar.output_max = output_max;
Erich Elsen6f278b52020-06-10 16:13:11 -0700318
319 const uint32_t w = (width - 1) & 3;
320 params.scalar.mask[0] = UINT32_C(0xFFFFFFFF);
321 params.scalar.mask[1] = -(int32_t) (w >= 1);
322 params.scalar.mask[2] = -(int32_t) (w >= 2);
323 params.scalar.mask[3] = -(int32_t) (w >= 3);
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700324 #endif
325 return params;
326}
327
328static inline void xnn_update_f32_gavgpool_params(
329 union xnn_f32_gavgpool_params* params,
330 float multiplier,
331 uint32_t width)
332{
333 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
334 for (uint32_t i = 0; i < 4; i++) {
335 params->sse.multiplier[i] = multiplier;
336 }
337
338 const uint32_t w = (width - 1) & 3;
339 params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
340 params->sse.mask[1] = -(uint32_t) (w >= 1);
341 params->sse.mask[2] = -(uint32_t) (w >= 2);
342 params->sse.mask[3] = -(uint32_t) (w >= 3);
343 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
344 params->neon.multiplier = multiplier;
345
346 const uint32_t w = (width - 1) & 3;
347 params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
348 params->neon.mask[1] = -(uint32_t) (w >= 1);
349 params->neon.mask[2] = -(uint32_t) (w >= 2);
350 params->neon.mask[3] = -(uint32_t) (w >= 3);
351 #else
352 params->scalar.multiplier = multiplier;
Erich Elsen6f278b52020-06-10 16:13:11 -0700353
354 const uint32_t w = (width - 1) & 3;
355 params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
356 params->scalar.mask[1] = (int32_t) (w >= 1);
357 params->scalar.mask[2] = (int32_t) (w >= 2);
358 params->scalar.mask[3] = (int32_t) (w >= 3);
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700359 #endif
360}
361
Marat Dukhan8452ff52020-04-08 20:44:58 -0700362static inline union xnn_f32_scaleminmax_params xnn_init_scalar_f32_scaleminmax_params(
363 float scale,
364 float min,
365 float max)
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700366{
Marat Dukhan8452ff52020-04-08 20:44:58 -0700367 union xnn_f32_scaleminmax_params params;
368 params.scalar.scale = scale;
369 params.scalar.min = min;
370 params.scalar.max = max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700371 return params;
372}
373
374static inline union xnn_f32_gavgpool_params xnn_init_scalar_f32_gavgpool_params(
375 float multiplier,
376 float output_min,
377 float output_max,
378 uint32_t width)
379{
380 union xnn_f32_gavgpool_params params;
381 params.scalar.multiplier = multiplier;
382 params.scalar.output_min = output_min;
383 params.scalar.output_max = output_max;
Erich Elsen6f278b52020-06-10 16:13:11 -0700384
385 const uint32_t w = (width - 1) & 3;
386 params.scalar.mask[0] = UINT32_C(0xFFFFFFFF);
387 params.scalar.mask[1] = -(int32_t) (w >= 1);
388 params.scalar.mask[2] = -(int32_t) (w >= 2);
389 params.scalar.mask[3] = -(int32_t) (w >= 3);
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700390 return params;
391}
392
Frank Barchard99003a82020-05-04 10:39:38 -0700393static inline struct xnn_f16_scaleminmax_params xnn_init_f16_scaleminmax_params(
394 uint16_t scale,
395 uint16_t min,
396 uint16_t max)
397{
398 struct xnn_f16_scaleminmax_params params;
399 params.scale = scale;
400 params.min = min;
401 params.max = max;
402 return params;
403}
404
Frank Barchardd793f6c2020-05-08 13:37:43 -0700405static inline struct xnn_f16_minmax_params xnn_init_f16_minmax_params(
406 uint16_t min,
407 uint16_t max)
408{
409 struct xnn_f16_minmax_params params;
410 params.min = min;
411 params.max = max;
412 return params;
413}
414
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700415static inline union xnn_f32_minmax_params xnn_init_f32_minmax_params(
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700416 float output_min,
417 float output_max)
418{
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700419 union xnn_f32_minmax_params params;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700420 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
421 for (uint32_t i = 0; i < 4; i++) {
422 params.sse.min[i] = output_min;
423 params.sse.max[i] = output_max;
424 }
425 #else
426 params.scalar.min = output_min;
427 params.scalar.max = output_max;
428 #endif
429 return params;
430}
431
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700432static inline union xnn_f32_minmax_params xnn_init_scalar_f32_minmax_params(
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700433 float output_min,
434 float output_max)
435{
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700436 union xnn_f32_minmax_params params;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700437 params.scalar.min = output_min;
438 params.scalar.max = output_max;
439 return params;
440}
441
Frank Barchardb1966592020-05-12 13:47:06 -0700442static inline struct xnn_f16_hswish_params xnn_init_f16_hswish_params(void)
443{
444 struct xnn_f16_hswish_params params;
445 params.sixth = fp16_ieee_from_fp32_value(0x1.555556p-3f);
446 params.half = fp16_ieee_from_fp32_value(0.5f);
447 params.one = fp16_ieee_from_fp32_value(1.0f);
448 return params;
449}
450
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700451static inline union xnn_f32_hswish_params xnn_init_f32_hswish_params(void)
452{
453 union xnn_f32_hswish_params params;
454 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
455 for (uint32_t i = 0; i < 4; i++) {
456 params.sse.sixth[i] = 0x1.555556p-3f;
457 params.sse.half[i] = 0.5f;
458 params.sse.one[i] = 1.0f;
459 }
460 #else
461 params.scalar.sixth = 0x1.555556p-3f;
462 params.scalar.half = 0.5f;
463 params.scalar.one = 1.0f;
464 #endif
465 return params;
466}
467
468static inline union xnn_f32_hswish_params xnn_init_scalar_f32_hswish_params(void)
469{
470 union xnn_f32_hswish_params params;
471 params.scalar.sixth = 0x1.555556p-3f;
472 params.scalar.half = 0.5f;
473 params.scalar.one = 1.0f;
474 return params;
475}
476
Marat Dukhan2b9efd82020-06-08 01:09:31 -0700477static inline union xnn_f32_abs_params xnn_init_f32_abs_params(void)
478{
479 union xnn_f32_abs_params params = { 0 };
480 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
481 for (uint32_t i = 0; i < 4; i++) {
482 params.sse.nonsign_mask[i] = math_nonsign_mask_f32();
483 }
484 #endif
485 return params;
486}
487
488static inline union xnn_f32_abs_params xnn_init_scalar_f32_abs_params(void)
489{
490 union xnn_f32_abs_params params = { 0 };
491 return params;
492}
493
494static inline union xnn_f32_neg_params xnn_init_f32_neg_params(void)
495{
496 union xnn_f32_neg_params params = { 0 };
497 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
498 for (uint32_t i = 0; i < 4; i++) {
499 params.sse.sign_mask[i] = -0.0f;
500 }
501 #endif
502 return params;
503}
504
505static inline union xnn_f32_neg_params xnn_init_scalar_f32_neg_params(void)
506{
507 union xnn_f32_neg_params params = { 0 };
508 return params;
509}
510
Marat Dukhaneecf8fd2020-06-09 08:59:37 -0700511static inline union xnn_f32_rnd_params xnn_init_f32_rnd_params(void)
512{
513 union xnn_f32_rnd_params params = { 0 };
514 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
515 for (uint32_t i = 0; i < 4; i++) {
516 params.sse2.sign_mask[i] = -0.0f;
517 }
518 for (uint32_t i = 0; i < 4; i++) {
519 params.sse2.one[i] = 1.0f;
520 }
521 #endif
522 return params;
523}
524
525static inline union xnn_f32_rnd_params xnn_init_scalar_f32_rnd_params(void)
526{
527 union xnn_f32_rnd_params params = { 0 };
528 return params;
529}
530
Marat Dukhan8cc7efe2020-06-10 16:24:27 -0700531static inline union xnn_f32_lrelu_params xnn_init_f32_lrelu_params(float slope)
532{
533 union xnn_f32_lrelu_params params;
534 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
535 for (uint32_t i = 0; i < 4; i++) {
536 params.sse.slope[i] = slope;
537 }
538 #else
539 params.scalar.slope = slope;
540 #endif
541 return params;
542}
543
544static inline union xnn_f32_lrelu_params xnn_init_scalar_f32_lrelu_params(float slope)
545{
546 union xnn_f32_lrelu_params params;
547 params.scalar.slope = slope;
548 return params;
549}
550
Marat Dukhan1f29b802020-05-15 23:46:39 -0700551static inline union xnn_f32_chw_params xnn_init_f32_chw_params(
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700552 uint32_t width,
553 float output_min,
554 float output_max)
555{
Marat Dukhan1f29b802020-05-15 23:46:39 -0700556 union xnn_f32_chw_params params;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700557 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
558 for (uint32_t i = 0; i < 4; i++) {
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700559 params.sse.min[i] = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700560 params.sse.max[i] = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700561 }
562
563 const uint32_t w4 = (width - 1) & 3;
564 params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
565 params.sse.mask[1] = -(uint32_t) (w4 >= 1);
566 params.sse.mask[2] = -(uint32_t) (w4 >= 2);
567 params.sse.mask[3] = -(uint32_t) (w4 >= 3);
568
569 const uint32_t w8 = (width - 1) & 7;
570 params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
571 params.sse.mask_even[1] = -(uint32_t) (w8 >= 2);
572 params.sse.mask_even[2] = -(uint32_t) (w8 >= 4);
573 params.sse.mask_even[3] = -(uint32_t) (w8 >= 6);
574 params.sse.mask_odd[0] = -(uint32_t) (w8 >= 1);
575 params.sse.mask_odd[1] = -(uint32_t) (w8 >= 3);
576 params.sse.mask_odd[2] = -(uint32_t) (w8 >= 5);
577 params.sse.mask_odd[3] = -(uint32_t) (w8 >= 7);
578 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700579 params.neon.min = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700580 params.neon.max = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700581
582 const uint32_t w4 = (width - 1) & 3;
583 params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
584 params.neon.mask[1] = -(uint32_t) (w4 >= 1);
585 params.neon.mask[2] = -(uint32_t) (w4 >= 2);
586 params.neon.mask[3] = -(uint32_t) (w4 >= 3);
587
588 const uint32_t w8 = (width - 1) & 7;
589 params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
590 params.neon.mask_even[1] = -(uint32_t) (w8 >= 2);
591 params.neon.mask_even[2] = -(uint32_t) (w8 >= 4);
592 params.neon.mask_even[3] = -(uint32_t) (w8 >= 6);
593 params.neon.mask_odd[0] = -(uint32_t) (w8 >= 1);
594 params.neon.mask_odd[1] = -(uint32_t) (w8 >= 3);
595 params.neon.mask_odd[2] = -(uint32_t) (w8 >= 5);
596 params.neon.mask_odd[3] = -(uint32_t) (w8 >= 7);
597 #else
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700598 params.scalar.min = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700599 params.scalar.max = output_max;
Erich Elsene6214af2020-06-10 22:17:22 -0700600
601 const uint32_t w4 = (width - 1) & 3;
602 params.scalar.mask[0] = INT32_C(0xFFFFFFFF);
603 params.scalar.mask[1] = -(int32_t) (w4 >= 1);
604 params.scalar.mask[2] = -(int32_t) (w4 >= 2);
605 params.scalar.mask[3] = -(int32_t) (w4 >= 3);
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700606
607 const uint32_t w8 = (width - 1) & 7;
608 params.scalar.mask_even[0] = INT32_C(0xFFFFFFFF);
609 params.scalar.mask_even[1] = -(int32_t) (w8 >= 2);
610 params.scalar.mask_even[2] = -(int32_t) (w8 >= 4);
611 params.scalar.mask_even[3] = -(int32_t) (w8 >= 6);
612 params.scalar.mask_odd[0] = -(int32_t) (w8 >= 1);
613 params.scalar.mask_odd[1] = -(int32_t) (w8 >= 3);
614 params.scalar.mask_odd[2] = -(int32_t) (w8 >= 5);
615 params.scalar.mask_odd[3] = -(int32_t) (w8 >= 7);
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700616 #endif
617 return params;
618}
619
Marat Dukhan1f29b802020-05-15 23:46:39 -0700620static inline void xnn_update_f32_chw_params(
621 union xnn_f32_chw_params* params,
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700622 uint32_t width)
623{
624 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
625 const uint32_t w4 = (width - 1) & 3;
626 params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
627 params->sse.mask[1] = -(uint32_t) (w4 >= 1);
628 params->sse.mask[2] = -(uint32_t) (w4 >= 2);
629 params->sse.mask[3] = -(uint32_t) (w4 >= 3);
630
631 const uint32_t w8 = (width - 1) & 7;
632 params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
633 params->sse.mask_even[1] = -(uint32_t) (w8 >= 2);
634 params->sse.mask_even[2] = -(uint32_t) (w8 >= 4);
635 params->sse.mask_even[3] = -(uint32_t) (w8 >= 6);
636 params->sse.mask_odd[0] = -(uint32_t) (w8 >= 1);
637 params->sse.mask_odd[1] = -(uint32_t) (w8 >= 3);
638 params->sse.mask_odd[2] = -(uint32_t) (w8 >= 5);
639 params->sse.mask_odd[3] = -(uint32_t) (w8 >= 7);
640 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
641 const uint32_t w4 = (width - 1) & 3;
642 params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
643 params->neon.mask[1] = -(uint32_t) (w4 >= 1);
644 params->neon.mask[2] = -(uint32_t) (w4 >= 2);
645 params->neon.mask[3] = -(uint32_t) (w4 >= 3);
646
647 const uint32_t w8 = (width - 1) & 7;
648 params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
649 params->neon.mask_even[1] = -(uint32_t) (w8 >= 2);
650 params->neon.mask_even[2] = -(uint32_t) (w8 >= 4);
651 params->neon.mask_even[3] = -(uint32_t) (w8 >= 6);
652 params->neon.mask_odd[0] = -(uint32_t) (w8 >= 1);
653 params->neon.mask_odd[1] = -(uint32_t) (w8 >= 3);
654 params->neon.mask_odd[2] = -(uint32_t) (w8 >= 5);
655 params->neon.mask_odd[3] = -(uint32_t) (w8 >= 7);
656 #endif
657}
658
Marat Dukhan1f29b802020-05-15 23:46:39 -0700659static inline union xnn_f32_chw_params xnn_init_scalar_f32_chw_params(
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700660 uint32_t width,
661 float output_min,
662 float output_max)
663{
Marat Dukhan1f29b802020-05-15 23:46:39 -0700664 union xnn_f32_chw_params params;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700665 params.scalar.min = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700666 params.scalar.max = output_max;
Erich Elsene6214af2020-06-10 22:17:22 -0700667
668 const uint32_t w4 = (width - 1) & 3;
669 params.scalar.mask[0] = INT32_C(0xFFFFFFFF);
670 params.scalar.mask[1] = -(int32_t) (w4 >= 1);
671 params.scalar.mask[2] = -(int32_t) (w4 >= 2);
672 params.scalar.mask[3] = -(int32_t) (w4 >= 3);
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700673
674 const uint32_t w8 = (width - 1) & 7;
675 params.scalar.mask_even[0] = INT32_C(0xFFFFFFFF);
676 params.scalar.mask_even[1] = -(int32_t) (w8 >= 2);
677 params.scalar.mask_even[2] = -(int32_t) (w8 >= 4);
678 params.scalar.mask_even[3] = -(int32_t) (w8 >= 6);
679 params.scalar.mask_odd[0] = -(int32_t) (w8 >= 1);
680 params.scalar.mask_odd[1] = -(int32_t) (w8 >= 3);
681 params.scalar.mask_odd[2] = -(int32_t) (w8 >= 5);
682 params.scalar.mask_odd[3] = -(int32_t) (w8 >= 7);
683
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700684 return params;
685}
686
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700687static inline union xnn_u8_minmax_params xnn_init_u8_minmax_params(
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700688 uint8_t output_min,
689 uint8_t output_max)
690{
691 assert(output_min < output_max);
692
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700693 union xnn_u8_minmax_params params;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700694 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
695 for (uint32_t i = 0; i < 16; i++) {
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700696 params.sse2.min[i] = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700697 params.sse2.max[i] = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700698 }
699 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700700 params.neon.min = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700701 params.neon.max = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700702 #else
703 params.scalar.min = (int32_t) (uint32_t) output_min;
704 params.scalar.max = (int32_t) (uint32_t) output_max;
705 #endif
706 return params;
707}
708
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700709static inline union xnn_u8_minmax_params xnn_init_scalar_u8_minmax_params(
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700710 uint8_t output_min,
711 uint8_t output_max)
712{
713 assert(output_min < output_max);
714
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700715 union xnn_u8_minmax_params params;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700716 params.scalar.min = (int32_t) (uint32_t) output_min;
717 params.scalar.max = (int32_t) (uint32_t) output_max;
718 return params;
719}
720
721static inline union xnn_q8_add_params xnn_init_q8_add_params(
722 uint8_t a_zero_point,
723 uint8_t b_zero_point,
724 uint8_t output_zero_point,
725 float a_output_scale,
726 float b_output_scale,
727 uint8_t output_min,
728 uint8_t output_max)
729{
730 assert(a_output_scale >= 0x1.0p-14f);
731 assert(b_output_scale >= 0x1.0p-14f);
732 assert(a_output_scale < 0x1.0p+8f);
733 assert(b_output_scale < 0x1.0p+8f);
734
735 // Compute requantization parameters.
736 const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
737 assert(max_output_scale >= 0x1.0p-14f);
738 assert(max_output_scale < 0x1.0p+8f);
739 const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
740 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
741 // Shift is in [13, 31] range.
742 const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
743 assert(shift < 32);
744 assert(shift >= 13);
745
746 const float scale_multiplier = fp32_from_bits((uint32_t) (21 - max_scale_exponent + 127) << 23);
747
748 // Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range.
Marat Dukhanef3e7dc2020-04-13 01:19:56 -0700749 const uint32_t a_multiplier = (uint32_t) (int32_t) lrintf(a_output_scale * scale_multiplier);
750 const uint32_t b_multiplier = (uint32_t) (int32_t) lrintf(b_output_scale * scale_multiplier);
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700751 assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
752 assert(a_multiplier < UINT32_C(0x00400000));
753 assert(b_multiplier < UINT32_C(0x00400000));
754
755 union xnn_q8_add_params params;
756 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
757 const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
758 const uint32_t remainder_threshold = remainder_mask >> 1;
759 const int32_t zero_point_product =
760 (int32_t) -(a_multiplier * (uint32_t) a_zero_point + b_multiplier * (uint32_t) b_zero_point);
761 for (uint32_t i = 0; i < 4; i++) {
762 params.sse2.zero_point_product[i] = zero_point_product;
763 }
764 for (uint32_t i = 0; i < 8; i++) {
765 params.sse2.y_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
766 }
767 for (uint32_t i = 0; i < 8; i++) {
768 params.sse2.a_multiplier_lo[i] = (uint16_t) (uint32_t) a_multiplier;
769 params.sse2.a_multiplier_hi[i] = (uint16_t) ((uint32_t) a_multiplier >> 16);
770 params.sse2.b_multiplier_lo[i] = (uint16_t) (uint32_t) b_multiplier;
771 params.sse2.b_multiplier_hi[i] = (uint16_t) ((uint32_t) b_multiplier >> 16);
772 }
773 params.sse2.a_multiplier = a_multiplier;
774 params.sse2.b_multiplier = b_multiplier;
775 for (uint32_t i = 0; i < 4; i++) {
776 params.sse2.remainder_mask[i] = remainder_mask;
777 params.sse2.remainder_threshold[i] = remainder_threshold;
778 }
779 params.sse2.shift = shift;
780 for (uint32_t i = 0; i < 16; i++) {
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700781 params.sse2.y_min[i] = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700782 params.sse2.y_max[i] = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700783 }
784 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
785 params.neon.a_zero_point = a_zero_point;
786 params.neon.b_zero_point = b_zero_point;
787 params.neon.y_zero_point = (int16_t) (uint16_t) output_zero_point;
788 params.neon.a_multiplier = (int32_t) a_multiplier;
789 params.neon.b_multiplier = (int32_t) b_multiplier;
790 params.neon.right_shift = (int32_t) -shift;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700791 params.neon.y_min = output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700792 params.neon.y_max = output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700793 #else
794 const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
795 const uint32_t remainder_threshold = remainder_mask >> 1;
796 params.scalar.zero_point_product =
797 (int32_t) -(a_multiplier * (uint32_t) a_zero_point + b_multiplier * (uint32_t) b_zero_point);
798 params.scalar.a_multiplier = a_multiplier;
799 params.scalar.b_multiplier = b_multiplier;
800 params.scalar.remainder_mask = (int32_t) remainder_mask;
801 params.scalar.remainder_threshold = (int32_t) remainder_threshold;
802 params.scalar.shift = shift;
803 params.scalar.y_zero_point = (int32_t) (uint32_t) output_zero_point;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700804 params.scalar.y_min = (int32_t) (uint32_t) output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700805 params.scalar.y_max = (int32_t) (uint32_t) output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700806 #endif
807 return params;
808}
809
810static inline union xnn_q8_add_params xnn_init_scalar_q8_add_params(
811 uint8_t a_zero_point,
812 uint8_t b_zero_point,
813 uint8_t output_zero_point,
814 float a_output_scale,
815 float b_output_scale,
816 uint8_t output_min,
817 uint8_t output_max)
818{
819 assert(a_output_scale >= 0x1.0p-10f);
820 assert(b_output_scale >= 0x1.0p-10f);
821 assert(a_output_scale < 0x1.0p+8f);
822 assert(b_output_scale < 0x1.0p+8f);
823
824 // Compute requantization parameters.
825 const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
826 assert(max_output_scale >= 0x1.0p-10f);
827 assert(max_output_scale < 0x1.0p+8f);
828 const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
829 const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
830 // Shift is in [13, 31] range.
831 const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
832 assert(shift < 32);
833 assert(shift >= 13);
834
835 // Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range.
Marat Dukhanef3e7dc2020-04-13 01:19:56 -0700836 const uint32_t a_multiplier = (uint32_t) (int32_t) lrintf(fp32_from_bits(fp32_to_bits(a_output_scale) + (shift << 23)));
837 const uint32_t b_multiplier = (uint32_t) (int32_t) lrintf(fp32_from_bits(fp32_to_bits(b_output_scale) + (shift << 23)));
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700838 assert((a_multiplier > b_multiplier ? a_multiplier : b_multiplier) >= UINT32_C(0x00200000));
839 assert(a_multiplier < UINT32_C(0x00400000));
840 assert(b_multiplier < UINT32_C(0x00400000));
841
842 union xnn_q8_add_params params;
843 const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
844 const uint32_t remainder_threshold = remainder_mask >> 1;
845 params.scalar.zero_point_product =
846 (int32_t) -(a_multiplier * (uint32_t) a_zero_point + b_multiplier * (uint32_t) b_zero_point);
847 params.scalar.a_multiplier = a_multiplier;
848 params.scalar.b_multiplier = b_multiplier;
849 params.scalar.remainder_mask = (int32_t) remainder_mask;
850 params.scalar.remainder_threshold = (int32_t) remainder_threshold;
851 params.scalar.shift = shift;
852 params.scalar.y_zero_point = (int32_t) (uint32_t) output_zero_point;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700853 params.scalar.y_min = (int32_t) (uint32_t) output_min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700854 params.scalar.y_max = (int32_t) (uint32_t) output_max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700855 return params;
856}
857
858static inline union xnn_q31_requantization_params xnn_init_scalar_requantization_params(
859 float scale,
860 uint8_t zero_point,
861 uint8_t min,
862 uint8_t max)
863{
864 // Compute requantization parameters.
865 assert(scale < 1.0f);
866 assert(scale >= 0x1.0p-32f);
867 const uint32_t scale_bits = fp32_to_bits(scale);
868
869 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
870 const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
871 assert(multiplier >= INT32_C(0x40000000));
872 assert(multiplier <= INT32_C(0x7FFFFF80));
873
874 // Shift is in [0, 31] range.
875 const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
876 assert(shift >= 0);
877 assert(shift < 32);
878
879 union xnn_q31_requantization_params params;
880 const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
881 const uint32_t remainder_threshold = remainder_mask >> 1;
882 params.scalar.multiplier = multiplier;
883 params.scalar.remainder_mask = (int32_t) remainder_mask;
884 params.scalar.remainder_threshold = (int32_t) remainder_threshold;
885 params.scalar.shift = (uint32_t) shift;
886 params.scalar.min_less_zero_point = (int32_t) (uint32_t) min - (int32_t) (uint32_t) zero_point;
887 params.scalar.max_less_zero_point = (int32_t) (uint32_t) max - (int32_t) (uint32_t) zero_point;
888 params.scalar.zero_point = (int32_t) (uint32_t) zero_point;
889 return params;
890}
891
892static inline union xnn_q31_requantization_params xnn_init_requantization_params(
893 float scale,
894 uint8_t zero_point,
895 uint8_t min,
896 uint8_t max)
897{
898 // Compute requantization parameters.
899 const uint32_t scale_bits = fp32_to_bits(scale);
900
901 // Multiplier is in [0x40000000, 0x7FFFFF80] range.
902 const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
903 assert(multiplier >= INT32_C(0x40000000));
904 assert(multiplier <= INT32_C(0x7FFFFF80));
905
906 // Shift is in [0, 31] range.
907 const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
908 assert(shift >= 0);
909 assert(shift < 32);
910
911 union xnn_q31_requantization_params params;
912 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
913 const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
914 const uint32_t remainder_threshold = remainder_mask >> 1;
915 params.sse2.multiplier[0] = multiplier;
916 params.sse2.multiplier[1] = multiplier;
917 params.sse2.multiplier[2] = multiplier;
918 params.sse2.multiplier[3] = multiplier;
919 params.sse2.rounding[0] = UINT64_C(0x40000000);
920 params.sse2.rounding[1] = UINT64_C(0x40000000);
921 params.sse2.remainder_mask[0] = (int32_t) remainder_mask;
922 params.sse2.remainder_mask[1] = (int32_t) remainder_mask;
923 params.sse2.remainder_mask[2] = (int32_t) remainder_mask;
924 params.sse2.remainder_mask[3] = (int32_t) remainder_mask;
925 params.sse2.remainder_threshold[0] = (int32_t) remainder_threshold;
926 params.sse2.remainder_threshold[1] = (int32_t) remainder_threshold;
927 params.sse2.remainder_threshold[2] = (int32_t) remainder_threshold;
928 params.sse2.remainder_threshold[3] = (int32_t) remainder_threshold;
929 params.sse2.shift[0] = (uint64_t) (uint32_t) shift;
930 params.sse2.shift[1] = (uint64_t) (uint32_t) shift;
931 for (uint32_t i = 0; i < 8; i++) {
932 params.sse2.zero_point[i] = (int16_t) (uint16_t) zero_point;
933 }
934 for (uint32_t i = 0; i < 16; i++) {
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700935 params.sse2.min[i] = min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700936 params.sse2.max[i] = max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700937 }
938 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
939 params.neon.multiplier = multiplier;
940 params.neon.right_shift = -shift;
941 params.neon.zero_point = (int16_t) (uint16_t) zero_point;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700942 params.neon.min = min;
Marat Dukhana51cf482020-04-08 16:16:19 -0700943 params.neon.max = max;
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700944 #else
945 const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
946 const uint32_t remainder_threshold = remainder_mask >> 1;
947 params.scalar.multiplier = multiplier;
948 params.scalar.remainder_mask = (int32_t) remainder_mask;
949 params.scalar.remainder_threshold = (int32_t) remainder_threshold;
950 params.scalar.shift = (uint32_t) shift;
951 params.scalar.min_less_zero_point = (int32_t) (uint32_t) min - (int32_t) (uint32_t) zero_point;
952 params.scalar.max_less_zero_point = (int32_t) (uint32_t) max - (int32_t) (uint32_t) zero_point;
953 params.scalar.zero_point = (int32_t) (uint32_t) zero_point;
954 #endif
955 return params;
956}