blob: 3f198d886ec1b548b12fc67364da20440fbf10d5 [file] [log] [blame]
Marat Dukhancfb31342019-12-05 10:42:57 -08001// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#pragma once
7
#include <string.h>

#include <xnnpack/common.h>
9
10
Marat Dukhanf124e882020-08-09 19:48:10 -070011#if defined(__SSE2__)
12#include <emmintrin.h>
13
14// GCC any, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16
15#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)) || \
16 (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
17 (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \
18 (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
19 (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1600))
20
21static XNN_INTRINSIC
22__m128i _mm_loadu_si32(const void* address) {
23 return _mm_cvtsi32_si128(*((const int*) address));
24}
25
26static XNN_INTRINSIC
27void _mm_storeu_si32(const void* address, __m128i v) {
28 *((int*) address) = _mm_cvtsi128_si32(v);
29}
30#endif // GCC any, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16
31#endif // SSE2
32
Marat Dukhancfb31342019-12-05 10:42:57 -080033#ifdef __AVX512F__
34#include <immintrin.h>
35
Marat Dukhan69a6a762020-06-17 23:40:16 -070036// GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-18
Ashkan Aliabadi9520fc92019-12-18 14:02:35 -080037#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
Marat Dukhan69a6a762020-06-17 23:40:16 -070038 (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
39 (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \
Marat Dukhanb1878352020-03-17 16:54:11 -070040 (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
Ashkan Aliabadi9520fc92019-12-18 14:02:35 -080041 (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
42
Marat Dukhan8aaf1862020-04-28 10:26:13 -070043static XNN_INTRINSIC
44__mmask16 _cvtu32_mask16(unsigned int mask) {
Marat Dukhancfb31342019-12-05 10:42:57 -080045 return (__mmask16) mask;
46}
Marat Dukhan65a01392019-12-05 11:23:47 -080047
Marat Dukhanbb00b1d2020-08-10 11:37:23 -070048static XNN_INTRINSIC
49__mmask64 _cvtu64_mask64(unsigned long long mask) {
50 return (__mmask64) mask;
51}
52
Marat Dukhane5b5dea2020-10-22 01:19:19 -070053static XNN_INTRINSIC
54__mmask64 _kshiftli_mask64(__mmask64 a, unsigned int count) {
55 return (__mmask64) ((unsigned long long) a << count);
56}
57
Marat Dukhanb9deb962020-10-22 10:08:59 -070058static XNN_INTRINSIC
59__mmask64 _kshiftri_mask64(__mmask64 a, unsigned int count) {
60 return (__mmask64) ((unsigned long long) a >> count);
61}
62
Marat Dukhanf124e882020-08-09 19:48:10 -070063#endif // GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-18
Ashkan Aliabadi9520fc92019-12-18 14:02:35 -080064
65// GCC pre-7, Clang pre-4, and ICC pre-18
66#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
67 (defined(__clang__) && (__clang_major__ < 4)) || \
68 (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
69
Marat Dukhan8aaf1862020-04-28 10:26:13 -070070static XNN_INTRINSIC
71float _mm512_reduce_add_ps(__m512 v) {
Marat Dukhan65a01392019-12-05 11:23:47 -080072#if __AVX512DQ__
73 const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
74#else
75 const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)));
Marat Dukhancfb31342019-12-05 10:42:57 -080076#endif
Marat Dukhan65a01392019-12-05 11:23:47 -080077 const __m128 sum4 = _mm_add_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1));
78 const __m128 sum8 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4));
79 const __m128 sum16 = _mm_add_ss(sum8, _mm_movehdup_ps(sum8));
80 return _mm_cvtss_f32(sum16);
81}
82
Marat Dukhan8aaf1862020-04-28 10:26:13 -070083static XNN_INTRINSIC
84float _mm512_reduce_max_ps(__m512 v) {
Marat Dukhan65a01392019-12-05 11:23:47 -080085#if __AVX512DQ__
86 const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
87#else
88 const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)));
89#endif
90 const __m128 sum4 = _mm_max_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1));
91 const __m128 sum8 = _mm_max_ps(sum4, _mm_movehl_ps(sum4, sum4));
92 const __m128 sum16 = _mm_max_ss(sum8, _mm_movehdup_ps(sum8));
93 return _mm_cvtss_f32(sum16);
94}
Ashkan Aliabadi9520fc92019-12-18 14:02:35 -080095
96#endif // GCC pre-7, Clang pre-4, and ICC pre-18
Marat Dukhancfb31342019-12-05 10:42:57 -080097
Marat Dukhan2a18f7e2020-08-12 13:36:26 -070098#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 9)
99static XNN_INTRINSIC
100__m512i _mm512_set_epi8(
101 char e63, char e62, char e61, char e60,
102 char e59, char e58, char e57, char e56,
103 char e55, char e54, char e53, char e52,
104 char e51, char e50, char e49, char e48,
105 char e47, char e46, char e45, char e44,
106 char e43, char e42, char e41, char e40,
107 char e39, char e38, char e37, char e36,
108 char e35, char e34, char e33, char e32,
109 char e31, char e30, char e29, char e28,
110 char e27, char e26, char e25, char e24,
111 char e23, char e22, char e21, char e20,
112 char e19, char e18, char e17, char e16,
113 char e15, char e14, char e13, char e12,
114 char e11, char e10, char e09, char e08,
115 char e07, char e06, char e05, char e04,
116 char e03, char e02, char e01, char e00)
117{
118 return (__m512i) (__v64qi) {
119 e00, e01, e02, e03, e04, e05, e06, e07,
120 e08, e09, e10, e11, e12, e13, e14, e15,
121 e16, e17, e18, e19, e20, e21, e22, e23,
122 e24, e25, e26, e27, e28, e29, e30, e31,
123 e32, e33, e34, e35, e36, e37, e38, e39,
124 e40, e41, e42, e43, e44, e45, e46, e47,
125 e48, e49, e50, e51, e52, e53, e54, e55,
126 e56, e57, e58, e59, e60, e61, e62, e63
127 };
128}
129#endif // GCC pre-9
130
Marat Dukhancfb31342019-12-05 10:42:57 -0800131#endif // __AVX512F__