Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 1 | // Copyright 2019 Google LLC |
| 2 | // |
| 3 | // This source code is licensed under the BSD-style license found in the |
| 4 | // LICENSE file in the root directory of this source tree. |
| 5 | |
| 6 | #pragma once |
| 7 | |
#include <string.h>

#include <xnnpack/common.h>
| 9 | |
| 10 | |
Marat Dukhan | f124e88 | 2020-08-09 19:48:10 -0700 | [diff] [blame] | 11 | #if defined(__SSE2__) |
| 12 | #include <emmintrin.h> |
| 13 | |
| 14 | // GCC any, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16 |
| 15 | #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)) || \ |
| 16 | (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \ |
| 17 | (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \ |
| 18 | (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \ |
| 19 | (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1600)) |
| 20 | |
| 21 | static XNN_INTRINSIC |
| 22 | __m128i _mm_loadu_si32(const void* address) { |
| 23 | return _mm_cvtsi32_si128(*((const int*) address)); |
| 24 | } |
| 25 | |
| 26 | static XNN_INTRINSIC |
| 27 | void _mm_storeu_si32(const void* address, __m128i v) { |
| 28 | *((int*) address) = _mm_cvtsi128_si32(v); |
| 29 | } |
| 30 | #endif // GCC any, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16 |
| 31 | #endif // SSE2 |
| 32 | |
Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 33 | #ifdef __AVX512F__ |
| 34 | #include <immintrin.h> |
| 35 | |
Marat Dukhan | 69a6a76 | 2020-06-17 23:40:16 -0700 | [diff] [blame] | 36 | // GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-18 |
Ashkan Aliabadi | 9520fc9 | 2019-12-18 14:02:35 -0800 | [diff] [blame] | 37 | #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \ |
Marat Dukhan | 69a6a76 | 2020-06-17 23:40:16 -0700 | [diff] [blame] | 38 | (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \ |
| 39 | (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \ |
Marat Dukhan | b187835 | 2020-03-17 16:54:11 -0700 | [diff] [blame] | 40 | (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \ |
Ashkan Aliabadi | 9520fc9 | 2019-12-18 14:02:35 -0800 | [diff] [blame] | 41 | (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) |
| 42 | |
Marat Dukhan | 8aaf186 | 2020-04-28 10:26:13 -0700 | [diff] [blame] | 43 | static XNN_INTRINSIC |
| 44 | __mmask16 _cvtu32_mask16(unsigned int mask) { |
Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 45 | return (__mmask16) mask; |
| 46 | } |
Marat Dukhan | 65a0139 | 2019-12-05 11:23:47 -0800 | [diff] [blame] | 47 | |
Marat Dukhan | bb00b1d | 2020-08-10 11:37:23 -0700 | [diff] [blame] | 48 | static XNN_INTRINSIC |
| 49 | __mmask64 _cvtu64_mask64(unsigned long long mask) { |
| 50 | return (__mmask64) mask; |
| 51 | } |
| 52 | |
Marat Dukhan | e5b5dea | 2020-10-22 01:19:19 -0700 | [diff] [blame] | 53 | static XNN_INTRINSIC |
| 54 | __mmask64 _kshiftli_mask64(__mmask64 a, unsigned int count) { |
| 55 | return (__mmask64) ((unsigned long long) a << count); |
| 56 | } |
| 57 | |
Marat Dukhan | b9deb96 | 2020-10-22 10:08:59 -0700 | [diff] [blame] | 58 | static XNN_INTRINSIC |
| 59 | __mmask64 _kshiftri_mask64(__mmask64 a, unsigned int count) { |
| 60 | return (__mmask64) ((unsigned long long) a >> count); |
| 61 | } |
| 62 | |
Marat Dukhan | f124e88 | 2020-08-09 19:48:10 -0700 | [diff] [blame] | 63 | #endif // GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-18 |
Ashkan Aliabadi | 9520fc9 | 2019-12-18 14:02:35 -0800 | [diff] [blame] | 64 | |
| 65 | // GCC pre-7, Clang pre-4, and ICC pre-18 |
| 66 | #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \ |
| 67 | (defined(__clang__) && (__clang_major__ < 4)) || \ |
| 68 | (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) |
| 69 | |
Marat Dukhan | 8aaf186 | 2020-04-28 10:26:13 -0700 | [diff] [blame] | 70 | static XNN_INTRINSIC |
| 71 | float _mm512_reduce_add_ps(__m512 v) { |
Marat Dukhan | 65a0139 | 2019-12-05 11:23:47 -0800 | [diff] [blame] | 72 | #if __AVX512DQ__ |
| 73 | const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1)); |
| 74 | #else |
| 75 | const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1))); |
Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 76 | #endif |
Marat Dukhan | 65a0139 | 2019-12-05 11:23:47 -0800 | [diff] [blame] | 77 | const __m128 sum4 = _mm_add_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1)); |
| 78 | const __m128 sum8 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4)); |
| 79 | const __m128 sum16 = _mm_add_ss(sum8, _mm_movehdup_ps(sum8)); |
| 80 | return _mm_cvtss_f32(sum16); |
| 81 | } |
| 82 | |
Marat Dukhan | 8aaf186 | 2020-04-28 10:26:13 -0700 | [diff] [blame] | 83 | static XNN_INTRINSIC |
| 84 | float _mm512_reduce_max_ps(__m512 v) { |
Marat Dukhan | 65a0139 | 2019-12-05 11:23:47 -0800 | [diff] [blame] | 85 | #if __AVX512DQ__ |
| 86 | const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1)); |
| 87 | #else |
| 88 | const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1))); |
| 89 | #endif |
| 90 | const __m128 sum4 = _mm_max_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1)); |
| 91 | const __m128 sum8 = _mm_max_ps(sum4, _mm_movehl_ps(sum4, sum4)); |
| 92 | const __m128 sum16 = _mm_max_ss(sum8, _mm_movehdup_ps(sum8)); |
| 93 | return _mm_cvtss_f32(sum16); |
| 94 | } |
Ashkan Aliabadi | 9520fc9 | 2019-12-18 14:02:35 -0800 | [diff] [blame] | 95 | |
| 96 | #endif // GCC pre-7, Clang pre-4, and ICC pre-18 |
Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 97 | |
Marat Dukhan | 2a18f7e | 2020-08-12 13:36:26 -0700 | [diff] [blame] | 98 | #if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 9) |
| 99 | static XNN_INTRINSIC |
| 100 | __m512i _mm512_set_epi8( |
| 101 | char e63, char e62, char e61, char e60, |
| 102 | char e59, char e58, char e57, char e56, |
| 103 | char e55, char e54, char e53, char e52, |
| 104 | char e51, char e50, char e49, char e48, |
| 105 | char e47, char e46, char e45, char e44, |
| 106 | char e43, char e42, char e41, char e40, |
| 107 | char e39, char e38, char e37, char e36, |
| 108 | char e35, char e34, char e33, char e32, |
| 109 | char e31, char e30, char e29, char e28, |
| 110 | char e27, char e26, char e25, char e24, |
| 111 | char e23, char e22, char e21, char e20, |
| 112 | char e19, char e18, char e17, char e16, |
| 113 | char e15, char e14, char e13, char e12, |
| 114 | char e11, char e10, char e09, char e08, |
| 115 | char e07, char e06, char e05, char e04, |
| 116 | char e03, char e02, char e01, char e00) |
| 117 | { |
| 118 | return (__m512i) (__v64qi) { |
| 119 | e00, e01, e02, e03, e04, e05, e06, e07, |
| 120 | e08, e09, e10, e11, e12, e13, e14, e15, |
| 121 | e16, e17, e18, e19, e20, e21, e22, e23, |
| 122 | e24, e25, e26, e27, e28, e29, e30, e31, |
| 123 | e32, e33, e34, e35, e36, e37, e38, e39, |
| 124 | e40, e41, e42, e43, e44, e45, e46, e47, |
| 125 | e48, e49, e50, e51, e52, e53, e54, e55, |
| 126 | e56, e57, e58, e59, e60, e61, e62, e63 |
| 127 | }; |
| 128 | } |
| 129 | #endif // GCC pre-9 |
| 130 | |
Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 131 | #endif // __AVX512F__ |