Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 1 | // Copyright 2019 Google LLC |
| 2 | // |
| 3 | // This source code is licensed under the BSD-style license found in the |
| 4 | // LICENSE file in the root directory of this source tree. |
| 5 | |
| 6 | #pragma once |
| 7 | |
Marat Dukhan | 8aaf186 | 2020-04-28 10:26:13 -0700 | [diff] [blame] | 8 | #include <xnnpack/common.h> |
| 9 | |
| 10 | |
| 11 | #if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) |
| 12 | #include <xmmintrin.h> |
| 13 | |
| 14 | static XNN_INTRINSIC XNN_DISABLE_TSAN |
| 15 | __m128 _mm_loadu_ps_notsan(const float* address) { |
| 16 | return _mm_loadu_ps(address); |
| 17 | } |
| 18 | #endif |
| 19 | |
| 20 | #if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) |
| 21 | #include <emmintrin.h> |
| 22 | |
| 23 | static XNN_INTRINSIC XNN_DISABLE_TSAN |
| 24 | __m128i _mm_loadu_si128_notsan(const __m128i* address) { |
| 25 | return _mm_loadu_si128(address); |
| 26 | } |
| 27 | #endif |
| 28 | |
Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 29 | |
| 30 | #ifdef __AVX512F__ |
| 31 | #include <immintrin.h> |
| 32 | |
Marat Dukhan | b187835 | 2020-03-17 16:54:11 -0700 | [diff] [blame] | 33 | // GCC pre-7, Clang pre-8, Apple Clang pre-11, and ICC pre-18 |
Ashkan Aliabadi | 9520fc9 | 2019-12-18 14:02:35 -0800 | [diff] [blame] | 34 | #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \ |
| 35 | (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \ |
Marat Dukhan | b187835 | 2020-03-17 16:54:11 -0700 | [diff] [blame] | 36 | (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \ |
Ashkan Aliabadi | 9520fc9 | 2019-12-18 14:02:35 -0800 | [diff] [blame] | 37 | (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) |
| 38 | |
Marat Dukhan | 8aaf186 | 2020-04-28 10:26:13 -0700 | [diff] [blame] | 39 | static XNN_INTRINSIC |
| 40 | __mmask16 _cvtu32_mask16(unsigned int mask) { |
Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 41 | return (__mmask16) mask; |
| 42 | } |
Marat Dukhan | 65a0139 | 2019-12-05 11:23:47 -0800 | [diff] [blame] | 43 | |
Ashkan Aliabadi | 9520fc9 | 2019-12-18 14:02:35 -0800 | [diff] [blame] | 44 | #endif // GCC pre-7, Clang pre-8, Apple Clang pre-11, and ICC pre-18 |
| 45 | |
| 46 | // GCC pre-7, Clang pre-4, and ICC pre-18 |
| 47 | #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \ |
| 48 | (defined(__clang__) && (__clang_major__ < 4)) || \ |
| 49 | (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) |
| 50 | |
Marat Dukhan | 8aaf186 | 2020-04-28 10:26:13 -0700 | [diff] [blame] | 51 | static XNN_INTRINSIC |
| 52 | float _mm512_reduce_add_ps(__m512 v) { |
Marat Dukhan | 65a0139 | 2019-12-05 11:23:47 -0800 | [diff] [blame] | 53 | #if __AVX512DQ__ |
| 54 | const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1)); |
| 55 | #else |
| 56 | const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1))); |
Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 57 | #endif |
Marat Dukhan | 65a0139 | 2019-12-05 11:23:47 -0800 | [diff] [blame] | 58 | const __m128 sum4 = _mm_add_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1)); |
| 59 | const __m128 sum8 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4)); |
| 60 | const __m128 sum16 = _mm_add_ss(sum8, _mm_movehdup_ps(sum8)); |
| 61 | return _mm_cvtss_f32(sum16); |
| 62 | } |
| 63 | |
Marat Dukhan | 8aaf186 | 2020-04-28 10:26:13 -0700 | [diff] [blame] | 64 | static XNN_INTRINSIC |
| 65 | float _mm512_reduce_max_ps(__m512 v) { |
Marat Dukhan | 65a0139 | 2019-12-05 11:23:47 -0800 | [diff] [blame] | 66 | #if __AVX512DQ__ |
| 67 | const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1)); |
| 68 | #else |
| 69 | const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1))); |
| 70 | #endif |
| 71 | const __m128 sum4 = _mm_max_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1)); |
| 72 | const __m128 sum8 = _mm_max_ps(sum4, _mm_movehl_ps(sum4, sum4)); |
| 73 | const __m128 sum16 = _mm_max_ss(sum8, _mm_movehdup_ps(sum8)); |
| 74 | return _mm_cvtss_f32(sum16); |
| 75 | } |
Ashkan Aliabadi | 9520fc9 | 2019-12-18 14:02:35 -0800 | [diff] [blame] | 76 | |
| 77 | #endif // GCC pre-7, Clang pre-4, and ICC pre-18 |
Marat Dukhan | cfb3134 | 2019-12-05 10:42:57 -0800 | [diff] [blame] | 78 | |
| 79 | #endif // __AVX512F__ |