blob: caa192d306f8372e3f389b19de097c26974f33cf [file] [log] [blame]
Marat Dukhancfb31342019-12-05 10:42:57 -08001// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#pragma once
7
Marat Dukhan8aaf1862020-04-28 10:26:13 -07008#include <xnnpack/common.h>
9
10
11#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1))
12#include <xmmintrin.h>
13
14static XNN_INTRINSIC XNN_DISABLE_TSAN
15__m128 _mm_loadu_ps_notsan(const float* address) {
16 return _mm_loadu_ps(address);
17}
18#endif
19
20#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
21#include <emmintrin.h>
22
23static XNN_INTRINSIC XNN_DISABLE_TSAN
24__m128i _mm_loadu_si128_notsan(const __m128i* address) {
25 return _mm_loadu_si128(address);
26}
27#endif
28
Marat Dukhancfb31342019-12-05 10:42:57 -080029
30#ifdef __AVX512F__
31#include <immintrin.h>
32
Marat Dukhanb1878352020-03-17 16:54:11 -070033// GCC pre-7, Clang pre-8, Apple Clang pre-11, and ICC pre-18
Ashkan Aliabadi9520fc92019-12-18 14:02:35 -080034#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
35 (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
Marat Dukhanb1878352020-03-17 16:54:11 -070036 (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
Ashkan Aliabadi9520fc92019-12-18 14:02:35 -080037 (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
38
Marat Dukhan8aaf1862020-04-28 10:26:13 -070039static XNN_INTRINSIC
40__mmask16 _cvtu32_mask16(unsigned int mask) {
Marat Dukhancfb31342019-12-05 10:42:57 -080041 return (__mmask16) mask;
42}
Marat Dukhan65a01392019-12-05 11:23:47 -080043
#endif  // GCC pre-7, Clang pre-8, Apple Clang pre-11, and ICC pre-18
45
46// GCC pre-7, Clang pre-4, and ICC pre-18
47#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
48 (defined(__clang__) && (__clang_major__ < 4)) || \
49 (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
50
Marat Dukhan8aaf1862020-04-28 10:26:13 -070051static XNN_INTRINSIC
52float _mm512_reduce_add_ps(__m512 v) {
Marat Dukhan65a01392019-12-05 11:23:47 -080053#if __AVX512DQ__
54 const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
55#else
56 const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)));
Marat Dukhancfb31342019-12-05 10:42:57 -080057#endif
Marat Dukhan65a01392019-12-05 11:23:47 -080058 const __m128 sum4 = _mm_add_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1));
59 const __m128 sum8 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4));
60 const __m128 sum16 = _mm_add_ss(sum8, _mm_movehdup_ps(sum8));
61 return _mm_cvtss_f32(sum16);
62}
63
Marat Dukhan8aaf1862020-04-28 10:26:13 -070064static XNN_INTRINSIC
65float _mm512_reduce_max_ps(__m512 v) {
Marat Dukhan65a01392019-12-05 11:23:47 -080066#if __AVX512DQ__
67 const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
68#else
69 const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)));
70#endif
71 const __m128 sum4 = _mm_max_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1));
72 const __m128 sum8 = _mm_max_ps(sum4, _mm_movehl_ps(sum4, sum4));
73 const __m128 sum16 = _mm_max_ss(sum8, _mm_movehdup_ps(sum8));
74 return _mm_cvtss_f32(sum16);
75}
Ashkan Aliabadi9520fc92019-12-18 14:02:35 -080076
77#endif // GCC pre-7, Clang pre-4, and ICC pre-18
Marat Dukhancfb31342019-12-05 10:42:57 -080078
79#endif // __AVX512F__