/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
   is a good match for most SIMD operations.  However, the horizontal
   add/sub intrinsics require the data pairs to be permuted into
   separate registers with vertical even/odd alignment before the
   operation, and the addsub intrinsics require the sign of only the
   even-numbered elements to be flipped (xored with -0.0).
   For larger blocks of code using these intrinsic implementations,
   the compiler should be able to schedule instructions to avoid
   additional latency.

   In the specific case of the monitor and mwait instructions, there is
   no direct equivalent in the PowerISA at this time, so those
   intrinsics are not implemented.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif

#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

/* We need definitions from the SSE2 and SSE header files.  */
#include <emmintrin.h>

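/* _mm_addsub_ps: per the SSE3 specification, subtract the even-indexed
   elements of __Y and add the odd-indexed elements of __Y, i.e.
   { __X[0]-__Y[0], __X[1]+__Y[1], __X[2]-__Y[2], __X[3]+__Y[3] }.
   Negating the even-indexed elements of __Y (xor with -0.0) reduces
   this to a single vector add.  */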
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps (__m128 __X, __m128 __Y)
{
  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
  __v4sf even_neg_Y = vec_xor (__Y, even_n0);
  return (__m128) vec_add (__X, even_neg_Y);
}

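/* _mm_addsub_pd: per the SSE3 specification, compute
   { __X[0]-__Y[0], __X[1]+__Y[1] } using the same sign-flip trick.

   As an illustrative sketch (not part of this header's API), addsub is
   the classic building block for complex multiplication; the variable
   names below are hypothetical:

     __m128d t1 = _mm_loaddup_pd (&x_re);   // { x.re, x.re }
     __m128d y  = _mm_set_pd (y_im, y_re);  // { y.re, y.im }
     __m128d t2 = _mm_mul_pd (t1, y);       // { x.re*y.re, x.re*y.im }
     t1 = _mm_loaddup_pd (&x_im);           // { x.im, x.im }
     y  = _mm_shuffle_pd (y, y, 1);         // { y.im, y.re }
     t2 = _mm_addsub_pd (t2, _mm_mul_pd (t1, y));
     // t2 = { x.re*y.re - x.im*y.im, x.re*y.im + x.im*y.re } = x*y  */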
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd (__m128d __X, __m128d __Y)
{
  const __v2df even_n0 = {-0.0, 0.0};
  __v2df even_neg_Y = vec_xor (__Y, even_n0);
  return (__m128d) vec_add (__X, even_neg_Y);
}

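/* _mm_hadd_ps: per the SSE3 specification, compute the horizontal sums
   { __X[0]+__X[1], __X[2]+__X[3], __Y[0]+__Y[1], __Y[2]+__Y[3] }.
   The two vec_perm masks gather the even- and odd-indexed 32-bit
   elements of the concatenated pair {__X, __Y} into separate vectors,
   so a single vertical vec_add yields all four sums.  */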
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
    0x00, 0x01, 0x02, 0x03,
    0x08, 0x09, 0x0A, 0x0B,
    0x10, 0x11, 0x12, 0x13,
    0x18, 0x19, 0x1A, 0x1B
  };
  __vector unsigned char xform1 = {
    0x04, 0x05, 0x06, 0x07,
    0x0C, 0x0D, 0x0E, 0x0F,
    0x14, 0x15, 0x16, 0x17,
    0x1C, 0x1D, 0x1E, 0x1F
  };
  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

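/* _mm_hsub_ps: per the SSE3 specification, compute the horizontal
   differences { __X[0]-__X[1], __X[2]-__X[3], __Y[0]-__Y[1],
   __Y[2]-__Y[3] }, using the same even/odd permutes as _mm_hadd_ps
   followed by a vertical vec_sub.  */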
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
    0x00, 0x01, 0x02, 0x03,
    0x08, 0x09, 0x0A, 0x0B,
    0x10, 0x11, 0x12, 0x13,
    0x18, 0x19, 0x1A, 0x1B
  };
  __vector unsigned char xform1 = {
    0x04, 0x05, 0x06, 0x07,
    0x0C, 0x0D, 0x0E, 0x0F,
    0x14, 0x15, 0x16, 0x17,
    0x1C, 0x1D, 0x1E, 0x1F
  };
  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

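/* _mm_hadd_pd: per the SSE3 specification, compute
   { __X[0]+__X[1], __Y[0]+__Y[1] }.  vec_mergeh/vec_mergel pair up the
   first and second doubleword elements of __X and __Y so that one
   vec_add produces both sums.  */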
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df) __Y),
                            vec_mergel ((__v2df) __X, (__v2df) __Y));
}

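/* _mm_hsub_pd: per the SSE3 specification, compute
   { __X[0]-__X[1], __Y[0]-__Y[1] } via the same merge-then-subtract
   pattern.  */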
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df) __Y),
                            vec_mergel ((__v2df) __X, (__v2df) __Y));
}

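/* _mm_movehdup_ps: duplicate the odd-indexed elements of __X, i.e.
   { __X[1], __X[1], __X[3], __X[3] }, by merging the odd-numbered
   32-bit elements of the two (identical) inputs with vec_mergeo.  */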
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
  return (__m128) vec_mergeo ((__v4su) __X, (__v4su) __X);
}

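/* _mm_moveldup_ps: duplicate the even-indexed elements of __X, i.e.
   { __X[0], __X[0], __X[2], __X[2] }, via vec_mergee.  */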
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
  return (__m128) vec_mergee ((__v4su) __X, (__v4su) __X);
}

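/* _mm_loaddup_pd: load a double from *__P and splat it to both
   elements: { *__P, *__P }.  */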
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
{
  return (__m128d) vec_splats (*__P);
}

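/* _mm_movedup_pd: duplicate the low element of __X:
   { __X[0], __X[0] }.  */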
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd (__m128d __X)
{
  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
}

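/* _mm_lddqu_si128: load 128 bits of integer data from a potentially
   unaligned address; vec_vsx_ld handles unaligned accesses on VSX.  */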
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128 (__m128i const *__P)
{
  return (__m128i) (vec_vsx_ld (0, (signed int const *) __P));
}

/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_mwait.  */

#else
#include_next <pmmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) */

#endif /* PMMINTRIN_H_ */