// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <emmintrin.h>

#include <xnnpack/zip.h>


// Interleaves 16 bytes from each of vx, vy and vz and writes the 48 resulting
// bytes ( x0, y0, z0, x1, y1, z1, ..., x15, y15, z15 ) to o with unaligned
// stores. Lane comments below list bytes from the most significant (lane 15)
// down to the least significant (lane 0).
static inline void zip_x3_store_16__sse2(
    __m128i vx,
    __m128i vy,
    __m128i vz,
    uint8_t* o)
{
  const __m128i vmask0x00FF00FF = _mm_set1_epi16(0x00FF);
  const __m128i vmask0x0000FFFF = _mm_set1_epi32(0x0000FFFF);

  // Stage 1: combine pairs of bytes inside every 16-bit lane.
  // vxeye = ( y14, x14, y12, x12, y10, x10, y8, x8, y6, x6, y4, x4, y2, x2, y0, x0 )
  const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
  // vyozo = ( z15, y15, z13, y13, z11, y11, z9, y9, z7, y7, z5, y5, z3, y3, z1, y1 )
  const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
  // vzexo = ( x15, z14, x13, z12, x11, z10, x9, z8, x7, z6, x5, z4, x3, z2, x1, z0 )
  const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));

  // Stage 2: combine pairs of 16-bit halves inside every 32-bit lane.
  // vxeyezexo = ( x13, z12, y12, x12, x9, z8, y8, x8, x5, z4, y4, x4, x1, z0, y0, x0 )
  const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
  // vyozoxeye = ( y14, x14, z13, y13, y10, x10, z9, y9, y6, x6, z5, y5, y2, x2, z1, y1 )
  const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
  // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10, z7, y7, x7, z6, z3, y3, x3, z2 )
  const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));

  // Stage 3: two rounds of 32-bit lane shuffles put the 4-byte groups in output order.
  // vtemp0 = ( x13, z12, y12, x12, x5, z4, y4, x4, z11, y11, x11, z10, z3, y3, x3, z2 )
  const __m128i vtemp0 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
  // vtemp1 = ( y10, x10, z9, y9, y2, x2, z1, y1, x9, z8, y8, x8, x1, z0, y0, x0 )
  const __m128i vtemp1 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
  // vtemp2 = ( z15, y15, x15, z14, z7, y7, x7, z6, y14, x14, z13, y13, y6, x6, z5, y5 )
  const __m128i vtemp2 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));

  // vxyz0 = ( x5, z4, y4, x4, z3, y3, x3, z2, y2, x2, z1, y1, x1, z0, y0, x0 )
  const __m128i vxyz0 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
  // vxyz1 = ( y10, x10, z9, y9, x9, z8, y8, x8, z7, y7, x7, z6, y6, x6, z5, y5 )
  const __m128i vxyz1 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
  // vxyz2 = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 )
  const __m128i vxyz2 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));

  _mm_storeu_si128((__m128i*) o, vxyz0);
  _mm_storeu_si128((__m128i*) o + 1, vxyz1);
  _mm_storeu_si128((__m128i*) o + 2, vxyz2);
}

// Interleaves three byte streams into one stream of (x, y, z) triples.
//
// n      - number of bytes in each of the three input streams.
// input  - 3 * n bytes: stream x at input[0..n), stream y at input[n..2n),
//          stream z at input[2n..3n).
// output - receives 3 * n bytes laid out as x0, y0, z0, x1, y1, z1, ...
//
// n == 0 is a no-op. For n >= 16 the remainder (n % 16 != 0) is handled by
// re-processing the last 16 elements of each stream, so part of the output is
// written twice with identical values and no access falls outside the
// 3 * n byte input/output ranges.
void xnn_x8_zip_x3_ukernel__sse2(
    size_t n,
    const uint8_t* input,
    uint8_t* output)
{
  if (n == 0) {
    // Nothing to interleave; the scalar loop below would otherwise wrap n around.
    return;
  }

  const uint8_t* x = input;
  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
  uint8_t* o = output;

  if (n >= 16) {
    do {
      // vx = ( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 )
      const __m128i vx = _mm_loadu_si128((const __m128i*) x);
      x += 16;
      // vy = ( y15, y14, y13, y12, y11, y10, y9, y8, y7, y6, y5, y4, y3, y2, y1, y0 )
      const __m128i vy = _mm_loadu_si128((const __m128i*) y);
      y += 16;
      // vz = ( z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0 )
      const __m128i vz = _mm_loadu_si128((const __m128i*) z);
      z += 16;

      zip_x3_store_16__sse2(vx, vy, vz, o);
      o += 48;
      n -= 16;
    } while (n >= 16);
    if (n != 0) {
      // n - 16 wraps around in size_t; added to an address it steps back by
      // (16 - n) bytes so that the final vectors end exactly at the end of
      // each stream, overlapping elements that were already processed.
      const size_t address_increment = n - 16;
      const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment));
      const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment));
      const __m128i vz = _mm_loadu_si128((const __m128i*) ((uintptr_t) z + address_increment));

      // Each input byte maps to three output bytes, hence the factor of 3.
      o = (uint8_t*) ((uintptr_t) o + address_increment * 3);
      zip_x3_store_16__sse2(vx, vy, vz, o);
    }
  } else {
    // Fewer than 16 elements per stream: a full vector would overrun, go scalar.
    do {
      const uint8_t vx = *x++;
      const uint8_t vy = *y++;
      const uint8_t vz = *z++;
      o[0] = vx;
      o[1] = vy;
      o[2] = vz;
      o += 3;
    } while (--n != 0);
  }
}