blob: 04298c389870c20b111fe63b550a6b2b65800651 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
XNNPACK Teamb455b122019-09-27 18:10:33 -07008
9#include <emmintrin.h>
10
11#include <xnnpack/zip.h>
12
13
// Interleaves 16 consecutive bytes from each of the three streams x, y and z
// and stores the resulting 48 bytes to o in x0, y0, z0, x1, y1, z1, ... order.
// All loads and stores are unaligned. Lane listings in the comments below run
// from the most significant byte (left) to the least significant byte (right).
static inline void xnn_x8_zip_x3_block16__sse2(
    const uint8_t* x,
    const uint8_t* y,
    const uint8_t* z,
    uint8_t* o)
{
  const __m128i vmask0x00FF00FF = _mm_set1_epi16(0x00FF);
  const __m128i vmask0x0000FFFF = _mm_set1_epi32(0x0000FFFF);

  // vx = ( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 )
  const __m128i vx = _mm_loadu_si128((const __m128i*) x);
  // vy = ( y15, y14, y13, y12, y11, y10, y9, y8, y7, y6, y5, y4, y3, y2, y1, y0 )
  const __m128i vy = _mm_loadu_si128((const __m128i*) y);
  // vz = ( z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0 )
  const __m128i vz = _mm_loadu_si128((const __m128i*) z);

  // Step 1: build byte pairs inside every 16-bit lane.
  // vxeye = ( y14, x14, y12, x12, y10, x10, y8, x8, y6, x6, y4, x4, y2, x2, y0, x0 )
  const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
  // vyozo = ( z15, y15, z13, y13, z11, y11, z9, y9, z7, y7, z5, y5, z3, y3, z1, y1 )
  const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
  // vzexo = ( x15, z14, x13, z12, x11, z10, x9, z8, x7, z6, x5, z4, x3, z2, x1, z0 )
  const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));

  // Step 2: combine the pairs into byte quads inside every 32-bit lane.
  // vxeyezexo = ( x13, z12, y12, x12, x9, z8, y8, x8, x5, z4, y4, x4, x1, z0, y0, x0 )
  const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
  // vyozoxeye = ( y14, x14, z13, y13, y10, x10, z9, y9, y6, x6, z5, y5, y2, x2, z1, y1 )
  const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
  // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10, z7, y7, x7, z6, z3, y3, x3, z2 )
  const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));

  // Step 3: two rounds of 32-bit lane shuffles put the quads into output order.
  // vtemp0 = ( x13, z12, y12, x12, x5, z4, y4, x4, z11, y11, x11, z10, z3, y3, x3, z2 )
  const __m128i vtemp0 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
  // vtemp1 = ( y10, x10, z9, y9, y2, x2, z1, y1, x9, z8, y8, x8, x1, z0, y0, x0 )
  const __m128i vtemp1 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
  // vtemp2 = ( z15, y15, x15, z14, z7, y7, x7, z6, y14, x14, z13, y13, y6, x6, z5, y5 )
  const __m128i vtemp2 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));

  // vxyz0 = ( x5, z4, y4, x4, z3, y3, x3, z2, y2, x2, z1, y1, x1, z0, y0, x0 )
  const __m128i vxyz0 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
  // vxyz1 = ( y10, x10, z9, y9, x9, z8, y8, x8, z7, y7, x7, z6, y6, x6, z5, y5 )
  const __m128i vxyz1 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
  // vxyz2 = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 )
  const __m128i vxyz2 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));

  _mm_storeu_si128((__m128i*) o, vxyz0);
  _mm_storeu_si128((__m128i*) o + 1, vxyz1);
  _mm_storeu_si128((__m128i*) o + 2, vxyz2);
}

// Interleaves three byte streams of n elements each into a single stream.
//
// n      - number of elements in each of the three streams. A value of 0 is
//          accepted and leaves output untouched.
// input  - 3 * n bytes: stream x occupies bytes [0, n), stream y occupies
//          [n, 2 * n) and stream z occupies [2 * n, 3 * n).
// output - receives 3 * n bytes laid out as x0, y0, z0, x1, y1, z1, ...
void xnn_x8_zip_x3_ukernel__sse2(
    size_t n,
    const uint8_t* input,
    uint8_t* output)
{
  // Without this guard the scalar do/while below would copy one element and
  // then wrap n around to SIZE_MAX, running far past the end of both buffers.
  if (n == 0) {
    return;
  }

  const uint8_t* x = input;
  const uint8_t* y = x + n;
  const uint8_t* z = y + n;
  uint8_t* o = output;

  if (n >= 16) {
    do {
      xnn_x8_zip_x3_block16__sse2(x, y, z, o);
      x += 16;
      y += 16;
      z += 16;
      o += 48;
      n -= 16;
    } while (n >= 16);
    if (n != 0) {
      // Fewer than 16 elements remain. Step back so that the final block ends
      // exactly at the end of each stream; the elements it covers a second
      // time are rewritten with the same values. At least one full block has
      // already been processed, so the rewound pointers stay inside the
      // buffers.
      const size_t overlap = 16 - n;
      xnn_x8_zip_x3_block16__sse2(x - overlap, y - overlap, z - overlap, o - 3 * overlap);
    }
  } else {
    // Streams shorter than one vector are interleaved one element at a time.
    do {
      const uint8_t vx = *x++;
      const uint8_t vy = *y++;
      const uint8_t vz = *z++;
      o[0] = vx;
      o[1] = vy;
      o[2] = vz;
      o += 3;
    } while (--n != 0);
  }
}
137}