/*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 * All rights reserved.
4 *
5 * Copyright 2019 Google LLC
6 *
7 * This source code is licensed under the BSD-style license found in the
8 * LICENSE file in the root directory of this source tree.
9 */
10
11#include <emmintrin.h>
12
13#include <xnnpack/zip.h>
14
15
// Interleaves three contiguous byte streams into (x, y, z) triples using SSE2.
//
// The input buffer holds three planes laid out back-to-back:
//   input[0..n)    = x[0..n)
//   input[n..2n)   = y[0..n)
//   input[2n..3n)  = z[0..n)
// and the output receives 3*n bytes in the order x0, y0, z0, x1, y1, z1, ...
//
// n is the number of bytes per plane. The vector path requires n >= 16;
// smaller n falls through to a scalar loop.
void xnn_x8_zip_x3_ukernel__sse2(
    size_t n,
    const uint8_t* input,
    uint8_t* output)
{
  // Derive the per-plane read cursors from the packed input layout.
  const uint8_t* x = input;
  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
  uint8_t* o = output;

  if (n >= 16) {
    // Lane masks used to merge even/odd bytes (16-bit) and even/odd 16-bit
    // pairs (32-bit) without SSSE3's pshufb, which is unavailable in SSE2.
    const __m128i vmask0x00FF00FF = _mm_set1_epi16(0x00FF);
    const __m128i vmask0x0000FFFF = _mm_set1_epi32(0x0000FFFF);
    do {
      // Load 16 bytes from each plane.
      /* vx = ( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ) */
      const __m128i vx = _mm_loadu_si128((const __m128i*) x);
      x += 16;
      /* vy = ( y15, y14, y13, y12, y11, y10, y9, y8, y7, y6, y5, y4, y3, y2, y1, y0 ) */
      const __m128i vy = _mm_loadu_si128((const __m128i*) y);
      y += 16;
      /* vz = ( z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0 ) */
      const __m128i vz = _mm_loadu_si128((const __m128i*) z);
      z += 16;

      // Stage 1: pair up bytes from two planes inside each 16-bit lane.
      /* vxeye = ( y14, x14, y12, x12, y10, x10, y8, x8, y6, x6, y4, x4, y2, x2, y0, x0 ) */
      const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
      /* vyozo = ( z15, y15, z13, y13, z11, y11, z9, y9, z7, y7, z5, y5, z3, y3, z1, y1 ) */
      const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
      /* vzexo = ( x15, z14, x13, z12, x11, z10, x9, z8, x7, z6, x5, z4, x3, z2, x1, z0 ) */
      const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));

      // Stage 2: combine 16-bit pairs into 32-bit lanes holding runs of the
      // desired x,y,z,x,... sequence (offset differently in each vector).
      /* vxeyezexo = ( x13, z12, y12, x12, x9, z8, y8, x8, x5, z4, y4, x4, x1, z0, y0, x0 ) */
      const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
      /* vyozoxeye = ( y14, x14, z13, y13, y10, x10, z9, y9, y6, x6, z5, y5, y2, x2, z1, y1 ) */
      const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
      /* vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10, z7, y7, x7, z6, z3, y3, x3, z2 ) */
      const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));

      // Stage 3: shuffle 32-bit lanes (via the float-domain SHUFPS, the only
      // SSE2 two-source 32-bit shuffle) to gather lanes into ascending order.
      /* vtemp0 = ( x13, z12, y12, x12, x5, z4, y4, x4, z11, y11, x11, z10, z3, y3, x3, z2 ) */
      const __m128i vtemp0 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
      /* vtemp1 = ( y10, x10, z9, y9, y2, x2, z1, y1, x9, z8, y8, x8, x1, z0, y0, x0 ) */
      const __m128i vtemp1 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
      /* vtemp2 = ( z15, y15, x15, z14, z7, y7, x7, z6, y14, x14, z13, y13, y6, x6, z5, y5 ) */
      const __m128i vtemp2 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));

      // Stage 4: final shuffles produce three vectors covering output bytes
      // 0-15, 16-31, and 32-47 of the interleaved triple stream.
      /* vxyz0 = ( x5, z4, y4, x4, z3, y3, x3, z2, y2, x2, z1, y1, x1, z0, y0, x0 ) */
      const __m128i vxyz0 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
      /* vxyz1 = ( y10, x10, z9, y9, x9, z8, y8, x8, z7, y7, x7, z6, y6, x6, z5, y5 ) */
      const __m128i vxyz1 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
      /* vxyz2 = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 ) */
      const __m128i vxyz2 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));

      // 16 input bytes per plane -> 48 interleaved output bytes.
      _mm_storeu_si128((__m128i*) o, vxyz0);
      _mm_storeu_si128((__m128i*) o + 1, vxyz1);
      _mm_storeu_si128((__m128i*) o + 2, vxyz2);
      o += 48;
      n -= 16;
    } while (n >= 16);
    if (n != 0) {
      // Tail: re-process the final 16 bytes of each plane with loads/stores
      // shifted back by (16 - n). The overlapping region was already written
      // by the main loop and is rewritten with identical values, so the only
      // new bytes are the last n elements — no out-of-bounds access occurs.
      const size_t address_increment = n - 16;
      /* vx = ( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ) */
      const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment));
      /* vy = ( y15, y14, y13, y12, y11, y10, y9, y8, y7, y6, y5, y4, y3, y2, y1, y0 ) */
      const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment));
      /* vz = ( z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0 ) */
      const __m128i vz = _mm_loadu_si128((const __m128i*) ((uintptr_t) z + address_increment));

      // Same four-stage interleave as the main loop above.
      /* vxeye = ( y14, x14, y12, x12, y10, x10, y8, x8, y6, x6, y4, x4, y2, x2, y0, x0 ) */
      const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
      /* vyozo = ( z15, y15, z13, y13, z11, y11, z9, y9, z7, y7, z5, y5, z3, y3, z1, y1 ) */
      const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
      /* vzexo = ( x15, z14, x13, z12, x11, z10, x9, z8, x7, z6, x5, z4, x3, z2, x1, z0 ) */
      const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));

      /* vxeyezexo = ( x13, z12, y12, x12, x9, z8, y8, x8, x5, z4, y4, x4, x1, z0, y0, x0 ) */
      const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
      /* vyozoxeye = ( y14, x14, z13, y13, y10, x10, z9, y9, y6, x6, z5, y5, y2, x2, z1, y1 ) */
      const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
      /* vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10, z7, y7, x7, z6, z3, y3, x3, z2 ) */
      const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));

      /* vtemp0 = ( x13, z12, y12, x12, x5, z4, y4, x4, z11, y11, x11, z10, z3, y3, x3, z2 ) */
      const __m128i vtemp0 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
      /* vtemp1 = ( y10, x10, z9, y9, y2, x2, z1, y1, x9, z8, y8, x8, x1, z0, y0, x0 ) */
      const __m128i vtemp1 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
      /* vtemp2 = ( z15, y15, x15, z14, z7, y7, x7, z6, y14, x14, z13, y13, y6, x6, z5, y5 ) */
      const __m128i vtemp2 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));

      /* vxyz0 = ( x5, z4, y4, x4, z3, y3, x3, z2, y2, x2, z1, y1, x1, z0, y0, x0 ) */
      const __m128i vxyz0 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
      /* vxyz1 = ( y10, x10, z9, y9, x9, z8, y8, x8, z7, y7, x7, z6, y6, x6, z5, y5 ) */
      const __m128i vxyz1 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
      /* vxyz2 = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 ) */
      const __m128i vxyz2 = _mm_castps_si128(
          _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));

      // Rewind the output cursor by 3 * (16 - n) bytes so the three stores
      // end exactly at output + 3 * (original n).
      o = (uint8_t*) ((uintptr_t) o + address_increment * 3);
      _mm_storeu_si128((__m128i*) o, vxyz0);
      _mm_storeu_si128((__m128i*) o + 1, vxyz1);
      _mm_storeu_si128((__m128i*) o + 2, vxyz2);
    }
  } else {
    // Scalar fallback for n < 16: emit one (x, y, z) triple per iteration.
    // Note: assumes n != 0 (do/while executes at least once).
    do {
      const uint8_t vx = *x++;
      const uint8_t vy = *y++;
      const uint8_t vz = *z++;
      o[0] = vx;
      o[1] = vy;
      o[2] = vz;
      o += 3;
    } while (--n != 0);
  }
}
139}