blob: 044048d33356cddd65fbdadcadcceecdfd56dfa2 [file] [log] [blame]
Hangyu Kuangf047e7c2016-07-06 14:21:45 -07001/*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070011#include "libyuv/rotate_row.h"
Frank Barchardcead1e02017-03-10 12:03:05 -080012#include "libyuv/row.h"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070013
14#ifdef __cplusplus
15namespace libyuv {
16extern "C" {
17#endif
18
19// This module is for 32 bit Visual C x86 and clangcl
20#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
21
Frank Barchardb83bb382017-02-22 18:01:07 -080022__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
23 int src_stride,
24 uint8* dst,
25 int dst_stride,
26 int width) {
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070027 __asm {
28 push edi
29 push esi
30 push ebp
Frank Barchardb83bb382017-02-22 18:01:07 -080031 mov eax, [esp + 12 + 4] // src
32 mov edi, [esp + 12 + 8] // src_stride
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070033 mov edx, [esp + 12 + 12] // dst
34 mov esi, [esp + 12 + 16] // dst_stride
35 mov ecx, [esp + 12 + 20] // width
36
37 // Read in the data from the source pointer.
38 // First round of bit swap.
39 align 4
40 convertloop:
41 movq xmm0, qword ptr [eax]
42 lea ebp, [eax + 8]
43 movq xmm1, qword ptr [eax + edi]
44 lea eax, [eax + 2 * edi]
45 punpcklbw xmm0, xmm1
46 movq xmm2, qword ptr [eax]
47 movdqa xmm1, xmm0
48 palignr xmm1, xmm1, 8
49 movq xmm3, qword ptr [eax + edi]
50 lea eax, [eax + 2 * edi]
51 punpcklbw xmm2, xmm3
52 movdqa xmm3, xmm2
53 movq xmm4, qword ptr [eax]
54 palignr xmm3, xmm3, 8
55 movq xmm5, qword ptr [eax + edi]
56 punpcklbw xmm4, xmm5
57 lea eax, [eax + 2 * edi]
58 movdqa xmm5, xmm4
59 movq xmm6, qword ptr [eax]
60 palignr xmm5, xmm5, 8
61 movq xmm7, qword ptr [eax + edi]
62 punpcklbw xmm6, xmm7
63 mov eax, ebp
64 movdqa xmm7, xmm6
65 palignr xmm7, xmm7, 8
66 // Second round of bit swap.
67 punpcklwd xmm0, xmm2
68 punpcklwd xmm1, xmm3
69 movdqa xmm2, xmm0
70 movdqa xmm3, xmm1
71 palignr xmm2, xmm2, 8
72 palignr xmm3, xmm3, 8
73 punpcklwd xmm4, xmm6
74 punpcklwd xmm5, xmm7
75 movdqa xmm6, xmm4
76 movdqa xmm7, xmm5
77 palignr xmm6, xmm6, 8
78 palignr xmm7, xmm7, 8
79 // Third round of bit swap.
80 // Write to the destination pointer.
81 punpckldq xmm0, xmm4
82 movq qword ptr [edx], xmm0
83 movdqa xmm4, xmm0
84 palignr xmm4, xmm4, 8
85 movq qword ptr [edx + esi], xmm4
86 lea edx, [edx + 2 * esi]
87 punpckldq xmm2, xmm6
88 movdqa xmm6, xmm2
89 palignr xmm6, xmm6, 8
90 movq qword ptr [edx], xmm2
91 punpckldq xmm1, xmm5
92 movq qword ptr [edx + esi], xmm6
93 lea edx, [edx + 2 * esi]
94 movdqa xmm5, xmm1
95 movq qword ptr [edx], xmm1
96 palignr xmm5, xmm5, 8
97 punpckldq xmm3, xmm7
98 movq qword ptr [edx + esi], xmm5
99 lea edx, [edx + 2 * esi]
100 movq qword ptr [edx], xmm3
101 movdqa xmm7, xmm3
102 palignr xmm7, xmm7, 8
103 sub ecx, 8
104 movq qword ptr [edx + esi], xmm7
105 lea edx, [edx + 2 * esi]
106 jg convertloop
107
108 pop ebp
109 pop esi
110 pop edi
111 ret
112 }
113}
114
Frank Barchardb83bb382017-02-22 18:01:07 -0800115__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
116 int src_stride,
117 uint8* dst_a,
118 int dst_stride_a,
119 uint8* dst_b,
120 int dst_stride_b,
121 int w) {
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700122 __asm {
123 push ebx
124 push esi
125 push edi
126 push ebp
Frank Barchardb83bb382017-02-22 18:01:07 -0800127 mov eax, [esp + 16 + 4] // src
128 mov edi, [esp + 16 + 8] // src_stride
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700129 mov edx, [esp + 16 + 12] // dst_a
130 mov esi, [esp + 16 + 16] // dst_stride_a
131 mov ebx, [esp + 16 + 20] // dst_b
132 mov ebp, [esp + 16 + 24] // dst_stride_b
133 mov ecx, esp
134 sub esp, 4 + 16
135 and esp, ~15
136 mov [esp + 16], ecx
137 mov ecx, [ecx + 16 + 28] // w
138
139 align 4
140 convertloop:
Frank Barchardb83bb382017-02-22 18:01:07 -0800141 // Read in the data from the source pointer.
142 // First round of bit swap.
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700143 movdqu xmm0, [eax]
144 movdqu xmm1, [eax + edi]
145 lea eax, [eax + 2 * edi]
146 movdqa xmm7, xmm0 // use xmm7 as temp register.
147 punpcklbw xmm0, xmm1
148 punpckhbw xmm7, xmm1
149 movdqa xmm1, xmm7
150 movdqu xmm2, [eax]
151 movdqu xmm3, [eax + edi]
152 lea eax, [eax + 2 * edi]
153 movdqa xmm7, xmm2
154 punpcklbw xmm2, xmm3
155 punpckhbw xmm7, xmm3
156 movdqa xmm3, xmm7
157 movdqu xmm4, [eax]
158 movdqu xmm5, [eax + edi]
159 lea eax, [eax + 2 * edi]
160 movdqa xmm7, xmm4
161 punpcklbw xmm4, xmm5
162 punpckhbw xmm7, xmm5
163 movdqa xmm5, xmm7
164 movdqu xmm6, [eax]
165 movdqu xmm7, [eax + edi]
166 lea eax, [eax + 2 * edi]
167 movdqu [esp], xmm5 // backup xmm5
168 neg edi
Frank Barchardb83bb382017-02-22 18:01:07 -0800169 movdqa xmm5, xmm6 // use xmm5 as temp register.
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700170 punpcklbw xmm6, xmm7
171 punpckhbw xmm5, xmm7
172 movdqa xmm7, xmm5
173 lea eax, [eax + 8 * edi + 16]
174 neg edi
175 // Second round of bit swap.
176 movdqa xmm5, xmm0
177 punpcklwd xmm0, xmm2
178 punpckhwd xmm5, xmm2
179 movdqa xmm2, xmm5
180 movdqa xmm5, xmm1
181 punpcklwd xmm1, xmm3
182 punpckhwd xmm5, xmm3
183 movdqa xmm3, xmm5
184 movdqa xmm5, xmm4
185 punpcklwd xmm4, xmm6
186 punpckhwd xmm5, xmm6
187 movdqa xmm6, xmm5
188 movdqu xmm5, [esp] // restore xmm5
189 movdqu [esp], xmm6 // backup xmm6
Frank Barchardb83bb382017-02-22 18:01:07 -0800190 movdqa xmm6, xmm5 // use xmm6 as temp register.
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700191 punpcklwd xmm5, xmm7
192 punpckhwd xmm6, xmm7
193 movdqa xmm7, xmm6
194 // Third round of bit swap.
195 // Write to the destination pointer.
196 movdqa xmm6, xmm0
197 punpckldq xmm0, xmm4
198 punpckhdq xmm6, xmm4
199 movdqa xmm4, xmm6
200 movdqu xmm6, [esp] // restore xmm6
201 movlpd qword ptr [edx], xmm0
202 movhpd qword ptr [ebx], xmm0
203 movlpd qword ptr [edx + esi], xmm4
204 lea edx, [edx + 2 * esi]
205 movhpd qword ptr [ebx + ebp], xmm4
206 lea ebx, [ebx + 2 * ebp]
Frank Barchardb83bb382017-02-22 18:01:07 -0800207 movdqa xmm0, xmm2 // use xmm0 as the temp register.
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700208 punpckldq xmm2, xmm6
209 movlpd qword ptr [edx], xmm2
210 movhpd qword ptr [ebx], xmm2
211 punpckhdq xmm0, xmm6
212 movlpd qword ptr [edx + esi], xmm0
213 lea edx, [edx + 2 * esi]
214 movhpd qword ptr [ebx + ebp], xmm0
215 lea ebx, [ebx + 2 * ebp]
Frank Barchardb83bb382017-02-22 18:01:07 -0800216 movdqa xmm0, xmm1 // use xmm0 as the temp register.
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700217 punpckldq xmm1, xmm5
218 movlpd qword ptr [edx], xmm1
219 movhpd qword ptr [ebx], xmm1
220 punpckhdq xmm0, xmm5
221 movlpd qword ptr [edx + esi], xmm0
222 lea edx, [edx + 2 * esi]
223 movhpd qword ptr [ebx + ebp], xmm0
224 lea ebx, [ebx + 2 * ebp]
Frank Barchardb83bb382017-02-22 18:01:07 -0800225 movdqa xmm0, xmm3 // use xmm0 as the temp register.
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700226 punpckldq xmm3, xmm7
227 movlpd qword ptr [edx], xmm3
228 movhpd qword ptr [ebx], xmm3
229 punpckhdq xmm0, xmm7
230 sub ecx, 8
231 movlpd qword ptr [edx + esi], xmm0
232 lea edx, [edx + 2 * esi]
233 movhpd qword ptr [ebx + ebp], xmm0
234 lea ebx, [ebx + 2 * ebp]
235 jg convertloop
236
237 mov esp, [esp + 16]
238 pop ebp
239 pop edi
240 pop esi
241 pop ebx
242 ret
243 }
244}
245
246#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
247
248#ifdef __cplusplus
249} // extern "C"
250} // namespace libyuv
251#endif