blob: 9a1770e5faddf8fee1713cb55afabe7f0d74c32a [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "row.h"
12
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
// This module is for GCC x86 and x64
#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)

// Constant tables below are referenced from inline asm via "m" operands.
// NOTE(review): on Apple toolchains the tables are deliberately left
// non-static — presumably to avoid an assembler/relocation issue with
// static const data referenced from inline asm; confirm before changing.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
28
#ifdef HAS_ARGBTOUVROW_SSSE3
// ARGB -> U coefficients (x64 fixed point, scaled by 64), replicated 4x so
// one pmaddubsw covers 4 pixels.  pmaddubsw interprets this operand as
// SIGNED bytes, so the coefficient tables must be vec8 (signed).
CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// ARGB -> V coefficients.  Was declared uvec8 (unsigned): negative
// initializers in an unsigned vector are a narrowing error, and the table
// is consumed as the signed operand of pmaddubsw — declare it vec8 to
// match kARGBToU.  The stored bytes are identical, so behavior is unchanged.
CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Bias added after the multiply so U/V are recentered around 128.
CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
#endif
43
#ifdef HAS_ARGBTOYROW_SSSE3

// Constant multiplication table for converting ARGB to I400.
// BT.601 luma coefficients scaled by 64 (consumed by pmaddubsw, which
// treats this operand as signed bytes), replicated 4x for 4 pixels.
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// Bias added so video-range Y starts at 16.
CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// Shuffle table for converting RGB24 to ARGB.
// Index 12/13/14/15 pull bytes that are later overwritten by the 0xff
// alpha "por"; the pshufb only has to place the 12 RGB bytes.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB (swap R and B within pixels).
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB (full byte reverse per pixel).
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
74
// Expand 8 grey (Y-only) pixels per iteration into ARGB with alpha = 0xff.
// movdqa stores require dst_argb 16-byte aligned; the "sub $0x8 / ja" loop
// assumes pix is a positive multiple of 8 — caller must guarantee both.
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"                 // xmm5 = 0xff000000 alpha mask
  "pslld $0x18,%%xmm5 \n"
"1: \n"
  "movq (%0),%%xmm0 \n"                      // load 8 Y bytes
  "lea 0x8(%0),%0 \n"
  "punpcklbw %%xmm0,%%xmm0 \n"               // Y -> YY
  "movdqa %%xmm0,%%xmm1 \n"
  "punpcklwd %%xmm0,%%xmm0 \n"               // YYYY for pixels 0-3
  "punpckhwd %%xmm1,%%xmm1 \n"               // YYYY for pixels 4-7
  "por %%xmm5,%%xmm0 \n"                     // set alpha channel
  "por %%xmm5,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "movdqa %%xmm1,0x10(%1) \n"
  "lea 0x20(%1),%1 \n"
  "sub $0x8,%2 \n"
  "ja 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
);
}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000103
// Reorder 4 ABGR pixels per iteration into ARGB via one pshufb.
// movdqa load/store require both pointers 16-byte aligned; assumes pix is
// a positive multiple of 4.
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
  "movdqa %3,%%xmm5 \n"                      // shuffle mask (R<->B swap)
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "lea 0x10(%0),%0 \n"
  "pshufb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x4,%2 \n"
  "ja 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif

);
}
126
// Reorder 4 BGRA pixels per iteration into ARGB via one pshufb.
// Same alignment and pix-multiple-of-4 requirements as ABGRToARGBRow_SSSE3.
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
  "movdqa %3,%%xmm5 \n"                      // shuffle mask (byte reverse)
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "lea 0x10(%0),%0 \n"
  "pshufb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x4,%2 \n"
  "ja 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
);
}
148
// Expand 16 packed RGB24 pixels (48 bytes) per iteration to ARGB (64 bytes).
// Loads are movdqu (src may be unaligned) but stores are movdqa, so
// dst_argb must be 16-byte aligned; assumes pix is a multiple of 16.
// The three source vectors are realigned with palignr so each pshufb sees
// 12 contiguous RGB bytes.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm5 \n"
  "movdqa %3,%%xmm4 \n"                      // RGB24 -> ARGB shuffle mask
"1: \n"
  "movdqu (%0),%%xmm0 \n"
  "movdqu 0x10(%0),%%xmm1 \n"
  "movdqu 0x20(%0),%%xmm3 \n"
  "lea 0x30(%0),%0 \n"
  "movdqa %%xmm3,%%xmm2 \n"
  "palignr $0x8,%%xmm1,%%xmm2 \n"            // pixels 8-11
  "pshufb %%xmm4,%%xmm2 \n"
  "por %%xmm5,%%xmm2 \n"                     // set alpha
  "palignr $0xc,%%xmm0,%%xmm1 \n"            // pixels 4-7
  "pshufb %%xmm4,%%xmm0 \n"                  // pixels 0-3
  "movdqa %%xmm2,0x20(%1) \n"
  "por %%xmm5,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "por %%xmm5,%%xmm1 \n"
  "palignr $0x4,%%xmm3,%%xmm3 \n"            // pixels 12-15
  "pshufb %%xmm4,%%xmm3 \n"
  "movdqa %%xmm1,0x10(%1) \n"
  "por %%xmm5,%%xmm3 \n"
  "movdqa %%xmm3,0x30(%1) \n"
  "lea 0x40(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
188
// Expand 16 packed RAW (reversed RGB24) pixels per iteration to ARGB.
// Identical structure to RGB24ToARGBRow_SSSE3 — only the shuffle mask
// differs.  Unaligned loads, aligned stores; pix must be a multiple of 16.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm5 \n"
  "movdqa %3,%%xmm4 \n"                      // RAW -> ARGB shuffle mask
"1: \n"
  "movdqu (%0),%%xmm0 \n"
  "movdqu 0x10(%0),%%xmm1 \n"
  "movdqu 0x20(%0),%%xmm3 \n"
  "lea 0x30(%0),%0 \n"
  "movdqa %%xmm3,%%xmm2 \n"
  "palignr $0x8,%%xmm1,%%xmm2 \n"            // pixels 8-11
  "pshufb %%xmm4,%%xmm2 \n"
  "por %%xmm5,%%xmm2 \n"                     // set alpha
  "palignr $0xc,%%xmm0,%%xmm1 \n"            // pixels 4-7
  "pshufb %%xmm4,%%xmm0 \n"                  // pixels 0-3
  "movdqa %%xmm2,0x20(%1) \n"
  "por %%xmm5,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "por %%xmm5,%%xmm1 \n"
  "palignr $0x4,%%xmm3,%%xmm3 \n"            // pixels 12-15
  "pshufb %%xmm4,%%xmm3 \n"
  "movdqa %%xmm1,0x10(%1) \n"
  "por %%xmm5,%%xmm3 \n"
  "movdqa %%xmm3,0x30(%1) \n"
  "lea 0x40(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
228
// Convert 16 ARGB pixels per iteration to luma (Y): weighted sum via
// pmaddubsw/phaddw with kARGBToY, >>7, then +16 bias.  All loads/stores
// are movdqa, so src_argb and dst_y must be 16-byte aligned; assumes pix
// is a multiple of 16.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
  "movdqa %4,%%xmm5 \n"                      // kAddY16
  "movdqa %3,%%xmm4 \n"                      // kARGBToY coefficients
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm2 \n"
  "movdqa 0x30(%0),%%xmm3 \n"
  "pmaddubsw %%xmm4,%%xmm0 \n"               // B*13 + G*65, R*33 + A*0
  "pmaddubsw %%xmm4,%%xmm1 \n"
  "pmaddubsw %%xmm4,%%xmm2 \n"
  "pmaddubsw %%xmm4,%%xmm3 \n"
  "lea 0x40(%0),%0 \n"
  "phaddw %%xmm1,%%xmm0 \n"                  // finish per-pixel sums
  "phaddw %%xmm3,%%xmm2 \n"
  "psrlw $0x7,%%xmm0 \n"                     // drop the 7 fraction bits
  "psrlw $0x7,%%xmm2 \n"
  "packuswb %%xmm2,%%xmm0 \n"
  "paddb %%xmm5,%%xmm0 \n"                   // + 16
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000265
// Same computation as ARGBToYRow_SSSE3 but with movdqu loads/stores so
// neither src_argb nor dst_y needs to be aligned.  Still assumes pix is a
// multiple of 16.
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
  "movdqa %4,%%xmm5 \n"                      // kAddY16
  "movdqa %3,%%xmm4 \n"                      // kARGBToY coefficients
"1: \n"
  "movdqu (%0),%%xmm0 \n"
  "movdqu 0x10(%0),%%xmm1 \n"
  "movdqu 0x20(%0),%%xmm2 \n"
  "movdqu 0x30(%0),%%xmm3 \n"
  "pmaddubsw %%xmm4,%%xmm0 \n"
  "pmaddubsw %%xmm4,%%xmm1 \n"
  "pmaddubsw %%xmm4,%%xmm2 \n"
  "pmaddubsw %%xmm4,%%xmm3 \n"
  "lea 0x40(%0),%0 \n"
  "phaddw %%xmm1,%%xmm0 \n"
  "phaddw %%xmm3,%%xmm2 \n"
  "psrlw $0x7,%%xmm0 \n"
  "psrlw $0x7,%%xmm2 \n"
  "packuswb %%xmm2,%%xmm0 \n"
  "paddb %%xmm5,%%xmm0 \n"
  "movdqu %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif

);
}
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000302#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +0000303
#ifdef HAS_ARGBTOUVROW_SSSE3
// Convert 16 ARGB pixels from two adjacent rows into 8 U and 8 V samples
// (2x2 box subsample), using kARGBToU/kARGBToV and the +128 bias.
// Requires src 16-byte aligned (movdqa + pavgb with memory operand);
// assumes width is a multiple of 16.
// NOTE(review): the first asm statement preloads constants into
// xmm3/xmm4/xmm5 and the second asm relies on those registers surviving
// the gap between the two statements — fragile if the compiler schedules
// code in between; confirm this is intentional before restructuring.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
  "movdqa %0,%%xmm4 \n"                      // kARGBToU
  "movdqa %1,%%xmm3 \n"                      // kARGBToV
  "movdqa %2,%%xmm5 \n"                      // kAddUV128
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
  "sub %1,%2 \n"                             // %2 = dst_v - dst_u offset
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm2 \n"
  "movdqa 0x30(%0),%%xmm6 \n"
  "pavgb (%0,%4,1),%%xmm0 \n"                // average with the row below
  "pavgb 0x10(%0,%4,1),%%xmm1 \n"
  "pavgb 0x20(%0,%4,1),%%xmm2 \n"
  "pavgb 0x30(%0,%4,1),%%xmm6 \n"
  "lea 0x40(%0),%0 \n"
  "movdqa %%xmm0,%%xmm7 \n"
  "shufps $0x88,%%xmm1,%%xmm0 \n"            // even pixels
  "shufps $0xdd,%%xmm1,%%xmm7 \n"            // odd pixels
  "pavgb %%xmm7,%%xmm0 \n"                   // horizontal average
  "movdqa %%xmm2,%%xmm7 \n"
  "shufps $0x88,%%xmm6,%%xmm2 \n"
  "shufps $0xdd,%%xmm6,%%xmm7 \n"
  "pavgb %%xmm7,%%xmm2 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "movdqa %%xmm2,%%xmm6 \n"
  "pmaddubsw %%xmm4,%%xmm0 \n"               // U
  "pmaddubsw %%xmm4,%%xmm2 \n"
  "pmaddubsw %%xmm3,%%xmm1 \n"               // V
  "pmaddubsw %%xmm3,%%xmm6 \n"
  "phaddw %%xmm2,%%xmm0 \n"
  "phaddw %%xmm6,%%xmm1 \n"
  "psraw $0x8,%%xmm0 \n"                     // signed >>8
  "psraw $0x8,%%xmm1 \n"
  "packsswb %%xmm1,%%xmm0 \n"
  "paddb %%xmm5,%%xmm0 \n"                   // + 128
  "movlps %%xmm0,(%1) \n"                    // 8 U bytes
  "movhps %%xmm0,(%1,%2,1) \n"               // 8 V bytes
  "lea 0x8(%1),%1 \n"
  "sub $0x10,%3 \n"
  "ja 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
);
}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000368
// Unaligned variant of ARGBToUVRow_SSSE3: movdqu loads (the second row is
// staged through xmm7 because pavgb's memory operand would require
// alignment).  Same constant-preload-across-asm-statements caveat as the
// aligned version; assumes width is a multiple of 16.
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
  "movdqa %0,%%xmm4 \n"                      // kARGBToU
  "movdqa %1,%%xmm3 \n"                      // kARGBToV
  "movdqa %2,%%xmm5 \n"                      // kAddUV128
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
  "sub %1,%2 \n"                             // %2 = dst_v - dst_u offset
"1: \n"
  "movdqu (%0),%%xmm0 \n"
  "movdqu 0x10(%0),%%xmm1 \n"
  "movdqu 0x20(%0),%%xmm2 \n"
  "movdqu 0x30(%0),%%xmm6 \n"
  "movdqu (%0,%4,1),%%xmm7 \n"               // row below, via register
  "pavgb %%xmm7,%%xmm0 \n"
  "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  "pavgb %%xmm7,%%xmm1 \n"
  "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  "pavgb %%xmm7,%%xmm2 \n"
  "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  "pavgb %%xmm7,%%xmm6 \n"
  "lea 0x40(%0),%0 \n"
  "movdqa %%xmm0,%%xmm7 \n"
  "shufps $0x88,%%xmm1,%%xmm0 \n"            // even pixels
  "shufps $0xdd,%%xmm1,%%xmm7 \n"            // odd pixels
  "pavgb %%xmm7,%%xmm0 \n"
  "movdqa %%xmm2,%%xmm7 \n"
  "shufps $0x88,%%xmm6,%%xmm2 \n"
  "shufps $0xdd,%%xmm6,%%xmm7 \n"
  "pavgb %%xmm7,%%xmm2 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "movdqa %%xmm2,%%xmm6 \n"
  "pmaddubsw %%xmm4,%%xmm0 \n"               // U
  "pmaddubsw %%xmm4,%%xmm2 \n"
  "pmaddubsw %%xmm3,%%xmm1 \n"               // V
  "pmaddubsw %%xmm3,%%xmm6 \n"
  "phaddw %%xmm2,%%xmm0 \n"
  "phaddw %%xmm6,%%xmm1 \n"
  "psraw $0x8,%%xmm0 \n"
  "psraw $0x8,%%xmm1 \n"
  "packsswb %%xmm1,%%xmm0 \n"
  "paddb %%xmm5,%%xmm0 \n"                   // + 128
  "movlps %%xmm0,(%1) \n"                    // 8 U bytes
  "movhps %%xmm0,(%1,%2,1) \n"               // 8 V bytes
  "lea 0x8(%1),%1 \n"
  "sub $0x10,%3 \n"
  "ja 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
);
}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000436#endif
437
#ifdef HAS_I420TOARGBROW_SSSE3
// YUV -> RGB conversion coefficients, fixed point scaled by 64
// (6 fractional bits), stored as signed bytes for pmaddubsw.
#define UB 127 /* 2.018 * 64 = 129 overflows int8; clamped to 127 */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: subtracted after the multiply so that U = V = 128 is neutral.
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

// NOTE(review): forcing omit-frame-pointer presumably frees a register for
// the 6-operand asm below on 32-bit builds; Apple and x64 targets already
// omit it by default — confirm before removing.
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000459
// YUV conversion constants packed into one aligned struct so the asm can
// address every table at a fixed offset from &kYuvConstants.kUVToB:
// 0 = kUVToB, 16 = kUVToG, 32 = kUVToR, 48/64/80 = biases,
// 96 = kYSub16, 112 = kYToRgb.  Keep field order in sync with YUVTORGB.
struct {
  vec8 kUVToB;     // interleaved {UB, VB} coefficient pairs
  vec8 kUVToG;     // interleaved {UG, VG}
  vec8 kUVToR;     // interleaved {UR, VR}
  vec16 kUVBiasB;  // per-channel bias (coeff * 128)
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;   // subtract 16 from video-range Y
  vec16 kYToRgb;   // Y gain (1.164 * 64)
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};
479
// Convert 8 pixels
// Asm fragment shared by the I420To*Row functions below.
// On entry: %0 = y_buf, %1 = u_buf, %2 = v_buf - u_buf (callers execute
// "sub %1,%2" first), %5 = &kYuvConstants.kUVToB, xmm4 = 0 (callers pxor).
// Reads 4 U and 4 V bytes (each UV pair duplicated for 2 pixels) plus 8 Y
// bytes.  On exit xmm0/xmm1/xmm2 hold 8 packed B/G/R bytes respectively.
#define YUVTORGB \
 "movd (%1),%%xmm0 \n" \
 "movd (%1,%2,1),%%xmm1 \n" \
 "lea 0x4(%1),%1 \n" \
 "punpcklbw %%xmm1,%%xmm0 \n" \
 "punpcklwd %%xmm0,%%xmm0 \n" \
 "movdqa %%xmm0,%%xmm1 \n" \
 "movdqa %%xmm0,%%xmm2 \n" \
 "pmaddubsw (%5),%%xmm0 \n" \
 "pmaddubsw 16(%5),%%xmm1 \n" \
 "pmaddubsw 32(%5),%%xmm2 \n" \
 "psubw 48(%5),%%xmm0 \n" \
 "psubw 64(%5),%%xmm1 \n" \
 "psubw 80(%5),%%xmm2 \n" \
 "movq (%0),%%xmm3 \n" \
 "lea 0x8(%0),%0 \n" \
 "punpcklbw %%xmm4,%%xmm3 \n" \
 "psubsw 96(%5),%%xmm3 \n" \
 "pmullw 112(%5),%%xmm3 \n" \
 "paddsw %%xmm3,%%xmm0 \n" \
 "paddsw %%xmm3,%%xmm1 \n" \
 "paddsw %%xmm3,%%xmm2 \n" \
 "psraw $0x6,%%xmm0 \n" \
 "psraw $0x6,%%xmm1 \n" \
 "psraw $0x6,%%xmm2 \n" \
 "packuswb %%xmm0,%%xmm0 \n" \
 "packuswb %%xmm1,%%xmm1 \n" \
 "packuswb %%xmm2,%%xmm2 \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000509
// Convert 8 I420 pixels (8 Y, 4 U, 4 V) per iteration to ARGB.
// movdqa stores require rgb_buf 16-byte aligned; assumes width is a
// positive multiple of 8.
void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
                                const uint8* u_buf,  // rsi
                                const uint8* v_buf,  // rdx
                                uint8* rgb_buf,      // rcx
                                int width) {         // r8
  asm volatile (
  "sub %1,%2 \n"                             // %2 = v_buf - u_buf, for YUVTORGB
  "pcmpeqb %%xmm5,%%xmm5 \n"                 // 0xff alpha bytes
  "pxor %%xmm4,%%xmm4 \n"                    // zero, for Y unpack

"1: \n"
  YUVTORGB
  "punpcklbw %%xmm1,%%xmm0 \n"               // B,G interleave
  "punpcklbw %%xmm5,%%xmm2 \n"               // R,A interleave
  "movdqa %%xmm0,%%xmm1 \n"
  "punpcklwd %%xmm2,%%xmm0 \n"               // BGRA pixels 0-3
  "punpckhwd %%xmm2,%%xmm1 \n"               // BGRA pixels 4-7
  "movdqa %%xmm0,(%3) \n"
  "movdqa %%xmm1,0x10(%3) \n"
  "lea 0x20(%3),%3 \n"
  "sub $0x8,%4 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
544
// Convert 8 I420 pixels per iteration to BGRA byte order.
// xmm5 (alpha) is consumed by the interleave each iteration, so it is
// regenerated inside the loop.  Same alignment/width requirements as
// I420ToARGBRow_SSSE3.
void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,  // rdi
                                const uint8* u_buf,  // rsi
                                const uint8* v_buf,  // rdx
                                uint8* rgb_buf,      // rcx
                                int width) {         // r8
  asm volatile (
  "sub %1,%2 \n"                             // %2 = v_buf - u_buf
  "pcmpeqb %%xmm5,%%xmm5 \n"
  "pxor %%xmm4,%%xmm4 \n"                    // zero, for Y unpack

"1: \n"
  YUVTORGB
  "pcmpeqb %%xmm5,%%xmm5 \n"                 // refresh 0xff alpha
  "punpcklbw %%xmm0,%%xmm1 \n"               // G,B interleave
  "punpcklbw %%xmm2,%%xmm5 \n"               // A,R interleave
  "movdqa %%xmm5,%%xmm0 \n"
  "punpcklwd %%xmm1,%%xmm5 \n"               // ARGB -> BGRA pixels 0-3
  "punpckhwd %%xmm1,%%xmm0 \n"               // pixels 4-7
  "movdqa %%xmm5,(%3) \n"
  "movdqa %%xmm0,0x10(%3) \n"
  "lea 0x20(%3),%3 \n"
  "sub $0x8,%4 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
580
// Convert 8 I420 pixels per iteration to ABGR byte order (R and B swapped
// relative to the ARGB variant).  Same alignment/width requirements as
// I420ToARGBRow_SSSE3.
void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,  // rdi
                                const uint8* u_buf,  // rsi
                                const uint8* v_buf,  // rdx
                                uint8* rgb_buf,      // rcx
                                int width) {         // r8
  asm volatile (
  "sub %1,%2 \n"                             // %2 = v_buf - u_buf
  "pcmpeqb %%xmm5,%%xmm5 \n"                 // 0xff alpha bytes
  "pxor %%xmm4,%%xmm4 \n"                    // zero, for Y unpack

"1: \n"
  YUVTORGB
  "punpcklbw %%xmm1,%%xmm2 \n"               // R,G interleave
  "punpcklbw %%xmm5,%%xmm0 \n"               // B,A interleave
  "movdqa %%xmm2,%%xmm1 \n"
  "punpcklwd %%xmm0,%%xmm2 \n"               // RGBA pixels 0-3
  "punpckhwd %%xmm0,%%xmm1 \n"               // pixels 4-7
  "movdqa %%xmm2,(%3) \n"
  "movdqa %%xmm1,0x10(%3) \n"
  "lea 0x20(%3),%3 \n"
  "sub $0x8,%4 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
615
// Convert 4 I444 pixels (4 Y, 4 U, 4 V — no chroma subsampling) per
// iteration to ARGB.  Open-codes the YUVTORGB math but without the
// punpcklwd UV duplication, since every pixel has its own U and V.
// movdqa store requires rgb_buf 16-byte aligned; assumes width is a
// positive multiple of 4.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
                                const uint8* u_buf,  // rsi
                                const uint8* v_buf,  // rdx
                                uint8* rgb_buf,      // rcx
                                int width) {         // r8
  asm volatile (
  "sub %1,%2 \n"                             // %2 = v_buf - u_buf
  "pcmpeqb %%xmm5,%%xmm5 \n"                 // 0xff alpha bytes
  "pxor %%xmm4,%%xmm4 \n"                    // zero, for Y unpack

"1: \n"
  "movd (%1),%%xmm0 \n"                      // 4 U
  "movd (%1,%2,1),%%xmm1 \n"                 // 4 V
  "lea 0x4(%1),%1 \n"
  "punpcklbw %%xmm1,%%xmm0 \n"               // UV interleaved
  "movdqa %%xmm0,%%xmm1 \n"
  "movdqa %%xmm0,%%xmm2 \n"
  "pmaddubsw (%5),%%xmm0 \n"                 // B
  "pmaddubsw 16(%5),%%xmm1 \n"               // G
  "pmaddubsw 32(%5),%%xmm2 \n"               // R
  "psubw 48(%5),%%xmm0 \n"
  "psubw 64(%5),%%xmm1 \n"
  "psubw 80(%5),%%xmm2 \n"
  "movd (%0),%%xmm3 \n"                      // 4 Y
  "lea 0x4(%0),%0 \n"
  "punpcklbw %%xmm4,%%xmm3 \n"
  "psubsw 96(%5),%%xmm3 \n"                  // Y - 16
  "pmullw 112(%5),%%xmm3 \n"                 // * 1.164
  "paddsw %%xmm3,%%xmm0 \n"
  "paddsw %%xmm3,%%xmm1 \n"
  "paddsw %%xmm3,%%xmm2 \n"
  "psraw $0x6,%%xmm0 \n"                     // drop 6 fraction bits
  "psraw $0x6,%%xmm1 \n"
  "psraw $0x6,%%xmm2 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "packuswb %%xmm1,%%xmm1 \n"
  "packuswb %%xmm2,%%xmm2 \n"
  "punpcklbw %%xmm1,%%xmm0 \n"               // B,G interleave
  "punpcklbw %%xmm5,%%xmm2 \n"               // R,A interleave
  "punpcklwd %%xmm2,%%xmm0 \n"               // 4 ARGB pixels
  "movdqa %%xmm0,(%3) \n"
  "lea 0x10(%3),%3 \n"
  "sub $0x4,%4 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
672#endif
673
674#ifdef HAS_YTOARGBROW_SSE2
675
// Convert 8 Y values per iteration to grey ARGB: G = (Y - 16) * 1.164,
// replicated into B/G/R with alpha = 0xff.  Constants 0x1000 (16 << 8) and
// 0x012a (1.164 * 256) are built in registers via eax.  movdqa stores
// require rgb_buf 16-byte aligned; assumes width is a multiple of 8.
void YToARGBRow_SSE2(const uint8* y_buf,  // rdi
                     uint8* rgb_buf,      // rcx
                     int width) {         // r8
  asm volatile (
  "pcmpeqb %%xmm4,%%xmm4 \n"               // xmm4 = 0xff000000 alpha mask
  "pslld $0x18,%%xmm4 \n"
  "mov $0x10001000,%%eax \n"               // 16, in 8.8 fixed point
  "movd %%eax,%%xmm3 \n"
  "pshufd $0x0,%%xmm3,%%xmm3 \n"
  "mov $0x012a012a,%%eax \n"               // 1.164, in 8.8 fixed point
  "movd %%eax,%%xmm2 \n"
  "pshufd $0x0,%%xmm2,%%xmm2 \n"

"1: \n"
  // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  "movq (%0),%%xmm0 \n"
  "lea 0x8(%0),%0 \n"
  "punpcklbw %%xmm0,%%xmm0 \n"
  "psubusw %%xmm3,%%xmm0 \n"
  "pmulhuw %%xmm2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"

  // Step 2: Weave into ARGB
  "punpcklbw %%xmm0,%%xmm0 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "punpcklwd %%xmm0,%%xmm0 \n"
  "punpckhwd %%xmm1,%%xmm1 \n"
  "por %%xmm4,%%xmm0 \n"                   // set alpha
  "por %%xmm4,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "movdqa %%xmm1,16(%1) \n"
  "lea 32(%1),%1 \n"

  "sub $0x8,%2 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000721#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000722
frkoenig@google.come5185422011-11-07 23:07:57 +0000723#ifdef HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000724void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
725 SIMD_ALIGNED(uint8 row[kMaxStride]);
726 ABGRToARGBRow_SSSE3(src_argb, row, pix);
727 ARGBToYRow_SSSE3(row, dst_y, pix);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000728}
729
fbarchard@google.comb6149762011-11-07 21:58:52 +0000730void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
731 SIMD_ALIGNED(uint8 row[kMaxStride]);
732 BGRAToARGBRow_SSSE3(src_argb, row, pix);
733 ARGBToYRow_SSSE3(row, dst_y, pix);
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000734}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000735
736void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
737 SIMD_ALIGNED(uint8 row[kMaxStride]);
738 ABGRToARGBRow_C(src_argb, row, pix);
739 ARGBToYRow_SSSE3(row, dst_y, pix);
740}
741
742void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
743 SIMD_ALIGNED(uint8 row[kMaxStride]);
744 BGRAToARGBRow_C(src_argb, row, pix);
745 ARGBToYRow_SSSE3(row, dst_y, pix);
746}
frkoenig@google.come5185422011-11-07 23:07:57 +0000747#endif
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000748
fbarchard@google.comb6149762011-11-07 21:58:52 +0000749#ifdef HAS_ARGBTOUVROW_SSSE3
750void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
751 uint8* dst_u, uint8* dst_v, int pix) {
752 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
753 ABGRToARGBRow_SSSE3(src_argb, row, pix);
754 ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
755 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000756}
757
fbarchard@google.comb6149762011-11-07 21:58:52 +0000758void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
759 uint8* dst_u, uint8* dst_v, int pix) {
760 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
761 BGRAToARGBRow_SSSE3(src_argb, row, pix);
762 BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
763 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000764}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000765
766void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
767 uint8* dst_u, uint8* dst_v, int pix) {
768 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
769 ABGRToARGBRow_C(src_argb, row, pix);
770 ABGRToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
771 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
772}
773
774void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
775 uint8* dst_u, uint8* dst_v, int pix) {
776 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
777 BGRAToARGBRow_C(src_argb, row, pix);
778 BGRAToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
779 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
780}
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000781#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000782
#ifdef HAS_MIRRORROW_SSSE3

// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Reverse a row of bytes, 16 per iteration: read from the tail of src
// (src - 16 + remaining) and byte-reverse each vector with pshufb.
// movdqa load/store mean both the final-16-byte src address and dst must
// be 16-byte aligned; assumes width is a multiple of 16.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
  "movdqa %3,%%xmm5 \n"                      // byte-reverse shuffle mask
  "lea -0x10(%0),%0 \n"                      // so (%0,%2) is the last 16 bytes
"1: \n"
  "movdqa (%0,%2),%%xmm0 \n"
  "pshufb %%xmm5,%%xmm0 \n"
  "sub $0x10,%2 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "ja 1b \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif
813
#ifdef HAS_MIRRORROW_SSE2

// SSE2 fallback for MirrorRow: no pshufb, so the byte reverse is built
// from a word byte-swap (shifts + por) followed by pshuflw/pshufhw/pshufd.
// Uses movdqu, so src/dst need not be aligned; assumes width is a
// multiple of 16.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
  "lea -0x10(%0),%0 \n"                      // so (%0,%2) is the last 16 bytes
"1: \n"
  "movdqu (%0,%2),%%xmm0 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "psllw $0x8,%%xmm0 \n"                     // swap bytes within words
  "psrlw $0x8,%%xmm1 \n"
  "por %%xmm1,%%xmm0 \n"
  "pshuflw $0x1b,%%xmm0,%%xmm0 \n"           // reverse words in low half
  "pshufhw $0x1b,%%xmm0,%%xmm0 \n"           // reverse words in high half
  "pshufd $0x4e,%%xmm0,%%xmm0 \n"            // swap halves
  "sub $0x10,%2 \n"
  "movdqu %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "ja 1b \n"
  : "+r"(src),        // %0
    "+r"(dst),        // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif
844
#ifdef HAS_SPLITUV_SSE2
// De-interleave a packed UV row into separate U and V planes, 16 pairs
// per iteration: even bytes (masked) -> dst_u, odd bytes (shifted) ->
// dst_v.  All movdqa, so src_uv, dst_u and dst_v must be 16-byte aligned;
// assumes pix is a multiple of 16.
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"                 // xmm5 = 0x00ff per word
  "psrlw $0x8,%%xmm5 \n"
  "sub %1,%2 \n"                             // %2 = dst_v - dst_u offset
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "movdqa %%xmm0,%%xmm2 \n"
  "movdqa %%xmm1,%%xmm3 \n"
  "pand %%xmm5,%%xmm0 \n"                    // keep even bytes (U)
  "pand %%xmm5,%%xmm1 \n"
  "packuswb %%xmm1,%%xmm0 \n"
  "psrlw $0x8,%%xmm2 \n"                     // keep odd bytes (V)
  "psrlw $0x8,%%xmm3 \n"
  "packuswb %%xmm3,%%xmm2 \n"
  "movdqa %%xmm0,(%1) \n"
  "movdqa %%xmm2,(%1,%2) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%3 \n"
  "ja 1b \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif
880
fbarchard@google.com19932f82012-02-16 22:19:14 +0000881#ifdef HAS_COPYROW_SSE2
// Copies count bytes from src to dst, 32 bytes per iteration.
// count is assumed to be a positive multiple of 32, and both pointers
// 16-byte aligned (movdqa). dst is addressed as src + (dst - src) so only
// one pointer has to be advanced inside the loop.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
  "sub %0,%1 \n"   // %1 = dst - src (constant offset for the loop)
  "1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa %%xmm0,(%0,%1) \n"
  "movdqa %%xmm1,0x10(%0,%1) \n"
  "lea 0x20(%0),%0 \n"
  "sub $0x20,%2 \n"
  "ja 1b \n"       // loop while bytes remain (unsigned compare)
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(count) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
903#endif // HAS_COPYROW_SSE2
904
905#ifdef HAS_COPYROW_X86
// Copies width bytes using the x86 string-move instruction.
// width is assumed to be a multiple of 4: it is shifted down to a dword
// count and copied with "rep movsl". The "S"/"D"/"c" constraints pin
// src, dst and the count to esi/rsi, edi/rdi and ecx/rcx as rep movs
// requires.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
  "shr $0x2,%2 \n"  // byte count -> dword count
  "rep movsl \n"
  : "+S"(src), // %0
    "+D"(dst), // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
918#endif
919
fbarchard@google.comb95dbf22012-02-11 01:18:30 +0000920#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the luma (Y) channel from a YUY2 row (byte order Y0 U0 Y1 V0...).
// Y occupies the even byte of each 16-bit pair, so masking each word with
// 0x00ff and packing yields 16 Y bytes per 32 input bytes. pix is assumed
// to be a positive multiple of 16; pointers 16-byte aligned (movdqa).
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones...
  "psrlw $0x8,%%xmm5 \n"      // ...then 0x00ff per word: even-byte (Y) mask
  "1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "pand %%xmm5,%%xmm0 \n"     // keep Y bytes
  "pand %%xmm5,%%xmm1 \n"
  "packuswb %%xmm1,%%xmm0 \n" // pack to 16 contiguous Y bytes
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
946
947void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
948 uint8* dst_u, uint8* dst_y, int pix) {
949 asm volatile (
950 "pcmpeqb %%xmm5,%%xmm5 \n"
951 "psrlw $0x8,%%xmm5 \n"
952 "sub %1,%2 \n"
953 "1: \n"
954 "movdqa (%0),%%xmm0 \n"
955 "movdqa 0x10(%0),%%xmm1 \n"
956 "movdqa (%0,%4,1),%%xmm2 \n"
957 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
958 "lea 0x20(%0),%0 \n"
959 "pavgb %%xmm2,%%xmm0 \n"
960 "pavgb %%xmm3,%%xmm1 \n"
961 "psrlw $0x8,%%xmm0 \n"
962 "psrlw $0x8,%%xmm1 \n"
963 "packuswb %%xmm1,%%xmm0 \n"
964 "movdqa %%xmm0,%%xmm1 \n"
965 "pand %%xmm5,%%xmm0 \n"
966 "packuswb %%xmm0,%%xmm0 \n"
967 "psrlw $0x8,%%xmm1 \n"
968 "packuswb %%xmm1,%%xmm1 \n"
969 "movq %%xmm0,(%1) \n"
970 "movq %%xmm1,(%1,%2) \n"
971 "lea 0x8(%1),%1 \n"
972 "sub $0x10,%3 \n"
973 "ja 1b \n"
974 : "+r"(src_yuy2), // %0
975 "+r"(dst_u), // %1
976 "+r"(dst_y), // %2
977 "+r"(pix) // %3
978 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
979 : "memory", "cc"
980#if defined(__SSE2__)
981 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
982#endif
983 );
984}
985
// Unaligned variant of YUY2ToYRow_SSE2: extracts the Y channel from a YUY2
// row (byte order Y0 U0 Y1 V0 ...) using movdqu, so neither pointer needs
// 16-byte alignment. pix is assumed to be a positive multiple of 16.
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones...
  "psrlw $0x8,%%xmm5 \n"      // ...then 0x00ff per word: even-byte (Y) mask
  "1: \n"
  "movdqu (%0),%%xmm0 \n"
  "movdqu 0x10(%0),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "pand %%xmm5,%%xmm0 \n"     // keep Y bytes
  "pand %%xmm5,%%xmm1 \n"
  "packuswb %%xmm1,%%xmm0 \n" // pack to 16 contiguous Y bytes
  "movdqu %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
1012
1013void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
1014 int stride_yuy2,
1015 uint8* dst_u, uint8* dst_y,
1016 int pix) {
1017 asm volatile (
1018 "pcmpeqb %%xmm5,%%xmm5 \n"
1019 "psrlw $0x8,%%xmm5 \n"
1020 "sub %1,%2 \n"
1021 "1: \n"
1022 "movdqu (%0),%%xmm0 \n"
1023 "movdqu 0x10(%0),%%xmm1 \n"
1024 "movdqu (%0,%4,1),%%xmm2 \n"
1025 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1026 "lea 0x20(%0),%0 \n"
1027 "pavgb %%xmm2,%%xmm0 \n"
1028 "pavgb %%xmm3,%%xmm1 \n"
1029 "psrlw $0x8,%%xmm0 \n"
1030 "psrlw $0x8,%%xmm1 \n"
1031 "packuswb %%xmm1,%%xmm0 \n"
1032 "movdqa %%xmm0,%%xmm1 \n"
1033 "pand %%xmm5,%%xmm0 \n"
1034 "packuswb %%xmm0,%%xmm0 \n"
1035 "psrlw $0x8,%%xmm1 \n"
1036 "packuswb %%xmm1,%%xmm1 \n"
1037 "movq %%xmm0,(%1) \n"
1038 "movq %%xmm1,(%1,%2) \n"
1039 "lea 0x8(%1),%1 \n"
1040 "sub $0x10,%3 \n"
1041 "ja 1b \n"
1042 : "+r"(src_yuy2), // %0
1043 "+r"(dst_u), // %1
1044 "+r"(dst_y), // %2
1045 "+r"(pix) // %3
1046 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1047 : "memory", "cc"
1048#if defined(__SSE2__)
1049 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1050#endif
1051);
1052}
1053
// Extracts the luma (Y) channel from a UYVY row (byte order U0 Y0 V0 Y1...).
// Y occupies the odd byte of each 16-bit pair, so shifting each word right
// by 8 and packing yields 16 Y bytes per 32 input bytes. pix is assumed to
// be a positive multiple of 16; pointers 16-byte aligned (movdqa).
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
  "1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "psrlw $0x8,%%xmm0 \n"      // keep odd bytes (Y)
  "psrlw $0x8,%%xmm1 \n"
  "packuswb %%xmm1,%%xmm0 \n" // pack to 16 contiguous Y bytes
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_uyvy), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
1077
1078void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
1079 uint8* dst_u, uint8* dst_y, int pix) {
1080 asm volatile (
1081 "pcmpeqb %%xmm5,%%xmm5 \n"
1082 "psrlw $0x8,%%xmm5 \n"
1083 "sub %1,%2 \n"
1084 "1: \n"
1085 "movdqa (%0),%%xmm0 \n"
1086 "movdqa 0x10(%0),%%xmm1 \n"
1087 "movdqa (%0,%4,1),%%xmm2 \n"
1088 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1089 "lea 0x20(%0),%0 \n"
1090 "pavgb %%xmm2,%%xmm0 \n"
1091 "pavgb %%xmm3,%%xmm1 \n"
1092 "pand %%xmm5,%%xmm0 \n"
1093 "pand %%xmm5,%%xmm1 \n"
1094 "packuswb %%xmm1,%%xmm0 \n"
1095 "movdqa %%xmm0,%%xmm1 \n"
1096 "pand %%xmm5,%%xmm0 \n"
1097 "packuswb %%xmm0,%%xmm0 \n"
1098 "psrlw $0x8,%%xmm1 \n"
1099 "packuswb %%xmm1,%%xmm1 \n"
1100 "movq %%xmm0,(%1) \n"
1101 "movq %%xmm1,(%1,%2) \n"
1102 "lea 0x8(%1),%1 \n"
1103 "sub $0x10,%3 \n"
1104 "ja 1b \n"
1105 : "+r"(src_uyvy), // %0
1106 "+r"(dst_u), // %1
1107 "+r"(dst_y), // %2
1108 "+r"(pix) // %3
1109 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
1110 : "memory", "cc"
1111#if defined(__SSE2__)
1112 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1113#endif
1114 );
1115}
1116
// Unaligned variant of UYVYToYRow_SSE2: extracts the Y channel from a UYVY
// row (byte order U0 Y0 V0 Y1 ...) using movdqu, so neither pointer needs
// 16-byte alignment. pix is assumed to be a positive multiple of 16.
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
  "1: \n"
  "movdqu (%0),%%xmm0 \n"
  "movdqu 0x10(%0),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "psrlw $0x8,%%xmm0 \n"      // keep odd bytes (Y)
  "psrlw $0x8,%%xmm1 \n"
  "packuswb %%xmm1,%%xmm0 \n" // pack to 16 contiguous Y bytes
  "movdqu %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_uyvy), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
1141
1142void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
1143 uint8* dst_u, uint8* dst_y, int pix) {
1144 asm volatile (
1145 "pcmpeqb %%xmm5,%%xmm5 \n"
1146 "psrlw $0x8,%%xmm5 \n"
1147 "sub %1,%2 \n"
1148 "1: \n"
1149 "movdqu (%0),%%xmm0 \n"
1150 "movdqu 0x10(%0),%%xmm1 \n"
1151 "movdqu (%0,%4,1),%%xmm2 \n"
1152 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1153 "lea 0x20(%0),%0 \n"
1154 "pavgb %%xmm2,%%xmm0 \n"
1155 "pavgb %%xmm3,%%xmm1 \n"
1156 "pand %%xmm5,%%xmm0 \n"
1157 "pand %%xmm5,%%xmm1 \n"
1158 "packuswb %%xmm1,%%xmm0 \n"
1159 "movdqa %%xmm0,%%xmm1 \n"
1160 "pand %%xmm5,%%xmm0 \n"
1161 "packuswb %%xmm0,%%xmm0 \n"
1162 "psrlw $0x8,%%xmm1 \n"
1163 "packuswb %%xmm1,%%xmm1 \n"
1164 "movq %%xmm0,(%1) \n"
1165 "movq %%xmm1,(%1,%2) \n"
1166 "lea 0x8(%1),%1 \n"
1167 "sub $0x10,%3 \n"
1168 "ja 1b \n"
1169 : "+r"(src_uyvy), // %0
1170 "+r"(dst_u), // %1
1171 "+r"(dst_y), // %2
1172 "+r"(pix) // %3
1173 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
1174 : "memory", "cc"
1175#if defined(__SSE2__)
1176 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1177#endif
1178 );
1179}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001180#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001181
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001182#endif // defined(__x86_64__) || defined(__i386__)
1183
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00001184#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001185} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00001186} // namespace libyuv
1187#endif