/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)

// GCC 4.2 on OSX has a link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Constants for BGRA
CONST vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

CONST vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

CONST vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
CONST vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

CONST vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

CONST vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
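
// A scalar sketch (illustrative only, not referenced by the assembly) of the
// fixed-point math the tables above implement. ARGB bytes are ordered
// B, G, R, A in memory, so the first coefficient of each row applies to blue.
// The shifts model psrlw/psraw; the helper names are hypothetical.
static inline uint8 RGBToYRef(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((33 * r + 65 * g + 13 * b) >> 7) + 16);
}
static inline uint8 RGBToURef(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static inline uint8 RGBToVRef(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}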

// Shuffle table for converting RGB24 to ARGB.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ARGB to RGB24.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
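
// For reference: pshufb builds each output byte from input byte mask[i]
// (an index 0-15), or writes zero when bit 7 of mask[i] is set, which is why
// the ARGB-to-RGB24/RAW tables pad with 128u. A scalar model, with a
// hypothetical name:
static inline void ShuffleBytesRef(const uint8 src[16], const uint8 mask[16],
                                   uint8 dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0u : src[mask[i] & 15];
  }
}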

void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    "1: \n"
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "lea 0x20(%1),%1 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm5"
#endif
  );
}
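
// A hedged scalar equivalent (name is illustrative): each Y byte is fanned
// out to B, G and R, and the 0xff000000 mask in xmm5 forces opaque alpha.
static void I400ToARGBRow_C_Ref(const uint8* src_y, uint8* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[0] = dst_argb[1] = dst_argb[2] = src_y[i];  // B = G = R = Y
    dst_argb[3] = 255u;  // A
    dst_argb += 4;
  }
}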

void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm5"
#endif
  );
}

void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm5"
#endif
  );
}

void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20082008,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
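
// The pmulhuw trick above is equivalent to the usual scalar RGB565
// expansion, replicating each field's top bits into its low bits so 0x1f
// maps to 0xff. A sketch under that reading (function name is illustrative):
static void RGB565ToARGBRow_C_Ref(const uint8* src, uint8* dst, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint16 rgb = static_cast<uint16>(src[0] | (src[1] << 8));
    int b = rgb & 0x1f, g = (rgb >> 5) & 0x3f, r = (rgb >> 11) & 0x1f;
    dst[0] = static_cast<uint8>((b << 3) | (b >> 2));  // B
    dst[1] = static_cast<uint8>((g << 2) | (g >> 4));  // G
    dst[2] = static_cast<uint8>((r << 3) | (r >> 2));  // R
    dst[3] = 255u;                                     // A
    src += 2;
    dst += 4;
  }
}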

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%1,%0,2) \n"
    "movdqa %%xmm1,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
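
// A scalar model of the pack above (illustrative name): keep the top 5/6/5
// bits of B, G and R; alpha is dropped.
static void ARGBToRGB565Row_C_Ref(const uint8* src, uint8* dst, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint16 rgb = static_cast<uint16>((src[0] >> 3) |          // 5 bits of B
                                     ((src[1] >> 2) << 5) |   // 6 bits of G
                                     ((src[2] >> 3) << 11));  // 5 bits of R
    dst[0] = static_cast<uint8>(rgb & 0xff);
    dst[1] = static_cast<uint8>(rgb >> 8);
    src += 4;
    dst += 2;
  }
}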

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if the stack frame is disabled. Using 2 assembly blocks is a
// workaround, but is considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
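
// A scalar model of the averaging above (illustrative name). The shufps /
// pavgb sequence 2x2 box-filters ARGB with the next row before applying the
// U/V coefficients; the chained pavgb rounds at each step, so this sketch
// matches it only up to small rounding differences.
static void ARGBToUVRow_C_Ref(const uint8* src_argb0, int src_stride_argb,
                              uint8* dst_u, uint8* dst_v, int width) {
  const uint8* row1 = src_argb0 + src_stride_argb;
  for (int x = 0; x < width; x += 2) {
    uint8 b = (src_argb0[0] + src_argb0[4] + row1[0] + row1[4] + 2) >> 2;
    uint8 g = (src_argb0[1] + src_argb0[5] + row1[1] + row1[5] + 2) >> 2;
    uint8 r = (src_argb0[2] + src_argb0[6] + row1[2] + row1[6] + 2) >> 2;
    *dst_u++ = RGBToURef(r, g, b);
    *dst_v++ = RGBToVRef(r, g, b);
    src_argb0 += 8;
    row1 += 8;
  }
}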

void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I420TOARGBROW_SSSE3
#define UB 127 /* 2.018 * 64 = 129, saturated to 127 to fit in a signed byte */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif

struct {
  vec8 kUVToB;
  vec8 kUVToG;
  vec8 kUVToR;
  vec16 kUVBiasB;
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;
  vec16 kYToRgb;
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};

// Convert 8 pixels: reads 4 UV pairs and 8 Y values, leaving 8 B, G and R
// results in xmm0, xmm1 and xmm2.
#define YUVTORGB \
    "movd (%1),%%xmm0 \n" \
    "movd (%1,%2,1),%%xmm1 \n" \
    "lea 0x4(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    "1: \n"
  YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    "1: \n"
  YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5,(%3) \n"
    "movdqa %%xmm0,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    "1: \n"
  YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    "1: \n"
    "movd (%1),%%xmm0 \n"
    "movd (%1,%2,1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pmaddubsw (%5),%%xmm0 \n"
    "pmaddubsw 16(%5),%%xmm1 \n"
    "pmaddubsw 32(%5),%%xmm2 \n"
    "psubw 48(%5),%%xmm0 \n"
    "psubw 64(%5),%%xmm1 \n"
    "psubw 80(%5),%%xmm2 \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "punpcklbw %%xmm4,%%xmm3 \n"
    "psubsw 96(%5),%%xmm3 \n"
    "pmullw 112(%5),%%xmm3 \n"
    "paddsw %%xmm3,%%xmm0 \n"
    "paddsw %%xmm3,%%xmm1 \n"
    "paddsw %%xmm3,%%xmm2 \n"
    "psraw $0x6,%%xmm0 \n"
    "psraw $0x6,%%xmm1 \n"
    "psraw $0x6,%%xmm2 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "movdqa %%xmm0,(%3) \n"
    "lea 0x10(%3),%3 \n"
    "sub $0x4,%4 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_I420TOARGBROW_SSSE3

#ifdef HAS_YTOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "mov $0x10001000,%%eax \n"
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "mov $0x012a012a,%%eax \n"
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "psubusw %%xmm3,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "por %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,16(%1) \n"
    "lea 32(%1),%1 \n"

    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_YTOARGBROW_SSE2

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "lea -0x10(%0),%0 \n"
    "1: \n"
    "movdqa (%0,%2),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "ja 1b \n"
  : "+r"(src),          // %0
    "+r"(dst),          // %1
    "+r"(temp_width)    // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm5"
#endif
  );
}
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_SSE2
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "lea -0x10(%0),%0 \n"
    "1: \n"
    "movdqu (%0,%2),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "psllw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufd $0x4e,%%xmm0,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "ja 1b \n"
  : "+r"(src),        // %0
    "+r"(dst),        // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_SSE2

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
CONST uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %4,%%xmm1 \n"
    "lea -16(%0,%3,2),%0 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "lea -16(%0),%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    "sub $8,%3 \n"
    "movlpd %%xmm0,(%1) \n"
    "movhpd %%xmm0,(%1,%2) \n"
    "lea 8(%1),%1 \n"
    "ja 1b \n"
  : "+r"(src),             // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+r"(temp_width)       // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_UV_SSSE3

#ifdef HAS_SPLITUV_SSE2
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm2,(%1,%2) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
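
// A hedged scalar equivalent of the mask/pack de-interleave above:
//   for (int i = 0; i < pix; ++i) {
//     dst_u[i] = src_uv[2 * i + 0];  // even bytes are U
//     dst_v[i] = src_uv[2 * i + 1];  // odd bytes are V
//   }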
#endif  // HAS_SPLITUV_SSE2

#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    "sub %0,%1 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa %%xmm0,(%0,%1) \n"
    "movdqa %%xmm1,0x10(%0,%1) \n"
    "lea 0x20(%0),%0 \n"
    "sub $0x20,%2 \n"
    "ja 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_X86
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    "shr $0x2,%2 \n"
    "rep movsl \n"
  : "+S"(src),       // %0
    "+D"(dst),       // %1
    "+c"(width_tmp)  // %2
  :
  : "memory", "cc"
  );
}
#endif  // HAS_COPYROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm5"
#endif
  );
}
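
// YUY2 packs pixels as Y0 U0 Y1 V0, so luma is every even byte. The
// pand/packuswb pair above is the vector form of this scalar loop:
//   for (int i = 0; i < pix; ++i) {
//     dst_y[i] = src_yuy2[2 * i];
//   }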

void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%4,1),%%xmm2 \n"
    "movdqa 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"
    "movq %%xmm1,(%1,%2) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_v,
                                int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu (%0,%4,1),%%xmm2 \n"
    "movdqu 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"
    "movq %%xmm1,(%1,%2) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1"
#endif
  );
}

void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%4,1),%%xmm2 \n"
    "movdqa 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"
    "movq %%xmm1,(%1,%2) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1"
#endif
  );
}

void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu (%0,%4,1),%%xmm2 \n"
    "movdqu 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"
    "movq %%xmm1,(%1,%2) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_YUY2TOYROW_SSE2

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif