/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)

// GCC 4.2 on OSX has a link error when passing static or const to inline.
// TODO(fbarchard): Use static const when GCC 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Constants for BGRA.
CONST vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

CONST vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

CONST vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
CONST vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

CONST vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

CONST vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
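
// Illustrative scalar reference for the constants above (a sketch added for
// clarity, not one of the shipped row functions). ARGB pixels are stored
// B, G, R, A in memory, and the SSSE3 paths below combine pmaddubsw, a
// shift and an offset, which per pixel works out to BT.601 studio swing:
//   Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16
//   U = ((112 * B - 74 * G - 38 * R) >> 8) + 128
//   V = ((-18 * B - 94 * G + 112 * R) >> 8) + 128
static inline uint8 RGBToY_C(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint8>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}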

// Shuffle table for converting RGB24 to ARGB.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ARGB to RGB24.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
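
// Illustrative sketch of how pshufb consumes the tables above (added for
// clarity, not part of the shipped code): output byte i comes from source
// byte mask[i], and an index with the high bit set (128) writes zero.
static inline void Shuffle16_C(const uint8* src, const uint8* mask,
                               uint8* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}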

void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "1: \n"
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "lea 0x20(%1),%1 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
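
// Illustrative scalar equivalent of I400ToARGBRow_SSE2 (a sketch, not the
// shipped path): replicate each luma byte into B, G and R and set alpha.
static inline void I400ToARGBPixel_C(uint8 y, uint8* dst_argb) {
  dst_argb[0] = y;     // B
  dst_argb[1] = y;     // G
  dst_argb[2] = y;     // R
  dst_argb[3] = 255u;  // A - the 0xff000000 mask OR'd in by the SIMD path
}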

void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20082008,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"  // dst -= src * 2 so output is addressed as (%1,%0,2)
    "sub %0,%1 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%1,%0,2) \n"
    "movdqa %%xmm1,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// TODO(fbarchard): Pass the xmm constants to a single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers: "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if the stack frame is disabled. Using two assembly blocks is a
// workaround, but is considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
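
// Illustrative scalar model of the UV path above (a sketch using the same
// BT.601 constants, not the shipped code): each 2x2 block of ARGB pixels
// from two rows is averaged, then reduced to one U and one V sample. The
// SIMD code averages with pavgb twice; a single rounded average of the four
// pixels is used here as an approximation.
static inline void ARGBBlockToUV_C(const uint8* row0, const uint8* row1,
                                   uint8* dst_u, uint8* dst_v) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *dst_u = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *dst_v = static_cast<uint8>(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}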

void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I420TOARGBROW_SSSE3
#define UB 127 /* 2.018 * 64 is 129, saturated to the int8 maximum of 127 */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif

struct {
  vec8 kUVToB;
  vec8 kUVToG;
  vec8 kUVToR;
  vec16 kUVBiasB;
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;
  vec16 kYToRgb;
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};
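
// Illustrative scalar model of the YUVTORGB macro below (a sketch under the
// fixed-point constants above, not the shipped path): each channel is
//   B = 1.164 * (Y - 16) + 2.018 * (U - 128)
//   G = 1.164 * (Y - 16) - 0.391 * (U - 128) - 0.813 * (V - 128)
//   R = 1.164 * (Y - 16) + 1.596 * (V - 128)
// computed in 6-bit fixed point and saturated to [0, 255] by packuswb.
static inline uint8 Clamp255_C(int v) {
  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static inline void YuvPixel_C(uint8 y, uint8 u, uint8 v,
                              uint8* b, uint8* g, uint8* r) {
  int y1 = (static_cast<int>(y) - 16) * YG;  // YG = 74 is 1.164 * 64.
  *b = Clamp255_C((y1 + UB * (u - 128)) >> 6);
  *g = Clamp255_C((y1 + UG * (u - 128) + VG * (v - 128)) >> 6);
  *r = Clamp255_C((y1 + VR * (v - 128)) >> 6);
}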

// Convert 8 pixels: 8 Y plus 4 U and 4 V.
#define YUVTORGB \
    "movd (%1),%%xmm0 \n" \
    "movd (%1,%2,1),%%xmm1 \n" \
    "lea 0x4(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    "1: \n"
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5,(%3) \n"
    "movdqa %%xmm0,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    "1: \n"
    "movd (%1),%%xmm0 \n"
    "movd (%1,%2,1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pmaddubsw (%5),%%xmm0 \n"
    "pmaddubsw 16(%5),%%xmm1 \n"
    "pmaddubsw 32(%5),%%xmm2 \n"
    "psubw 48(%5),%%xmm0 \n"
    "psubw 64(%5),%%xmm1 \n"
    "psubw 80(%5),%%xmm2 \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "punpcklbw %%xmm4,%%xmm3 \n"
    "psubsw 96(%5),%%xmm3 \n"
    "pmullw 112(%5),%%xmm3 \n"
    "paddsw %%xmm3,%%xmm0 \n"
    "paddsw %%xmm3,%%xmm1 \n"
    "paddsw %%xmm3,%%xmm2 \n"
    "psraw $0x6,%%xmm0 \n"
    "psraw $0x6,%%xmm1 \n"
    "psraw $0x6,%%xmm2 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "movdqa %%xmm0,(%3) \n"
    "lea 0x10(%3),%3 \n"
    "sub $0x4,%4 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif

#ifdef HAS_YTOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "mov $0x10001000,%%eax \n"
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "mov $0x012a012a,%%eax \n"
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "psubusw %%xmm3,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "por %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,16(%1) \n"
    "lea 32(%1),%1 \n"

    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif

#ifdef HAS_MIRRORROW_SSSE3

// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "lea -0x10(%0),%0 \n"
    "1: \n"
    "movdqa (%0,%2),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "ja 1b \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif

#ifdef HAS_MIRRORROW_SSE2

void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "lea -0x10(%0),%0 \n"
    "1: \n"
    "movdqu (%0,%2),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "psllw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufd $0x4e,%%xmm0,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "ja 1b \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif

#ifdef HAS_SPLITUV_SSE2
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm2,(%1,%2) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif
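
// Illustrative scalar equivalent of SplitUV_SSE2 above (a sketch, not the
// shipped path): deinterleave packed UV pairs into separate U and V planes.
static inline void SplitUV_C(const uint8* src_uv,
                             uint8* dst_u, uint8* dst_v, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[i * 2 + 0];
    dst_v[i] = src_uv[i * 2 + 1];
  }
}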

#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    "sub %0,%1 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa %%xmm0,(%0,%1) \n"
    "movdqa %%xmm1,0x10(%0,%1) \n"
    "lea 0x20(%0),%0 \n"
    "sub $0x20,%2 \n"
    "ja 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(count)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_X86
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    "shr $0x2,%2 \n"  // copies width / 4 dwords; width is a multiple of 4
    "rep movsl \n"
  : "+S"(src),       // %0
    "+D"(dst),       // %1
    "+c"(width_tmp)  // %2
  :
  : "memory", "cc"
  );
}
#endif

#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%4,1),%%xmm2 \n"
    "movdqa 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"
    "movq %%xmm1,(%1,%2) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
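
// Illustrative scalar model of the YUY2 routines (a sketch, not the shipped
// path): YUY2 packs two pixels as Y0 U0 Y1 V0, so luma is every even byte
// and each U/V pair is shared by two pixels. The UV row functions also
// average chroma across two rows, rounding as pavgb does.
static inline void YUY2PairToUV_C(const uint8* row0, const uint8* row1,
                                  uint8* dst_u, uint8* dst_v) {
  *dst_u = static_cast<uint8>((row0[1] + row1[1] + 1) >> 1);
  *dst_v = static_cast<uint8>((row0[3] + row1[3] + 1) >> 1);
}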

void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_v,
                                int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu (%0,%4,1),%%xmm2 \n"
    "movdqu 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"
    "movq %%xmm1,(%1,%2) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%4,1),%%xmm2 \n"
    "movdqa 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"
    "movq %%xmm1,(%1,%2) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu (%0,%4,1),%%xmm2 \n"
    "movdqu 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"
    "movq %%xmm1,(%1,%2) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_YUY2TOYROW_SSE2

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif