blob: 06a06a52f28b250d1c6df9476248ad694c75a318 [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "row.h"
12
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
21#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
22
fbarchard@google.com714050a2012-02-17 22:59:56 +000023// GCC 4.2 on OSX has link error when passing static or const to inline.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000025#ifdef __APPLE__
26#define CONST
27#else
28#define CONST static const
29#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
// Per-byte signed weights for pmaddubsw; each table repeats one 4-entry
// pattern four times, i.e. one weight for each byte of four 4-byte pixels.
// kARGBToY is consumed by ARGBToYRow_SSSE3 (sum is shifted right by 7);
// kARGBToU / kARGBToV are consumed by ARGBToUVRow_SSSE3 (sum is shifted
// right by 8). The fourth byte of every pixel has weight 0.
// NOTE(review): values look like fixed-point BT.601 coefficients applied to
// B, G, R, A byte order - confirm against the C reference implementation.
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
// Same weights as the ARGB tables with the 4-entry pattern reversed, so the
// zero weight falls on the first byte of every pixel instead of the last.
// kBGRAToY is consumed by BGRAToYRow_SSSE3; kBGRAToU / kBGRAToV by
// BGRAToUVRow_SSSE3.
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
// Same weights as the ARGB tables with the first and third entries of each
// 4-entry pattern swapped; the zero weight stays on the fourth byte.
// NOTE(review): presumably consumed by ABGR row functions defined elsewhere
// in this file - no user is visible in this part of the source.
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
// Byte offsets added with paddb after packing:
// kAddY16 adds 16 to every luma byte (see the *ToYRow functions);
// kAddUV128 adds 128 to every chroma byte (see the *ToUVRow functions).
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
// The tables below are pshufb control masks: entry i is the index of the
// source byte copied to destination byte i; an entry with the top bit set
// (128) writes zero instead.
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
// Spreads the low 12 bytes (four 3-byte pixels) over four 4-byte pixels in
// the same byte order. The fourth byte of each pixel is taken from bytes
// 12..15; RGB24ToARGBRow_SSSE3 then ORs 0xff into that byte.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
// Same expansion as above, but the three bytes of every pixel are reversed.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
// Swaps bytes 0 and 2 of every 4-byte pixel; bytes 1 and 3 stay in place.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
// Reverses the four bytes of every pixel.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000101// Shuffle table for converting ARGB to RGB24.
// Packs the first three bytes of each of four pixels into the low 12 bytes
// and zeroes the top 4 bytes (index 128).
102CONST uvec8 kShuffleMaskARGBToRGB24 = {
103 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
104};
105
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000106// Shuffle table for converting ARGB to RAW.
// Same packing as above with the three kept bytes of every pixel reversed.
107CONST uvec8 kShuffleMaskARGBToRAW = {
108 2u, 1u,0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
109};
110
// Expands a row of single bytes into 4-byte pixels, 8 pixels per loop
// iteration. Each source byte is copied into the three low bytes of its
// output pixel and the top byte is forced to 0xff.
//   src_y    - source bytes; 8 are read per iteration with movq.
//   dst_argb - destination; written with movdqa, so it must be 16-byte
//              aligned. 32 bytes are written per iteration.
//   pix      - pixel count. The loop body runs once before the count is
//              tested and repeats while the remaining count is > 0, so a
//              count that is not a multiple of 8 is rounded up.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000112 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000113 "pcmpeqb %%xmm5,%%xmm5 \n"
114 "pslld $0x18,%%xmm5 \n" // xmm5 = 0xff000000 in every dword
115 "1: \n"
116 "movq (%0),%%xmm0 \n"
117 "lea 0x8(%0),%0 \n"
118 "punpcklbw %%xmm0,%%xmm0 \n" // duplicate every byte into a word
119 "movdqa %%xmm0,%%xmm1 \n"
120 "punpcklwd %%xmm0,%%xmm0 \n" // pixels 0..3: byte repeated 4 times
121 "punpckhwd %%xmm1,%%xmm1 \n" // pixels 4..7: byte repeated 4 times
122 "por %%xmm5,%%xmm0 \n"
123 "por %%xmm5,%%xmm1 \n"
124 "movdqa %%xmm0,(%1) \n"
125 "movdqa %%xmm1,0x10(%1) \n"
126 "lea 0x20(%1),%1 \n"
127 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000128 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000129 : "+r"(src_y), // %0
130 "+r"(dst_argb), // %1
131 "+r"(pix) // %2
132 :
133 : "memory", "cc"
134#if defined(__SSE2__)
135 , "xmm0", "xmm1", "xmm5"
136#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000137 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000138}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000139
// Reorders the bytes of every 4-byte pixel with pshufb using
// kShuffleMaskABGRToARGB (bytes 0 and 2 of each pixel are swapped),
// 4 pixels (16 bytes) per loop iteration.
//   src_abgr - source pixels; loaded with movdqa (16-byte aligned).
//   dst_argb - destination pixels; stored with movdqa (16-byte aligned).
//   pix      - pixel count. The body runs once before the count is tested
//              and repeats while the remaining count is > 0.
140void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000141 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000142 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000143 "sub %0,%1 \n" // %1 = dst - src, so (%0,%1,1) addresses dst
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000144 "1: \n"
145 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000147 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000148 "movdqa %%xmm0,(%0,%1,1) \n"
149 "lea 0x10(%0),%0 \n" // lea leaves the flags set by sub for jg
150 "jg 1b \n"
151
fbarchard@google.comb6149762011-11-07 21:58:52 +0000152 : "+r"(src_abgr), // %0
153 "+r"(dst_argb), // %1
154 "+r"(pix) // %2
155 : "m"(kShuffleMaskABGRToARGB) // %3
156 : "memory", "cc"
157#if defined(__SSE2__)
158 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000159#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000160 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000161}
162
// Reverses the four bytes of every pixel with pshufb using
// kShuffleMaskBGRAToARGB, 4 pixels (16 bytes) per loop iteration.
//   src_bgra - source pixels; loaded with movdqa (16-byte aligned).
//   dst_argb - destination pixels; stored with movdqa (16-byte aligned).
//   pix      - pixel count. The body runs once before the count is tested
//              and repeats while the remaining count is > 0.
163void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000164 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000165 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000166 "sub %0,%1 \n" // %1 = dst - src, so (%0,%1,1) addresses dst
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000167 "1: \n"
168 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000169 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000170 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000171 "movdqa %%xmm0,(%0,%1,1) \n"
172 "lea 0x10(%0),%0 \n" // lea leaves the flags set by sub for jg
173 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000174 : "+r"(src_bgra), // %0
175 "+r"(dst_argb), // %1
176 "+r"(pix) // %2
177 : "m"(kShuffleMaskBGRAToARGB) // %3
178 : "memory", "cc"
179#if defined(__SSE2__)
180 , "xmm0", "xmm5"
181#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000182 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000183}
184
// Expands packed 3-byte pixels to 4-byte pixels, 16 pixels per loop
// iteration. The three source bytes keep their order
// (kShuffleMaskRGB24ToARGB) and the fourth byte of every output pixel is
// set to 0xff.
//   src_rgb24 - source; 48 bytes are read per iteration with movdqu.
//   dst_argb  - destination; 64 bytes are written per iteration with movdqa
//               (16-byte aligned).
//   pix       - pixel count. The body runs once before the count is tested
//               and repeats while the remaining count is > 0.
// palignr is used to bring each group of 12 source bytes to the low end of
// a register before the shuffle.
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000185void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000186 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000187 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
188 "pslld $0x18,%%xmm5 \n"
189 "movdqa %3,%%xmm4 \n"
190 "1: \n"
191 "movdqu (%0),%%xmm0 \n"
192 "movdqu 0x10(%0),%%xmm1 \n"
193 "movdqu 0x20(%0),%%xmm3 \n"
194 "lea 0x30(%0),%0 \n"
195 "movdqa %%xmm3,%%xmm2 \n"
196 "palignr $0x8,%%xmm1,%%xmm2 \n" // xmm2 = source bytes 24..39 (pixels 8..11)
197 "pshufb %%xmm4,%%xmm2 \n"
198 "por %%xmm5,%%xmm2 \n"
199 "palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = source bytes 12..27 (pixels 4..7)
200 "pshufb %%xmm4,%%xmm0 \n"
201 "movdqa %%xmm2,0x20(%1) \n"
202 "por %%xmm5,%%xmm0 \n"
203 "pshufb %%xmm4,%%xmm1 \n"
204 "movdqa %%xmm0,(%1) \n"
205 "por %%xmm5,%%xmm1 \n"
206 "palignr $0x4,%%xmm3,%%xmm3 \n" // rotate so source bytes 36..47 are lowest
207 "pshufb %%xmm4,%%xmm3 \n"
208 "movdqa %%xmm1,0x10(%1) \n"
209 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000210 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000211 "movdqa %%xmm3,0x30(%1) \n"
212 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000213 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000214 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000215 "+r"(dst_argb), // %1
216 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000217 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000218 : "memory", "cc"
219#if defined(__SSE2__)
220 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
221#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000222 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000223}
224
// Expands packed 3-byte pixels to 4-byte pixels, 16 pixels per loop
// iteration. The three source bytes of every pixel are reversed
// (kShuffleMaskRAWToARGB) and the fourth output byte is set to 0xff.
//   src_raw  - source; 48 bytes are read per iteration with movdqu.
//   dst_argb - destination; 64 bytes are written per iteration with movdqa
//              (16-byte aligned).
//   pix      - pixel count. The body runs once before the count is tested
//              and repeats while the remaining count is > 0.
// The instruction sequence is the same as RGB24ToARGBRow_SSSE3; only the
// shuffle mask differs.
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000225void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000226 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000227 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
228 "pslld $0x18,%%xmm5 \n"
229 "movdqa %3,%%xmm4 \n"
230 "1: \n"
231 "movdqu (%0),%%xmm0 \n"
232 "movdqu 0x10(%0),%%xmm1 \n"
233 "movdqu 0x20(%0),%%xmm3 \n"
234 "lea 0x30(%0),%0 \n"
235 "movdqa %%xmm3,%%xmm2 \n"
236 "palignr $0x8,%%xmm1,%%xmm2 \n" // xmm2 = source bytes 24..39 (pixels 8..11)
237 "pshufb %%xmm4,%%xmm2 \n"
238 "por %%xmm5,%%xmm2 \n"
239 "palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = source bytes 12..27 (pixels 4..7)
240 "pshufb %%xmm4,%%xmm0 \n"
241 "movdqa %%xmm2,0x20(%1) \n"
242 "por %%xmm5,%%xmm0 \n"
243 "pshufb %%xmm4,%%xmm1 \n"
244 "movdqa %%xmm0,(%1) \n"
245 "por %%xmm5,%%xmm1 \n"
246 "palignr $0x4,%%xmm3,%%xmm3 \n" // rotate so source bytes 36..47 are lowest
247 "pshufb %%xmm4,%%xmm3 \n"
248 "movdqa %%xmm1,0x10(%1) \n"
249 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000250 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000251 "movdqa %%xmm3,0x30(%1) \n"
252 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000253 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000254 : "+r"(src_raw), // %0
255 "+r"(dst_argb), // %1
256 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000257 : "m"(kShuffleMaskRAWToARGB) // %3
258 : "memory", "cc"
259#if defined(__SSE2__)
260 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
261#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000262 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000263}
264
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000265void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
266 asm volatile (
267 "mov $0x1080108,%%eax \n"
268 "movd %%eax,%%xmm5 \n"
269 "pshufd $0x0,%%xmm5,%%xmm5 \n"
270 "mov $0x20082008,%%eax \n"
271 "movd %%eax,%%xmm6 \n"
272 "pshufd $0x0,%%xmm6,%%xmm6 \n"
273 "pcmpeqb %%xmm3,%%xmm3 \n"
274 "psllw $0xb,%%xmm3 \n"
275 "pcmpeqb %%xmm4,%%xmm4 \n"
276 "psllw $0xa,%%xmm4 \n"
277 "psrlw $0x5,%%xmm4 \n"
278 "pcmpeqb %%xmm7,%%xmm7 \n"
279 "psllw $0x8,%%xmm7 \n"
280 "sub %0,%1 \n"
281 "sub %0,%1 \n"
282 "1: \n"
283 "movdqu (%0),%%xmm0 \n"
284 "movdqa %%xmm0,%%xmm1 \n"
285 "movdqa %%xmm0,%%xmm2 \n"
286 "pand %%xmm3,%%xmm1 \n"
287 "psllw $0xb,%%xmm2 \n"
288 "pmulhuw %%xmm5,%%xmm1 \n"
289 "pmulhuw %%xmm5,%%xmm2 \n"
290 "psllw $0x8,%%xmm1 \n"
291 "por %%xmm2,%%xmm1 \n"
292 "pand %%xmm4,%%xmm0 \n"
293 "pmulhuw %%xmm6,%%xmm0 \n"
294 "por %%xmm7,%%xmm0 \n"
295 "movdqa %%xmm1,%%xmm2 \n"
296 "punpcklbw %%xmm0,%%xmm1 \n"
297 "punpckhbw %%xmm0,%%xmm2 \n"
298 "movdqa %%xmm1,(%1,%0,2) \n"
299 "movdqa %%xmm2,0x10(%1,%0,2) \n"
300 "lea 0x10(%0),%0 \n"
301 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000302 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000303 : "+r"(src), // %0
304 "+r"(dst), // %1
305 "+r"(pix) // %2
306 :
307 : "memory", "cc", "eax"
308#if defined(__SSE2__)
309 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
310#endif
311 );
312}
313
// Converts a row of 16-bit 1:5:5:5 pixels to 4-byte pixels, 8 pixels per
// loop iteration. For every 16-bit source word the output bytes are:
//   byte 0 = bits 0..4   expanded to 8 bits: (v << 3) | (v >> 2)
//   byte 1 = bits 5..9   expanded to 8 bits: (v << 3) | (v >> 2)
//   byte 2 = bits 10..14 expanded to 8 bits: (v << 3) | (v >> 2)
//   byte 3 = 0xff if bit 15 is set, otherwise 0x00
//   src - source; 16 bytes are read per iteration with movdqu.
//   dst - destination; 32 bytes are written per iteration with movdqa
//         (16-byte aligned).
//   pix - pixel count. The body runs once before the count is tested and
//         repeats while the remaining count is > 0.
314void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
315 asm volatile (
316 "mov $0x1080108,%%eax \n"
317 "movd %%eax,%%xmm5 \n"
318 "pshufd $0x0,%%xmm5,%%xmm5 \n" // xmm5 = 0x0108 per word (field at bits 11..15)
319 "mov $0x42004200,%%eax \n"
320 "movd %%eax,%%xmm6 \n"
321 "pshufd $0x0,%%xmm6,%%xmm6 \n" // xmm6 = 0x4200 per word (field at bits 5..9)
322 "pcmpeqb %%xmm3,%%xmm3 \n"
323 "psllw $0xb,%%xmm3 \n" // xmm3 = 0xf800 per word
324 "movdqa %%xmm3,%%xmm4 \n"
325 "psrlw $0x6,%%xmm4 \n" // xmm4 = 0x03e0 per word
326 "pcmpeqb %%xmm7,%%xmm7 \n"
327 "psllw $0x8,%%xmm7 \n" // xmm7 = 0xff00 per word
328 "sub %0,%1 \n"
329 "sub %0,%1 \n" // %1 = dst - 2 * src
330 "1: \n"
331 "movdqu (%0),%%xmm0 \n"
332 "movdqa %%xmm0,%%xmm1 \n"
333 "movdqa %%xmm0,%%xmm2 \n"
334 "psllw $0x1,%%xmm1 \n" // bits 10..14 moved to bits 11..15
335 "psllw $0xb,%%xmm2 \n" // bits 0..4 moved to bits 11..15
336 "pand %%xmm3,%%xmm1 \n"
337 "pmulhuw %%xmm5,%%xmm2 \n"
338 "pmulhuw %%xmm5,%%xmm1 \n"
339 "psllw $0x8,%%xmm1 \n"
340 "por %%xmm2,%%xmm1 \n" // word = byte2 << 8 | byte0
341 "movdqa %%xmm0,%%xmm2 \n"
342 "pand %%xmm4,%%xmm0 \n"
343 "psraw $0x8,%%xmm2 \n" // arithmetic shift smears bit 15 over the high byte
344 "pmulhuw %%xmm6,%%xmm0 \n"
345 "pand %%xmm7,%%xmm2 \n"
346 "por %%xmm2,%%xmm0 \n" // word = byte3 << 8 | byte1
347 "movdqa %%xmm1,%%xmm2 \n"
348 "punpcklbw %%xmm0,%%xmm1 \n"
349 "punpckhbw %%xmm0,%%xmm2 \n"
350 "movdqa %%xmm1,(%1,%0,2) \n"
351 "movdqa %%xmm2,0x10(%1,%0,2) \n"
352 "lea 0x10(%0),%0 \n"
353 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000354 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000355 : "+r"(src), // %0
356 "+r"(dst), // %1
357 "+r"(pix) // %2
358 :
359 : "memory", "cc", "eax"
360#if defined(__SSE2__)
361 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
362#endif
363 );
364}
365
// Converts a row of 16-bit 4:4:4:4 pixels to 4-byte pixels, 8 pixels per
// loop iteration. Every 4-bit nibble n becomes the byte (n << 4) | n; for
// each source byte the low nibble is emitted first, then the high nibble.
//   src - source; 16 bytes are read per iteration with movdqu.
//   dst - destination; 32 bytes are written per iteration with movdqa
//         (16-byte aligned).
//   pix - pixel count. The body runs once before the count is tested and
//         repeats while the remaining count is > 0.
366void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
367 asm volatile (
368 "mov $0xf0f0f0f,%%eax \n"
369 "movd %%eax,%%xmm4 \n"
370 "pshufd $0x0,%%xmm4,%%xmm4 \n" // xmm4 = 0x0f in every byte
371 "movdqa %%xmm4,%%xmm5 \n"
372 "pslld $0x4,%%xmm5 \n" // xmm5 = 0xf0 in every byte
373 "sub %0,%1 \n"
374 "sub %0,%1 \n" // %1 = dst - 2 * src
375 "1: \n"
376 "movdqu (%0),%%xmm0 \n"
377 "movdqa %%xmm0,%%xmm2 \n"
378 "pand %%xmm4,%%xmm0 \n" // low nibbles
379 "pand %%xmm5,%%xmm2 \n" // high nibbles
380 "movdqa %%xmm0,%%xmm1 \n"
381 "movdqa %%xmm2,%%xmm3 \n"
382 "psllw $0x4,%%xmm1 \n"
383 "psrlw $0x4,%%xmm3 \n"
384 "por %%xmm1,%%xmm0 \n"
385 "por %%xmm3,%%xmm2 \n"
386 "movdqa %%xmm0,%%xmm1 \n"
387 "punpcklbw %%xmm2,%%xmm0 \n"
388 "punpckhbw %%xmm2,%%xmm1 \n"
389 "movdqa %%xmm0,(%1,%0,2) \n"
390 "movdqa %%xmm1,0x10(%1,%0,2) \n"
391 "lea 0x10(%0),%0 \n"
392 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000393 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000394 : "+r"(src), // %0
395 "+r"(dst), // %1
396 "+r"(pix) // %2
397 :
398 : "memory", "cc", "eax"
399#if defined(__SSE2__)
400 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
401#endif
402 );
403}
404
// Packs 4-byte pixels into 3-byte pixels by dropping the fourth byte of
// every pixel (kShuffleMaskARGBToRGB24), 16 pixels per loop iteration.
//   src - source; 64 bytes are read per iteration with movdqa (16-byte
//         aligned).
//   dst - destination; 48 bytes are written per iteration with movdqa
//         (16-byte aligned).
//   pix - pixel count. The body runs once before the count is tested and
//         repeats while the remaining count is > 0.
// After the shuffle each register holds 12 valid bytes and 4 zero bytes;
// the byte shifts and ORs below splice four such registers into three full
// 16-byte outputs.
405void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
406 asm volatile (
407 "movdqa %3,%%xmm6 \n"
408 "1: \n"
409 "movdqa (%0),%%xmm0 \n"
410 "movdqa 0x10(%0),%%xmm1 \n"
411 "movdqa 0x20(%0),%%xmm2 \n"
412 "movdqa 0x30(%0),%%xmm3 \n"
413 "lea 0x40(%0),%0 \n"
414 "pshufb %%xmm6,%%xmm0 \n"
415 "pshufb %%xmm6,%%xmm1 \n"
416 "pshufb %%xmm6,%%xmm2 \n"
417 "pshufb %%xmm6,%%xmm3 \n"
418 "movdqa %%xmm1,%%xmm4 \n"
419 "psrldq $0x4,%%xmm1 \n"
420 "pslldq $0xc,%%xmm4 \n"
421 "movdqa %%xmm2,%%xmm5 \n"
422 "por %%xmm4,%%xmm0 \n" // out0 = 12 bytes of reg0 + first 4 of reg1
423 "pslldq $0x8,%%xmm5 \n"
424 "movdqa %%xmm0,(%1) \n"
425 "por %%xmm5,%%xmm1 \n" // out1 = last 8 of reg1 + first 8 of reg2
426 "psrldq $0x8,%%xmm2 \n"
427 "pslldq $0x4,%%xmm3 \n"
428 "por %%xmm3,%%xmm2 \n" // out2 = last 4 of reg2 + 12 bytes of reg3
429 "movdqa %%xmm1,0x10(%1) \n"
430 "movdqa %%xmm2,0x20(%1) \n"
431 "lea 0x30(%1),%1 \n"
432 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000433 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000434 : "+r"(src), // %0
435 "+r"(dst), // %1
436 "+r"(pix) // %2
437 : "m"(kShuffleMaskARGBToRGB24) // %3
438 : "memory", "cc"
439#if defined(__SSE2__)
440 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
441#endif
442 );
443}
444
// Packs 4-byte pixels into 3-byte pixels, dropping the fourth byte of every
// pixel and reversing the remaining three (kShuffleMaskARGBToRAW),
// 16 pixels per loop iteration.
//   src - source; 64 bytes are read per iteration with movdqa (16-byte
//         aligned).
//   dst - destination; 48 bytes are written per iteration with movdqa
//         (16-byte aligned).
//   pix - pixel count. The body runs once before the count is tested and
//         repeats while the remaining count is > 0.
// The instruction sequence is the same as ARGBToRGB24Row_SSSE3; only the
// shuffle mask differs.
445void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
446 asm volatile (
447 "movdqa %3,%%xmm6 \n"
448 "1: \n"
449 "movdqa (%0),%%xmm0 \n"
450 "movdqa 0x10(%0),%%xmm1 \n"
451 "movdqa 0x20(%0),%%xmm2 \n"
452 "movdqa 0x30(%0),%%xmm3 \n"
453 "lea 0x40(%0),%0 \n"
454 "pshufb %%xmm6,%%xmm0 \n"
455 "pshufb %%xmm6,%%xmm1 \n"
456 "pshufb %%xmm6,%%xmm2 \n"
457 "pshufb %%xmm6,%%xmm3 \n"
458 "movdqa %%xmm1,%%xmm4 \n"
459 "psrldq $0x4,%%xmm1 \n"
460 "pslldq $0xc,%%xmm4 \n"
461 "movdqa %%xmm2,%%xmm5 \n"
462 "por %%xmm4,%%xmm0 \n" // out0 = 12 bytes of reg0 + first 4 of reg1
463 "pslldq $0x8,%%xmm5 \n"
464 "movdqa %%xmm0,(%1) \n"
465 "por %%xmm5,%%xmm1 \n" // out1 = last 8 of reg1 + first 8 of reg2
466 "psrldq $0x8,%%xmm2 \n"
467 "pslldq $0x4,%%xmm3 \n"
468 "por %%xmm3,%%xmm2 \n" // out2 = last 4 of reg2 + 12 bytes of reg3
469 "movdqa %%xmm1,0x10(%1) \n"
470 "movdqa %%xmm2,0x20(%1) \n"
471 "lea 0x30(%1),%1 \n"
472 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000473 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000474 : "+r"(src), // %0
475 "+r"(dst), // %1
476 "+r"(pix) // %2
477 : "m"(kShuffleMaskARGBToRAW) // %3
478 : "memory", "cc"
479#if defined(__SSE2__)
480 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
481#endif
482 );
483}
484
// Packs 4-byte pixels into 16-bit 5:6:5 pixels, 4 pixels per loop
// iteration. With b0..b3 the bytes of a source pixel, the output word is
//   (b0 >> 3) | ((b1 >> 2) << 5) | ((b2 >> 3) << 11)
// and b3 is discarded.
//   src - source; 16 bytes are read per iteration with movdqa (16-byte
//         aligned).
//   dst - destination; 8 bytes are written per iteration with movq.
//   pix - pixel count. The body runs once before the count is tested and
//         repeats while the remaining count is > 0.
485void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
486 asm volatile (
487 "pcmpeqb %%xmm3,%%xmm3 \n"
488 "psrld $0x1b,%%xmm3 \n" // xmm3 = 0x0000001f per dword
489 "pcmpeqb %%xmm4,%%xmm4 \n"
490 "psrld $0x1a,%%xmm4 \n"
491 "pslld $0x5,%%xmm4 \n" // xmm4 = 0x000007e0 per dword
492 "pcmpeqb %%xmm5,%%xmm5 \n"
493 "pslld $0xb,%%xmm5 \n" // xmm5 = 0xfffff800 per dword
494 "1: \n"
495 "movdqa (%0),%%xmm0 \n"
496 "movdqa %%xmm0,%%xmm1 \n"
497 "movdqa %%xmm0,%%xmm2 \n"
498 "pslld $0x8,%%xmm0 \n"
499 "psrld $0x3,%%xmm1 \n"
500 "psrld $0x5,%%xmm2 \n"
501 "psrad $0x10,%%xmm0 \n" // b2 now in bits 8..15, sign-extended above
502 "pand %%xmm3,%%xmm1 \n"
503 "pand %%xmm4,%%xmm2 \n"
504 "pand %%xmm5,%%xmm0 \n"
505 "por %%xmm2,%%xmm1 \n"
506 "por %%xmm1,%%xmm0 \n"
507 "packssdw %%xmm0,%%xmm0 \n" // sign extension keeps the signed pack from saturating
508 "lea 0x10(%0),%0 \n"
509 "movq %%xmm0,(%1) \n"
510 "lea 0x8(%1),%1 \n"
511 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000512 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000513 : "+r"(src), // %0
514 "+r"(dst), // %1
515 "+r"(pix) // %2
516 :
517 : "memory", "cc"
518#if defined(__SSE2__)
519 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
520#endif
521 );
522}
523
// Packs 4-byte pixels into 16-bit 1:5:5:5 pixels, 4 pixels per loop
// iteration. With b0..b3 the bytes of a source pixel, the output word is
//   (b0 >> 3) | ((b1 >> 3) << 5) | ((b2 >> 3) << 10) | ((b3 >> 7) << 15)
//   src - source; 16 bytes are read per iteration with movdqa (16-byte
//         aligned).
//   dst - destination; 8 bytes are written per iteration with movq.
//   pix - pixel count. The body runs once before the count is tested and
//         repeats while the remaining count is > 0.
524void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
525 asm volatile (
526 "pcmpeqb %%xmm4,%%xmm4 \n"
527 "psrld $0x1b,%%xmm4 \n" // xmm4 = 0x0000001f per dword
528 "movdqa %%xmm4,%%xmm5 \n"
529 "pslld $0x5,%%xmm5 \n" // xmm5 = 0x000003e0 per dword
530 "movdqa %%xmm4,%%xmm6 \n"
531 "pslld $0xa,%%xmm6 \n" // xmm6 = 0x00007c00 per dword
532 "pcmpeqb %%xmm7,%%xmm7 \n"
533 "pslld $0xf,%%xmm7 \n" // xmm7 = 0xffff8000 per dword
534 "1: \n"
535 "movdqa (%0),%%xmm0 \n"
536 "movdqa %%xmm0,%%xmm1 \n"
537 "movdqa %%xmm0,%%xmm2 \n"
538 "movdqa %%xmm0,%%xmm3 \n"
539 "psrad $0x10,%%xmm0 \n" // top bit of b3 lands on bit 15, sign-extended above
540 "psrld $0x3,%%xmm1 \n"
541 "psrld $0x6,%%xmm2 \n"
542 "psrld $0x9,%%xmm3 \n"
543 "pand %%xmm7,%%xmm0 \n"
544 "pand %%xmm4,%%xmm1 \n"
545 "pand %%xmm5,%%xmm2 \n"
546 "pand %%xmm6,%%xmm3 \n"
547 "por %%xmm1,%%xmm0 \n"
548 "por %%xmm3,%%xmm2 \n"
549 "por %%xmm2,%%xmm0 \n"
550 "packssdw %%xmm0,%%xmm0 \n" // sign extension keeps the signed pack from saturating
551 "lea 0x10(%0),%0 \n"
552 "movq %%xmm0,(%1) \n"
553 "lea 0x8(%1),%1 \n"
554 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000555 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000556 : "+r"(src), // %0
557 "+r"(dst), // %1
558 "+r"(pix) // %2
559 :
560 : "memory", "cc"
561#if defined(__SSE2__)
562 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
563#endif
564 );
565}
566
// Packs 4-byte pixels into 16-bit 4:4:4:4 pixels, 4 pixels per loop
// iteration. Every pair of source bytes (s0, s1) becomes one output byte
//   (s0 >> 4) | (s1 & 0xf0)
// i.e. the top nibble of each source byte is kept.
//   src - source; 16 bytes are read per iteration with movdqa (16-byte
//         aligned).
//   dst - destination; 8 bytes are written per iteration with movq.
//   pix - pixel count. The body runs once before the count is tested and
//         repeats while the remaining count is > 0.
// Note: xmm2 is listed as clobbered but is not used by the code below.
567void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
568 asm volatile (
569 "pcmpeqb %%xmm4,%%xmm4 \n"
570 "psllw $0xc,%%xmm4 \n" // xmm4 = 0xf000 per word
571 "movdqa %%xmm4,%%xmm3 \n"
572 "psrlw $0x8,%%xmm3 \n" // xmm3 = 0x00f0 per word
573 "1: \n"
574 "movdqa (%0),%%xmm0 \n"
575 "movdqa %%xmm0,%%xmm1 \n"
576 "pand %%xmm3,%%xmm0 \n"
577 "pand %%xmm4,%%xmm1 \n"
578 "psrlq $0x4,%%xmm0 \n" // masked bits stay inside their own word
579 "psrlq $0x8,%%xmm1 \n"
580 "por %%xmm1,%%xmm0 \n"
581 "packuswb %%xmm0,%%xmm0 \n"
582 "lea 0x10(%0),%0 \n"
583 "movq %%xmm0,(%1) \n"
584 "lea 0x8(%1),%1 \n"
585 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000586 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000587 : "+r"(src), // %0
588 "+r"(dst), // %1
589 "+r"(pix) // %2
590 :
591 : "memory", "cc"
592#if defined(__SSE2__)
593 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
594#endif
595 );
596}
597
// Computes one luma byte per 4-byte pixel, 16 pixels per loop iteration.
// With b0..b3 the bytes of a source pixel and the weights from kARGBToY:
//   y = ((13 * b0 + 65 * b1 + 33 * b2 + 0 * b3) >> 7) + 16
//   src_argb - source; 64 bytes are read per iteration with movdqa
//              (16-byte aligned).
//   dst_y    - destination; 16 bytes are written per iteration with movdqa
//              (16-byte aligned).
//   pix      - pixel count. The body runs once before the count is tested
//              and repeats while the remaining count is > 0.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000598void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000599 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000600 "movdqa %4,%%xmm5 \n"
601 "movdqa %3,%%xmm4 \n"
602 "1: \n"
603 "movdqa (%0),%%xmm0 \n"
604 "movdqa 0x10(%0),%%xmm1 \n"
605 "movdqa 0x20(%0),%%xmm2 \n"
606 "movdqa 0x30(%0),%%xmm3 \n"
607 "pmaddubsw %%xmm4,%%xmm0 \n" // two partial sums (words) per pixel
608 "pmaddubsw %%xmm4,%%xmm1 \n"
609 "pmaddubsw %%xmm4,%%xmm2 \n"
610 "pmaddubsw %%xmm4,%%xmm3 \n"
611 "lea 0x40(%0),%0 \n"
612 "phaddw %%xmm1,%%xmm0 \n" // one weighted sum (word) per pixel
613 "phaddw %%xmm3,%%xmm2 \n"
614 "psrlw $0x7,%%xmm0 \n"
615 "psrlw $0x7,%%xmm2 \n"
616 "packuswb %%xmm2,%%xmm0 \n"
617 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000618 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000619 "movdqa %%xmm0,(%1) \n"
620 "lea 0x10(%1),%1 \n" // lea leaves the flags set by sub for jg
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000621 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000622 : "+r"(src_argb), // %0
623 "+r"(dst_y), // %1
624 "+r"(pix) // %2
625 : "m"(kARGBToY), // %3
626 "m"(kAddY16) // %4
627 : "memory", "cc"
628#if defined(__SSE2__)
629 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
630#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000631 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000632}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000633
// Same computation as ARGBToYRow_SSSE3, but the pixel loads and the luma
// store use movdqu, so src_argb and dst_y need no particular alignment.
// The two constant tables are still loaded with movdqa.
//   y = ((13 * b0 + 65 * b1 + 33 * b2 + 0 * b3) >> 7) + 16
// 16 pixels are processed per iteration; the body runs once before the
// count is tested and repeats while the remaining count is > 0.
634void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
635 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000636 "movdqa %4,%%xmm5 \n"
637 "movdqa %3,%%xmm4 \n"
638 "1: \n"
639 "movdqu (%0),%%xmm0 \n"
640 "movdqu 0x10(%0),%%xmm1 \n"
641 "movdqu 0x20(%0),%%xmm2 \n"
642 "movdqu 0x30(%0),%%xmm3 \n"
643 "pmaddubsw %%xmm4,%%xmm0 \n" // two partial sums (words) per pixel
644 "pmaddubsw %%xmm4,%%xmm1 \n"
645 "pmaddubsw %%xmm4,%%xmm2 \n"
646 "pmaddubsw %%xmm4,%%xmm3 \n"
647 "lea 0x40(%0),%0 \n"
648 "phaddw %%xmm1,%%xmm0 \n" // one weighted sum (word) per pixel
649 "phaddw %%xmm3,%%xmm2 \n"
650 "psrlw $0x7,%%xmm0 \n"
651 "psrlw $0x7,%%xmm2 \n"
652 "packuswb %%xmm2,%%xmm0 \n"
653 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000654 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000655 "movdqu %%xmm0,(%1) \n"
656 "lea 0x10(%1),%1 \n" // lea leaves the flags set by sub for jg
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000657 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000658 : "+r"(src_argb), // %0
659 "+r"(dst_y), // %1
660 "+r"(pix) // %2
661 : "m"(kARGBToY), // %3
662 "m"(kAddY16) // %4
663 : "memory", "cc"
664#if defined(__SSE2__)
665 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
666#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000667 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000668}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000669
fbarchard@google.com714050a2012-02-17 22:59:56 +0000670// TODO(fbarchard): pass xmm constants to single block of assembly.
671// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
672// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
673// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
674// and considered unsafe.
// Computes subsampled chroma from two rows of 4-byte pixels. Each loop
// iteration consumes 16 pixels from each row and produces 8 U bytes and
// 8 V bytes.
// Every 2x2 group of pixels is first averaged per byte with pavgb
// (vertically, then horizontally). With a0..a3 the averaged bytes:
//   u = ((112 * a0 - 74 * a1 - 38 * a2) >> 8) + 128   (kARGBToU)
//   v = ((-18 * a0 - 94 * a1 + 112 * a2) >> 8) + 128  (kARGBToV)
//   src_argb0       - first row; read with movdqa (16-byte aligned).
//   src_stride_argb - byte offset from the first row to the second row; the
//                     second row is a pavgb memory operand and so must also
//                     be 16-byte aligned.
//   dst_u, dst_v    - destinations; 8 bytes each are written per iteration
//                     with movlps / movhps.
//   width           - pixel count. The body runs once before the count is
//                     tested and repeats while the remaining count is > 0.
// The first asm statement loads the constants into xmm3/xmm4/xmm5 and the
// second statement relies on those registers still holding them.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000675void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
676 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000677 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000678 "movdqa %0,%%xmm4 \n"
679 "movdqa %1,%%xmm3 \n"
680 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000681 :
682 : "m"(kARGBToU), // %0
683 "m"(kARGBToV), // %1
684 "m"(kAddUV128) // %2
685 :
686#if defined(__SSE2__)
687 "xmm3", "xmm4", "xmm5"
688#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000689 );
690 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000691 "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses dst_v
692 "1: \n"
693 "movdqa (%0),%%xmm0 \n"
694 "movdqa 0x10(%0),%%xmm1 \n"
695 "movdqa 0x20(%0),%%xmm2 \n"
696 "movdqa 0x30(%0),%%xmm6 \n"
697 "pavgb (%0,%4,1),%%xmm0 \n" // average with the row at src + stride
698 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
699 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
700 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
701 "lea 0x40(%0),%0 \n"
702 "movdqa %%xmm0,%%xmm7 \n"
703 "shufps $0x88,%%xmm1,%%xmm0 \n" // even-numbered pixels
704 "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-numbered pixels
705 "pavgb %%xmm7,%%xmm0 \n"
706 "movdqa %%xmm2,%%xmm7 \n"
707 "shufps $0x88,%%xmm6,%%xmm2 \n"
708 "shufps $0xdd,%%xmm6,%%xmm7 \n"
709 "pavgb %%xmm7,%%xmm2 \n"
710 "movdqa %%xmm0,%%xmm1 \n"
711 "movdqa %%xmm2,%%xmm6 \n"
712 "pmaddubsw %%xmm4,%%xmm0 \n" // U weights
713 "pmaddubsw %%xmm4,%%xmm2 \n"
714 "pmaddubsw %%xmm3,%%xmm1 \n" // V weights
715 "pmaddubsw %%xmm3,%%xmm6 \n"
716 "phaddw %%xmm2,%%xmm0 \n"
717 "phaddw %%xmm6,%%xmm1 \n"
718 "psraw $0x8,%%xmm0 \n"
719 "psraw $0x8,%%xmm1 \n"
720 "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
721 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000722 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000723 "movlps %%xmm0,(%1) \n"
724 "movhps %%xmm0,(%1,%2,1) \n"
725 "lea 0x8(%1),%1 \n" // lea leaves the flags set by sub for jg
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000726 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000727 : "+r"(src_argb0), // %0
728 "+r"(dst_u), // %1
729 "+r"(dst_v), // %2
730 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000731 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000732 : "memory", "cc"
733#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000734 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000735#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000736 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000737}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000738
// Same computation as ARGBToUVRow_SSSE3 for source rows with no alignment
// requirement: both rows are loaded with movdqu (the second row goes
// through xmm7 before pavgb instead of being a memory operand).
// Each loop iteration consumes 16 pixels from each row and produces 8 U
// bytes and 8 V bytes. With a0..a3 the 2x2-averaged bytes of a pixel:
//   u = ((112 * a0 - 74 * a1 - 38 * a2) >> 8) + 128   (kARGBToU)
//   v = ((-18 * a0 - 94 * a1 + 112 * a2) >> 8) + 128  (kARGBToV)
//   src_stride_argb - byte offset from the first row to the second row.
//   width           - pixel count. The body runs once before the count is
//                     tested and repeats while the remaining count is > 0.
// The first asm statement loads the constants into xmm3/xmm4/xmm5 and the
// second statement relies on those registers still holding them.
739void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
740 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000741 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000742 "movdqa %0,%%xmm4 \n"
743 "movdqa %1,%%xmm3 \n"
744 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000745 :
746 : "m"(kARGBToU), // %0
747 "m"(kARGBToV), // %1
748 "m"(kAddUV128) // %2
749 :
750#if defined(__SSE2__)
751 "xmm3", "xmm4", "xmm5"
752#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000753 );
754 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000755 "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses dst_v
756 "1: \n"
757 "movdqu (%0),%%xmm0 \n"
758 "movdqu 0x10(%0),%%xmm1 \n"
759 "movdqu 0x20(%0),%%xmm2 \n"
760 "movdqu 0x30(%0),%%xmm6 \n"
761 "movdqu (%0,%4,1),%%xmm7 \n" // second row, loaded unaligned
762 "pavgb %%xmm7,%%xmm0 \n"
763 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
764 "pavgb %%xmm7,%%xmm1 \n"
765 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
766 "pavgb %%xmm7,%%xmm2 \n"
767 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
768 "pavgb %%xmm7,%%xmm6 \n"
769 "lea 0x40(%0),%0 \n"
770 "movdqa %%xmm0,%%xmm7 \n"
771 "shufps $0x88,%%xmm1,%%xmm0 \n" // even-numbered pixels
772 "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-numbered pixels
773 "pavgb %%xmm7,%%xmm0 \n"
774 "movdqa %%xmm2,%%xmm7 \n"
775 "shufps $0x88,%%xmm6,%%xmm2 \n"
776 "shufps $0xdd,%%xmm6,%%xmm7 \n"
777 "pavgb %%xmm7,%%xmm2 \n"
778 "movdqa %%xmm0,%%xmm1 \n"
779 "movdqa %%xmm2,%%xmm6 \n"
780 "pmaddubsw %%xmm4,%%xmm0 \n" // U weights
781 "pmaddubsw %%xmm4,%%xmm2 \n"
782 "pmaddubsw %%xmm3,%%xmm1 \n" // V weights
783 "pmaddubsw %%xmm3,%%xmm6 \n"
784 "phaddw %%xmm2,%%xmm0 \n"
785 "phaddw %%xmm6,%%xmm1 \n"
786 "psraw $0x8,%%xmm0 \n"
787 "psraw $0x8,%%xmm1 \n"
788 "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
789 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000790 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000791 "movlps %%xmm0,(%1) \n"
792 "movhps %%xmm0,(%1,%2,1) \n"
793 "lea 0x8(%1),%1 \n" // lea leaves the flags set by sub for jg
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000794 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000795 : "+r"(src_argb0), // %0
796 "+r"(dst_u), // %1
797 "+r"(dst_v), // %2
798 "+rm"(width) // %3
799 : "r"(static_cast<intptr_t>(src_stride_argb))
800 : "memory", "cc"
801#if defined(__SSE2__)
802 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
803#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000804 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000805}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000806
// Computes one luma byte per 4-byte pixel, 16 pixels per loop iteration.
// With b0..b3 the bytes of a source pixel and the weights from kBGRAToY:
//   y = ((0 * b0 + 33 * b1 + 65 * b2 + 13 * b3) >> 7) + 16
//   src_bgra - source; 64 bytes are read per iteration with movdqa
//              (16-byte aligned).
//   dst_y    - destination; 16 bytes are written per iteration with movdqa
//              (16-byte aligned).
//   pix      - pixel count. The body runs once before the count is tested
//              and repeats while the remaining count is > 0.
fbarchard@google.com714050a2012-02-17 22:59:56 +0000807void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
808 asm volatile (
809 "movdqa %4,%%xmm5 \n"
810 "movdqa %3,%%xmm4 \n"
811 "1: \n"
812 "movdqa (%0),%%xmm0 \n"
813 "movdqa 0x10(%0),%%xmm1 \n"
814 "movdqa 0x20(%0),%%xmm2 \n"
815 "movdqa 0x30(%0),%%xmm3 \n"
816 "pmaddubsw %%xmm4,%%xmm0 \n" // two partial sums (words) per pixel
817 "pmaddubsw %%xmm4,%%xmm1 \n"
818 "pmaddubsw %%xmm4,%%xmm2 \n"
819 "pmaddubsw %%xmm4,%%xmm3 \n"
820 "lea 0x40(%0),%0 \n"
821 "phaddw %%xmm1,%%xmm0 \n" // one weighted sum (word) per pixel
822 "phaddw %%xmm3,%%xmm2 \n"
823 "psrlw $0x7,%%xmm0 \n"
824 "psrlw $0x7,%%xmm2 \n"
825 "packuswb %%xmm2,%%xmm0 \n"
826 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000827 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000828 "movdqa %%xmm0,(%1) \n"
829 "lea 0x10(%1),%1 \n" // lea leaves the flags set by sub for jg
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000830 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000831 : "+r"(src_bgra), // %0
832 "+r"(dst_y), // %1
833 "+r"(pix) // %2
834 : "m"(kBGRAToY), // %3
835 "m"(kAddY16) // %4
836 : "memory", "cc"
837#if defined(__SSE2__)
838 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000839#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000840 );
841}
842
// Same computation as BGRAToYRow_SSSE3, but the pixel loads and the luma
// store use movdqu, so src_bgra and dst_y need no particular alignment.
// The two constant tables are still loaded with movdqa.
//   y = ((0 * b0 + 33 * b1 + 65 * b2 + 13 * b3) >> 7) + 16
// 16 pixels are processed per iteration; the body runs once before the
// count is tested and repeats while the remaining count is > 0.
843void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
844 asm volatile (
845 "movdqa %4,%%xmm5 \n"
846 "movdqa %3,%%xmm4 \n"
847 "1: \n"
848 "movdqu (%0),%%xmm0 \n"
849 "movdqu 0x10(%0),%%xmm1 \n"
850 "movdqu 0x20(%0),%%xmm2 \n"
851 "movdqu 0x30(%0),%%xmm3 \n"
852 "pmaddubsw %%xmm4,%%xmm0 \n" // two partial sums (words) per pixel
853 "pmaddubsw %%xmm4,%%xmm1 \n"
854 "pmaddubsw %%xmm4,%%xmm2 \n"
855 "pmaddubsw %%xmm4,%%xmm3 \n"
856 "lea 0x40(%0),%0 \n"
857 "phaddw %%xmm1,%%xmm0 \n" // one weighted sum (word) per pixel
858 "phaddw %%xmm3,%%xmm2 \n"
859 "psrlw $0x7,%%xmm0 \n"
860 "psrlw $0x7,%%xmm2 \n"
861 "packuswb %%xmm2,%%xmm0 \n"
862 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000863 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000864 "movdqu %%xmm0,(%1) \n"
865 "lea 0x10(%1),%1 \n" // lea leaves the flags set by sub for jg
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000866 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000867 : "+r"(src_bgra), // %0
868 "+r"(dst_y), // %1
869 "+r"(pix) // %2
870 : "m"(kBGRAToY), // %3
871 "m"(kAddY16) // %4
872 : "memory", "cc"
873#if defined(__SSE2__)
874 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
875#endif
876 );
877}
878
// Computes subsampled chroma from two rows of 4-byte pixels. Each loop
// iteration consumes 16 pixels from each row and produces 8 U bytes and
// 8 V bytes.
// Every 2x2 group of pixels is first averaged per byte with pavgb
// (vertically, then horizontally). With a0..a3 the averaged bytes:
//   u = ((-38 * a1 - 74 * a2 + 112 * a3) >> 8) + 128  (kBGRAToU)
//   v = ((112 * a1 - 94 * a2 - 18 * a3) >> 8) + 128   (kBGRAToV)
//   src_bgra0       - first row; read with movdqa (16-byte aligned).
//   src_stride_bgra - byte offset from the first row to the second row; the
//                     second row is a pavgb memory operand and so must also
//                     be 16-byte aligned.
//   dst_u, dst_v    - destinations; 8 bytes each are written per iteration
//                     with movlps / movhps.
//   width           - pixel count. The body runs once before the count is
//                     tested and repeats while the remaining count is > 0.
// The first asm statement loads the constants into xmm3/xmm4/xmm5 and the
// second statement relies on those registers still holding them.
879void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
880 uint8* dst_u, uint8* dst_v, int width) {
881 asm volatile (
882 "movdqa %0,%%xmm4 \n"
883 "movdqa %1,%%xmm3 \n"
884 "movdqa %2,%%xmm5 \n"
885 :
886 : "m"(kBGRAToU), // %0
887 "m"(kBGRAToV), // %1
888 "m"(kAddUV128) // %2
889 :
890#if defined(__SSE2__)
891 "xmm3", "xmm4", "xmm5"
892#endif
893 );
894 asm volatile (
895 "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses dst_v
896 "1: \n"
897 "movdqa (%0),%%xmm0 \n"
898 "movdqa 0x10(%0),%%xmm1 \n"
899 "movdqa 0x20(%0),%%xmm2 \n"
900 "movdqa 0x30(%0),%%xmm6 \n"
901 "pavgb (%0,%4,1),%%xmm0 \n" // average with the row at src + stride
902 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
903 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
904 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
905 "lea 0x40(%0),%0 \n"
906 "movdqa %%xmm0,%%xmm7 \n"
907 "shufps $0x88,%%xmm1,%%xmm0 \n" // even-numbered pixels
908 "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-numbered pixels
909 "pavgb %%xmm7,%%xmm0 \n"
910 "movdqa %%xmm2,%%xmm7 \n"
911 "shufps $0x88,%%xmm6,%%xmm2 \n"
912 "shufps $0xdd,%%xmm6,%%xmm7 \n"
913 "pavgb %%xmm7,%%xmm2 \n"
914 "movdqa %%xmm0,%%xmm1 \n"
915 "movdqa %%xmm2,%%xmm6 \n"
916 "pmaddubsw %%xmm4,%%xmm0 \n" // U weights
917 "pmaddubsw %%xmm4,%%xmm2 \n"
918 "pmaddubsw %%xmm3,%%xmm1 \n" // V weights
919 "pmaddubsw %%xmm3,%%xmm6 \n"
920 "phaddw %%xmm2,%%xmm0 \n"
921 "phaddw %%xmm6,%%xmm1 \n"
922 "psraw $0x8,%%xmm0 \n"
923 "psraw $0x8,%%xmm1 \n"
924 "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
925 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000926 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000927 "movlps %%xmm0,(%1) \n"
928 "movhps %%xmm0,(%1,%2,1) \n"
929 "lea 0x8(%1),%1 \n" // lea leaves the flags set by sub for jg
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000930 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000931 : "+r"(src_bgra0), // %0
932 "+r"(dst_u), // %1
933 "+r"(dst_v), // %2
934 "+rm"(width) // %3
935 : "r"(static_cast<intptr_t>(src_stride_bgra))
936 : "memory", "cc"
937#if defined(__SSE2__)
938 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
939#endif
940 );
941}
942
943void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
944 uint8* dst_u, uint8* dst_v, int width) {
945 asm volatile (
946 "movdqa %0,%%xmm4 \n"
947 "movdqa %1,%%xmm3 \n"
948 "movdqa %2,%%xmm5 \n"
949 :
950 : "m"(kBGRAToU), // %0
951 "m"(kBGRAToV), // %1
952 "m"(kAddUV128) // %2
953 :
954#if defined(__SSE2__)
955 "xmm3", "xmm4", "xmm5"
956#endif
957 );
958 asm volatile (
959 "sub %1,%2 \n"
960 "1: \n"
961 "movdqu (%0),%%xmm0 \n"
962 "movdqu 0x10(%0),%%xmm1 \n"
963 "movdqu 0x20(%0),%%xmm2 \n"
964 "movdqu 0x30(%0),%%xmm6 \n"
965 "movdqu (%0,%4,1),%%xmm7 \n"
966 "pavgb %%xmm7,%%xmm0 \n"
967 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
968 "pavgb %%xmm7,%%xmm1 \n"
969 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
970 "pavgb %%xmm7,%%xmm2 \n"
971 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
972 "pavgb %%xmm7,%%xmm6 \n"
973 "lea 0x40(%0),%0 \n"
974 "movdqa %%xmm0,%%xmm7 \n"
975 "shufps $0x88,%%xmm1,%%xmm0 \n"
976 "shufps $0xdd,%%xmm1,%%xmm7 \n"
977 "pavgb %%xmm7,%%xmm0 \n"
978 "movdqa %%xmm2,%%xmm7 \n"
979 "shufps $0x88,%%xmm6,%%xmm2 \n"
980 "shufps $0xdd,%%xmm6,%%xmm7 \n"
981 "pavgb %%xmm7,%%xmm2 \n"
982 "movdqa %%xmm0,%%xmm1 \n"
983 "movdqa %%xmm2,%%xmm6 \n"
984 "pmaddubsw %%xmm4,%%xmm0 \n"
985 "pmaddubsw %%xmm4,%%xmm2 \n"
986 "pmaddubsw %%xmm3,%%xmm1 \n"
987 "pmaddubsw %%xmm3,%%xmm6 \n"
988 "phaddw %%xmm2,%%xmm0 \n"
989 "phaddw %%xmm6,%%xmm1 \n"
990 "psraw $0x8,%%xmm0 \n"
991 "psraw $0x8,%%xmm1 \n"
992 "packsswb %%xmm1,%%xmm0 \n"
993 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000994 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000995 "movlps %%xmm0,(%1) \n"
996 "movhps %%xmm0,(%1,%2,1) \n"
997 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000998 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000999 : "+r"(src_bgra0), // %0
1000 "+r"(dst_u), // %1
1001 "+r"(dst_v), // %2
1002 "+rm"(width) // %3
1003 : "r"(static_cast<intptr_t>(src_stride_bgra))
1004 : "memory", "cc"
1005#if defined(__SSE2__)
1006 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1007#endif
1008 );
1009}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001010
1011void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1012 asm volatile (
1013 "movdqa %4,%%xmm5 \n"
1014 "movdqa %3,%%xmm4 \n"
1015 "1: \n"
1016 "movdqa (%0),%%xmm0 \n"
1017 "movdqa 0x10(%0),%%xmm1 \n"
1018 "movdqa 0x20(%0),%%xmm2 \n"
1019 "movdqa 0x30(%0),%%xmm3 \n"
1020 "pmaddubsw %%xmm4,%%xmm0 \n"
1021 "pmaddubsw %%xmm4,%%xmm1 \n"
1022 "pmaddubsw %%xmm4,%%xmm2 \n"
1023 "pmaddubsw %%xmm4,%%xmm3 \n"
1024 "lea 0x40(%0),%0 \n"
1025 "phaddw %%xmm1,%%xmm0 \n"
1026 "phaddw %%xmm3,%%xmm2 \n"
1027 "psrlw $0x7,%%xmm0 \n"
1028 "psrlw $0x7,%%xmm2 \n"
1029 "packuswb %%xmm2,%%xmm0 \n"
1030 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001031 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001032 "movdqa %%xmm0,(%1) \n"
1033 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001034 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001035 : "+r"(src_abgr), // %0
1036 "+r"(dst_y), // %1
1037 "+r"(pix) // %2
1038 : "m"(kABGRToY), // %3
1039 "m"(kAddY16) // %4
1040 : "memory", "cc"
1041#if defined(__SSE2__)
1042 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1043#endif
1044 );
1045}
1046
1047void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1048 asm volatile (
1049 "movdqa %4,%%xmm5 \n"
1050 "movdqa %3,%%xmm4 \n"
1051 "1: \n"
1052 "movdqu (%0),%%xmm0 \n"
1053 "movdqu 0x10(%0),%%xmm1 \n"
1054 "movdqu 0x20(%0),%%xmm2 \n"
1055 "movdqu 0x30(%0),%%xmm3 \n"
1056 "pmaddubsw %%xmm4,%%xmm0 \n"
1057 "pmaddubsw %%xmm4,%%xmm1 \n"
1058 "pmaddubsw %%xmm4,%%xmm2 \n"
1059 "pmaddubsw %%xmm4,%%xmm3 \n"
1060 "lea 0x40(%0),%0 \n"
1061 "phaddw %%xmm1,%%xmm0 \n"
1062 "phaddw %%xmm3,%%xmm2 \n"
1063 "psrlw $0x7,%%xmm0 \n"
1064 "psrlw $0x7,%%xmm2 \n"
1065 "packuswb %%xmm2,%%xmm0 \n"
1066 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001067 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001068 "movdqu %%xmm0,(%1) \n"
1069 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001070 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001071 : "+r"(src_abgr), // %0
1072 "+r"(dst_y), // %1
1073 "+r"(pix) // %2
1074 : "m"(kABGRToY), // %3
1075 "m"(kAddY16) // %4
1076 : "memory", "cc"
1077#if defined(__SSE2__)
1078 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1079#endif
1080 );
1081}
1082
1083void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1084 uint8* dst_u, uint8* dst_v, int width) {
1085 asm volatile (
1086 "movdqa %0,%%xmm4 \n"
1087 "movdqa %1,%%xmm3 \n"
1088 "movdqa %2,%%xmm5 \n"
1089 :
1090 : "m"(kABGRToU), // %0
1091 "m"(kABGRToV), // %1
1092 "m"(kAddUV128) // %2
1093 :
1094#if defined(__SSE2__)
1095 "xmm3", "xmm4", "xmm5"
1096#endif
1097 );
1098 asm volatile (
1099 "sub %1,%2 \n"
1100 "1: \n"
1101 "movdqa (%0),%%xmm0 \n"
1102 "movdqa 0x10(%0),%%xmm1 \n"
1103 "movdqa 0x20(%0),%%xmm2 \n"
1104 "movdqa 0x30(%0),%%xmm6 \n"
1105 "pavgb (%0,%4,1),%%xmm0 \n"
1106 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1107 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1108 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1109 "lea 0x40(%0),%0 \n"
1110 "movdqa %%xmm0,%%xmm7 \n"
1111 "shufps $0x88,%%xmm1,%%xmm0 \n"
1112 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1113 "pavgb %%xmm7,%%xmm0 \n"
1114 "movdqa %%xmm2,%%xmm7 \n"
1115 "shufps $0x88,%%xmm6,%%xmm2 \n"
1116 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1117 "pavgb %%xmm7,%%xmm2 \n"
1118 "movdqa %%xmm0,%%xmm1 \n"
1119 "movdqa %%xmm2,%%xmm6 \n"
1120 "pmaddubsw %%xmm4,%%xmm0 \n"
1121 "pmaddubsw %%xmm4,%%xmm2 \n"
1122 "pmaddubsw %%xmm3,%%xmm1 \n"
1123 "pmaddubsw %%xmm3,%%xmm6 \n"
1124 "phaddw %%xmm2,%%xmm0 \n"
1125 "phaddw %%xmm6,%%xmm1 \n"
1126 "psraw $0x8,%%xmm0 \n"
1127 "psraw $0x8,%%xmm1 \n"
1128 "packsswb %%xmm1,%%xmm0 \n"
1129 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001130 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001131 "movlps %%xmm0,(%1) \n"
1132 "movhps %%xmm0,(%1,%2,1) \n"
1133 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001134 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001135 : "+r"(src_abgr0), // %0
1136 "+r"(dst_u), // %1
1137 "+r"(dst_v), // %2
1138 "+rm"(width) // %3
1139 : "r"(static_cast<intptr_t>(src_stride_abgr))
1140 : "memory", "cc"
1141#if defined(__SSE2__)
1142 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1143#endif
1144 );
1145}
1146
1147void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1148 uint8* dst_u, uint8* dst_v, int width) {
1149 asm volatile (
1150 "movdqa %0,%%xmm4 \n"
1151 "movdqa %1,%%xmm3 \n"
1152 "movdqa %2,%%xmm5 \n"
1153 :
1154 : "m"(kABGRToU), // %0
1155 "m"(kABGRToV), // %1
1156 "m"(kAddUV128) // %2
1157 :
1158#if defined(__SSE2__)
1159 "xmm3", "xmm4", "xmm5"
1160#endif
1161 );
1162 asm volatile (
1163 "sub %1,%2 \n"
1164 "1: \n"
1165 "movdqu (%0),%%xmm0 \n"
1166 "movdqu 0x10(%0),%%xmm1 \n"
1167 "movdqu 0x20(%0),%%xmm2 \n"
1168 "movdqu 0x30(%0),%%xmm6 \n"
1169 "movdqu (%0,%4,1),%%xmm7 \n"
1170 "pavgb %%xmm7,%%xmm0 \n"
1171 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1172 "pavgb %%xmm7,%%xmm1 \n"
1173 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1174 "pavgb %%xmm7,%%xmm2 \n"
1175 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1176 "pavgb %%xmm7,%%xmm6 \n"
1177 "lea 0x40(%0),%0 \n"
1178 "movdqa %%xmm0,%%xmm7 \n"
1179 "shufps $0x88,%%xmm1,%%xmm0 \n"
1180 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1181 "pavgb %%xmm7,%%xmm0 \n"
1182 "movdqa %%xmm2,%%xmm7 \n"
1183 "shufps $0x88,%%xmm6,%%xmm2 \n"
1184 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1185 "pavgb %%xmm7,%%xmm2 \n"
1186 "movdqa %%xmm0,%%xmm1 \n"
1187 "movdqa %%xmm2,%%xmm6 \n"
1188 "pmaddubsw %%xmm4,%%xmm0 \n"
1189 "pmaddubsw %%xmm4,%%xmm2 \n"
1190 "pmaddubsw %%xmm3,%%xmm1 \n"
1191 "pmaddubsw %%xmm3,%%xmm6 \n"
1192 "phaddw %%xmm2,%%xmm0 \n"
1193 "phaddw %%xmm6,%%xmm1 \n"
1194 "psraw $0x8,%%xmm0 \n"
1195 "psraw $0x8,%%xmm1 \n"
1196 "packsswb %%xmm1,%%xmm0 \n"
1197 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001198 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001199 "movlps %%xmm0,(%1) \n"
1200 "movhps %%xmm0,(%1,%2,1) \n"
1201 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001202 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001203 : "+r"(src_abgr0), // %0
1204 "+r"(dst_u), // %1
1205 "+r"(dst_v), // %2
1206 "+rm"(width) // %3
1207 : "r"(static_cast<intptr_t>(src_stride_abgr))
1208 : "memory", "cc"
1209#if defined(__SSE2__)
1210 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1211#endif
1212 );
1213}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001214
1215#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001216
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001217#ifdef HAS_I420TOARGBROW_SSSE3
// YUV to RGB coefficients in fixed point with 6 fractional bits (scaled by
// 64).  They are used as signed 8-bit lanes of kYuvConstants.
// 2.018 * 64 is about 129, which does not fit in int8, so UB is clamped to 127.
#define UB 127 /* min(127, static_cast<int8>(2.018 * 64)) */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the weighted contribution of U = V = 128, subtracted after the
// multiply.  Parenthesized so the macros expand safely inside larger
// expressions (e.g. -BG or 2 * BG).
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

// OMITFP asks GCC to omit the frame pointer for the functions it decorates.
// It expands to nothing on Apple toolchains and on x86-64.
// NOTE(review): presumably this frees a register for the 6-operand asm
// blocks on 32-bit x86 -- confirm before changing.
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001238
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001239struct {
1240 vec8 kUVToB;
1241 vec8 kUVToG;
1242 vec8 kUVToR;
1243 vec16 kUVBiasB;
1244 vec16 kUVBiasG;
1245 vec16 kUVBiasR;
1246 vec16 kYSub16;
1247 vec16 kYToRgb;
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001248} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001249 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1250 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1251 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1252 { BB, BB, BB, BB, BB, BB, BB, BB },
1253 { BG, BG, BG, BG, BG, BG, BG, BG },
1254 { BR, BR, BR, BR, BR, BR, BR, BR },
1255 { 16, 16, 16, 16, 16, 16, 16, 16 },
1256 { YG, YG, YG, YG, YG, YG, YG, YG }
1257};
1258
// Convert 8 pixels.
// Assembly fragment shared by the I420 row functions.  Operand contract of
// the enclosing asm statement:
//   %0 = Y pointer (8 bytes read, advanced by 8)
//   %1 = U pointer (4 bytes read, advanced by 4)
//   %2 = V pointer expressed as an offset from %1
//   %5 = address of kYuvConstants.kUVToB
//   xmm4 must be zero on entry.
// Results: xmm0 = 8 blue bytes, xmm1 = 8 green bytes, xmm2 = 8 red bytes
// (each duplicated into both halves of the register by packuswb).
// Clobbers xmm3.
#define YUVTORGB \
    "movd (%1),%%xmm0 \n" \
    "movd (%1,%2,1),%%xmm1 \n" \
    "lea 0x4(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001288
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001289void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
1290 const uint8* u_buf,
1291 const uint8* v_buf,
1292 uint8* rgb_buf,
1293 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +00001294 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001295 "sub %1,%2 \n"
1296 "pcmpeqb %%xmm5,%%xmm5 \n"
1297 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001298 "1: \n"
1299 YUVTORGB
1300 "punpcklbw %%xmm1,%%xmm0 \n"
1301 "punpcklbw %%xmm5,%%xmm2 \n"
1302 "movdqa %%xmm0,%%xmm1 \n"
1303 "punpcklwd %%xmm2,%%xmm0 \n"
1304 "punpckhwd %%xmm2,%%xmm1 \n"
1305 "movdqa %%xmm0,(%3) \n"
1306 "movdqa %%xmm1,0x10(%3) \n"
1307 "lea 0x20(%3),%3 \n"
1308 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001309 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001310 : "+r"(y_buf), // %0
1311 "+r"(u_buf), // %1
1312 "+r"(v_buf), // %2
1313 "+r"(rgb_buf), // %3
1314 "+rm"(width) // %4
1315 : "r"(&kYuvConstants.kUVToB) // %5
1316 : "memory", "cc"
1317#if defined(__SSE2__)
1318 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1319#endif
1320 );
1321}
1322
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001323void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
1324 const uint8* u_buf,
1325 const uint8* v_buf,
1326 uint8* rgb_buf,
1327 int width) {
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001328 asm volatile (
1329 "sub %1,%2 \n"
1330 "pcmpeqb %%xmm5,%%xmm5 \n"
1331 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001332 "1: \n"
1333 YUVTORGB
1334 "pcmpeqb %%xmm5,%%xmm5 \n"
1335 "punpcklbw %%xmm0,%%xmm1 \n"
1336 "punpcklbw %%xmm2,%%xmm5 \n"
1337 "movdqa %%xmm5,%%xmm0 \n"
1338 "punpcklwd %%xmm1,%%xmm5 \n"
1339 "punpckhwd %%xmm1,%%xmm0 \n"
1340 "movdqa %%xmm5,(%3) \n"
1341 "movdqa %%xmm0,0x10(%3) \n"
1342 "lea 0x20(%3),%3 \n"
1343 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001344 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001345 : "+r"(y_buf), // %0
1346 "+r"(u_buf), // %1
1347 "+r"(v_buf), // %2
1348 "+r"(rgb_buf), // %3
1349 "+rm"(width) // %4
1350 : "r"(&kYuvConstants.kUVToB) // %5
1351 : "memory", "cc"
1352#if defined(__SSE2__)
1353 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1354#endif
1355 );
1356}
1357
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001358void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
1359 const uint8* u_buf,
1360 const uint8* v_buf,
1361 uint8* rgb_buf,
1362 int width) {
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001363 asm volatile (
1364 "sub %1,%2 \n"
1365 "pcmpeqb %%xmm5,%%xmm5 \n"
1366 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001367 "1: \n"
1368 YUVTORGB
1369 "punpcklbw %%xmm1,%%xmm2 \n"
1370 "punpcklbw %%xmm5,%%xmm0 \n"
1371 "movdqa %%xmm2,%%xmm1 \n"
1372 "punpcklwd %%xmm0,%%xmm2 \n"
1373 "punpckhwd %%xmm0,%%xmm1 \n"
1374 "movdqa %%xmm2,(%3) \n"
1375 "movdqa %%xmm1,0x10(%3) \n"
1376 "lea 0x20(%3),%3 \n"
1377 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001378 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001379 : "+r"(y_buf), // %0
1380 "+r"(u_buf), // %1
1381 "+r"(v_buf), // %2
1382 "+r"(rgb_buf), // %3
1383 "+rm"(width) // %4
1384 : "r"(&kYuvConstants.kUVToB) // %5
1385 : "memory", "cc"
1386#if defined(__SSE2__)
1387 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1388#endif
1389 );
1390}
1391
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001392void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1393 const uint8* u_buf,
1394 const uint8* v_buf,
1395 uint8* rgb_buf,
1396 int width) {
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001397 asm volatile (
1398 "sub %1,%2 \n"
1399 "pcmpeqb %%xmm5,%%xmm5 \n"
1400 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001401 "1: \n"
1402 "movd (%1),%%xmm0 \n"
1403 "movd (%1,%2,1),%%xmm1 \n"
1404 "lea 0x4(%1),%1 \n"
1405 "punpcklbw %%xmm1,%%xmm0 \n"
1406 "movdqa %%xmm0,%%xmm1 \n"
1407 "movdqa %%xmm0,%%xmm2 \n"
1408 "pmaddubsw (%5),%%xmm0 \n"
1409 "pmaddubsw 16(%5),%%xmm1 \n"
1410 "pmaddubsw 32(%5),%%xmm2 \n"
1411 "psubw 48(%5),%%xmm0 \n"
1412 "psubw 64(%5),%%xmm1 \n"
1413 "psubw 80(%5),%%xmm2 \n"
1414 "movd (%0),%%xmm3 \n"
1415 "lea 0x4(%0),%0 \n"
1416 "punpcklbw %%xmm4,%%xmm3 \n"
1417 "psubsw 96(%5),%%xmm3 \n"
1418 "pmullw 112(%5),%%xmm3 \n"
1419 "paddsw %%xmm3,%%xmm0 \n"
1420 "paddsw %%xmm3,%%xmm1 \n"
1421 "paddsw %%xmm3,%%xmm2 \n"
1422 "psraw $0x6,%%xmm0 \n"
1423 "psraw $0x6,%%xmm1 \n"
1424 "psraw $0x6,%%xmm2 \n"
1425 "packuswb %%xmm0,%%xmm0 \n"
1426 "packuswb %%xmm1,%%xmm1 \n"
1427 "packuswb %%xmm2,%%xmm2 \n"
1428 "punpcklbw %%xmm1,%%xmm0 \n"
1429 "punpcklbw %%xmm5,%%xmm2 \n"
1430 "punpcklwd %%xmm2,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001431 "sub $0x4,%4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001432 "movdqa %%xmm0,(%3) \n"
1433 "lea 0x10(%3),%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001434 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001435 : "+r"(y_buf), // %0
1436 "+r"(u_buf), // %1
1437 "+r"(v_buf), // %2
1438 "+r"(rgb_buf), // %3
1439 "+rm"(width) // %4
1440 : "r"(&kYuvConstants.kUVToB) // %5
1441 : "memory", "cc"
1442#if defined(__SSE2__)
1443 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1444#endif
1445 );
1446}
1447#endif
1448
#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of 8-bit Y to grey 32-bit pixels: the scaled luma is written
// to the three colour bytes and the fourth byte is set to 0xff.
//   y_buf:   luma, 1 byte per pixel.
//   rgb_buf: destination; must be 16-byte aligned (movdqa stores).
//   width:   pixel count.  Each iteration produces 8 pixels (32 bytes) and
//            the loop runs while the remaining count is > 0.
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  asm volatile (
    // xmm4 = 0xff000000 in every dword (alpha mask).
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    // xmm3 = 0x1000 in every word: 16 in 8.8 fixed point.
    "mov $0x10001000,%%eax \n"
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    // xmm2 = 0x012a (298) in every word: 1.164 in 8.8 fixed point.
    "mov $0x012a012a,%%eax \n"
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
  "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "psubusw %%xmm3,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "por %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,16(%1) \n"
    "lea 32(%1),%1 \n"

    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001495
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Copies a row of bytes in reverse order.  Source and destination must be
// 16-byte aligned (movdqa).
//   src:   source row; read from its end towards its start.
//   dst:   destination row; written front to back.
//   width: byte count.  Each iteration handles 16 bytes and the loop runs
//          while the remaining count is > 0.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  // Widened so it can be used as an index register on 64-bit targets.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    // Bias src by -16 so (src + remaining) addresses the last unread block.
    "lea -0x10(%0),%0 \n"
  "1: \n"
    "movdqa (%0,%2),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    // Flags from this sub feed the jg; movdqa and lea leave them intact.
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif
1525
#ifdef HAS_MIRRORROW_SSE2
// Copies a row of bytes in reverse order using SSE2 only (no pshufb).
// Loads and stores are unaligned (movdqu).
//   src:   source row; read from its end towards its start.
//   dst:   destination row; written front to back.
//   width: byte count.  Each iteration handles 16 bytes and the loop runs
//          while the remaining count is > 0.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  // Widened so it can be used as an index register on 64-bit targets.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    // Bias src by -16 so (src + remaining) addresses the last unread block.
    "lea -0x10(%0),%0 \n"
  "1: \n"
    "movdqu (%0,%2),%%xmm0 \n"
    // Swap the two bytes inside every 16-bit word.
    "movdqa %%xmm0,%%xmm1 \n"
    "psllw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    // Reverse the words within each 64-bit half, then swap the halves.
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufd $0x4e,%%xmm0,%%xmm0 \n"
    // Flags from this sub feed the jg; movdqu and lea leave them intact.
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif
1555
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// The even source bytes land reversed in the low 8 bytes and the odd source
// bytes land reversed in the high 8 bytes.
CONST uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

// Splits a row of interleaved two-byte pairs into two planes while reversing
// the order of the pairs.  The source must be 16-byte aligned (movdqa).
//   src:   interleaved source, 2 bytes per pair; read from end to start.
//   dst_u: receives the first byte of each pair.
//   dst_v: receives the second byte of each pair.
//   width: number of pairs.  Each iteration handles 8 pairs (16 source
//          bytes) and the loop runs while the remaining count is > 0.
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  // Widened so it can be used as an index register on 64-bit targets.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %4,%%xmm1 \n"
    // Point src at the last 16-byte block: src + 2 * width - 16.
    "lea -16(%0,%3,2),%0 \n"
    // dst_v becomes an offset from dst_u so one pointer is advanced.
    "sub %1,%2 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "lea -16(%0),%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    // Flags from this sub feed the jg; the stores and lea leave them intact.
    "sub $8,%3 \n"
    "movlpd %%xmm0,(%1) \n"
    "movhpd %%xmm0,(%1,%2) \n"
    "lea 8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),          // %0
    "+r"(dst_u),        // %1
    "+r"(dst_v),        // %2
    "+r"(temp_width)    // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif
1589
#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of two-byte pairs into two planes.  All three
// pointers must be 16-byte aligned (movdqa).
//   src_uv: interleaved source, 2 bytes per pair.
//   dst_u:  receives the even source bytes.
//   dst_v:  receives the odd source bytes.
//   pix:    number of pairs.  Each iteration handles 16 pairs (32 source
//           bytes) and the loop runs while the remaining count is > 0.
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v becomes an offset from dst_u so one pointer is advanced.
    "sub %1,%2 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    // Even bytes.
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // Odd bytes.
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm2,(%1,%2) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif
1625
#ifdef HAS_COPYROW_SSE2
// Copies a row of bytes with SSE2.  Source and destination must be 16-byte
// aligned (movdqa).
//   src:   source row.
//   dst:   destination row.
//   count: byte count.  Each iteration copies 32 bytes and the loop runs
//          while the remaining count is > 0, so the count is effectively
//          rounded up to a multiple of 32.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    // dst becomes an offset from src so only src has to be advanced.
    "sub %0,%1 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa %%xmm0,(%0,%1) \n"
    "movdqa %%xmm1,0x10(%0,%1) \n"
    "lea 0x20(%0),%0 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(count)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_COPYROW_SSE2
1649
#ifdef HAS_COPYROW_X86
// Copies a row of bytes with "rep movsl" (4 bytes per step).
//   src:   source row (bound to esi/rsi).
//   dst:   destination row (bound to edi/rdi).
//   width: byte count.  It is divided by 4 to get the dword count, so the
//          trailing (width % 4) bytes are NOT copied.
//          NOTE(review): presumably callers only pass multiples of 4 --
//          confirm against the call sites.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  // Pointer-sized because the count lives in the full ecx/rcx register.
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    "shr $0x2,%2 \n"
    "rep movsl \n"
  : "+S"(src),        // %0
    "+D"(dst),        // %1
    "+c"(width_tmp)   // %2
  :
  : "memory", "cc"
  );
}
#endif
1664
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001665#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001666void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
1667 asm volatile (
1668 "pcmpeqb %%xmm5,%%xmm5 \n"
1669 "psrlw $0x8,%%xmm5 \n"
1670 "1: \n"
1671 "movdqa (%0),%%xmm0 \n"
1672 "movdqa 0x10(%0),%%xmm1 \n"
1673 "lea 0x20(%0),%0 \n"
1674 "pand %%xmm5,%%xmm0 \n"
1675 "pand %%xmm5,%%xmm1 \n"
1676 "packuswb %%xmm1,%%xmm0 \n"
1677 "movdqa %%xmm0,(%1) \n"
1678 "lea 0x10(%1),%1 \n"
1679 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001680 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001681 : "+r"(src_yuy2), // %0
1682 "+r"(dst_y), // %1
1683 "+r"(pix) // %2
1684 :
1685 : "memory", "cc"
1686#if defined(__SSE2__)
1687 , "xmm0", "xmm1", "xmm5"
1688#endif
1689 );
1690}
1691
1692void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
1693 uint8* dst_u, uint8* dst_y, int pix) {
1694 asm volatile (
1695 "pcmpeqb %%xmm5,%%xmm5 \n"
1696 "psrlw $0x8,%%xmm5 \n"
1697 "sub %1,%2 \n"
1698 "1: \n"
1699 "movdqa (%0),%%xmm0 \n"
1700 "movdqa 0x10(%0),%%xmm1 \n"
1701 "movdqa (%0,%4,1),%%xmm2 \n"
1702 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1703 "lea 0x20(%0),%0 \n"
1704 "pavgb %%xmm2,%%xmm0 \n"
1705 "pavgb %%xmm3,%%xmm1 \n"
1706 "psrlw $0x8,%%xmm0 \n"
1707 "psrlw $0x8,%%xmm1 \n"
1708 "packuswb %%xmm1,%%xmm0 \n"
1709 "movdqa %%xmm0,%%xmm1 \n"
1710 "pand %%xmm5,%%xmm0 \n"
1711 "packuswb %%xmm0,%%xmm0 \n"
1712 "psrlw $0x8,%%xmm1 \n"
1713 "packuswb %%xmm1,%%xmm1 \n"
1714 "movq %%xmm0,(%1) \n"
1715 "movq %%xmm1,(%1,%2) \n"
1716 "lea 0x8(%1),%1 \n"
1717 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001718 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001719 : "+r"(src_yuy2), // %0
1720 "+r"(dst_u), // %1
1721 "+r"(dst_y), // %2
1722 "+r"(pix) // %3
1723 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1724 : "memory", "cc"
1725#if defined(__SSE2__)
1726 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1727#endif
1728 );
1729}
1730
1731void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
1732 uint8* dst_y, int pix) {
1733 asm volatile (
1734 "pcmpeqb %%xmm5,%%xmm5 \n"
1735 "psrlw $0x8,%%xmm5 \n"
1736 "1: \n"
1737 "movdqu (%0),%%xmm0 \n"
1738 "movdqu 0x10(%0),%%xmm1 \n"
1739 "lea 0x20(%0),%0 \n"
1740 "pand %%xmm5,%%xmm0 \n"
1741 "pand %%xmm5,%%xmm1 \n"
1742 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001743 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001744 "movdqu %%xmm0,(%1) \n"
1745 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001746 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001747 : "+r"(src_yuy2), // %0
1748 "+r"(dst_y), // %1
1749 "+r"(pix) // %2
1750 :
1751 : "memory", "cc"
1752#if defined(__SSE2__)
1753 , "xmm0", "xmm1", "xmm5"
1754#endif
1755 );
1756}
1757
1758void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
1759 int stride_yuy2,
1760 uint8* dst_u, uint8* dst_y,
1761 int pix) {
1762 asm volatile (
1763 "pcmpeqb %%xmm5,%%xmm5 \n"
1764 "psrlw $0x8,%%xmm5 \n"
1765 "sub %1,%2 \n"
1766 "1: \n"
1767 "movdqu (%0),%%xmm0 \n"
1768 "movdqu 0x10(%0),%%xmm1 \n"
1769 "movdqu (%0,%4,1),%%xmm2 \n"
1770 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1771 "lea 0x20(%0),%0 \n"
1772 "pavgb %%xmm2,%%xmm0 \n"
1773 "pavgb %%xmm3,%%xmm1 \n"
1774 "psrlw $0x8,%%xmm0 \n"
1775 "psrlw $0x8,%%xmm1 \n"
1776 "packuswb %%xmm1,%%xmm0 \n"
1777 "movdqa %%xmm0,%%xmm1 \n"
1778 "pand %%xmm5,%%xmm0 \n"
1779 "packuswb %%xmm0,%%xmm0 \n"
1780 "psrlw $0x8,%%xmm1 \n"
1781 "packuswb %%xmm1,%%xmm1 \n"
1782 "movq %%xmm0,(%1) \n"
1783 "movq %%xmm1,(%1,%2) \n"
1784 "lea 0x8(%1),%1 \n"
1785 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001786 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001787 : "+r"(src_yuy2), // %0
1788 "+r"(dst_u), // %1
1789 "+r"(dst_y), // %2
1790 "+r"(pix) // %3
1791 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1792 : "memory", "cc"
1793#if defined(__SSE2__)
1794 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1795#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001796 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001797}
1798
1799void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
1800 asm volatile (
1801 "1: \n"
1802 "movdqa (%0),%%xmm0 \n"
1803 "movdqa 0x10(%0),%%xmm1 \n"
1804 "lea 0x20(%0),%0 \n"
1805 "psrlw $0x8,%%xmm0 \n"
1806 "psrlw $0x8,%%xmm1 \n"
1807 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001808 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001809 "movdqa %%xmm0,(%1) \n"
1810 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001811 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001812 : "+r"(src_uyvy), // %0
1813 "+r"(dst_y), // %1
1814 "+r"(pix) // %2
1815 :
1816 : "memory", "cc"
1817#if defined(__SSE2__)
1818 , "xmm0", "xmm1"
1819#endif
1820 );
1821}
1822
1823void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
1824 uint8* dst_u, uint8* dst_y, int pix) {
1825 asm volatile (
1826 "pcmpeqb %%xmm5,%%xmm5 \n"
1827 "psrlw $0x8,%%xmm5 \n"
1828 "sub %1,%2 \n"
1829 "1: \n"
1830 "movdqa (%0),%%xmm0 \n"
1831 "movdqa 0x10(%0),%%xmm1 \n"
1832 "movdqa (%0,%4,1),%%xmm2 \n"
1833 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1834 "lea 0x20(%0),%0 \n"
1835 "pavgb %%xmm2,%%xmm0 \n"
1836 "pavgb %%xmm3,%%xmm1 \n"
1837 "pand %%xmm5,%%xmm0 \n"
1838 "pand %%xmm5,%%xmm1 \n"
1839 "packuswb %%xmm1,%%xmm0 \n"
1840 "movdqa %%xmm0,%%xmm1 \n"
1841 "pand %%xmm5,%%xmm0 \n"
1842 "packuswb %%xmm0,%%xmm0 \n"
1843 "psrlw $0x8,%%xmm1 \n"
1844 "packuswb %%xmm1,%%xmm1 \n"
1845 "movq %%xmm0,(%1) \n"
1846 "movq %%xmm1,(%1,%2) \n"
1847 "lea 0x8(%1),%1 \n"
1848 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001849 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001850 : "+r"(src_uyvy), // %0
1851 "+r"(dst_u), // %1
1852 "+r"(dst_y), // %2
1853 "+r"(pix) // %3
1854 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
1855 : "memory", "cc"
1856#if defined(__SSE2__)
1857 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1858#endif
1859 );
1860}
1861
1862void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
1863 uint8* dst_y, int pix) {
1864 asm volatile (
1865 "1: \n"
1866 "movdqu (%0),%%xmm0 \n"
1867 "movdqu 0x10(%0),%%xmm1 \n"
1868 "lea 0x20(%0),%0 \n"
1869 "psrlw $0x8,%%xmm0 \n"
1870 "psrlw $0x8,%%xmm1 \n"
1871 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001872 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001873 "movdqu %%xmm0,(%1) \n"
1874 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001875 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001876 : "+r"(src_uyvy), // %0
1877 "+r"(dst_y), // %1
1878 "+r"(pix) // %2
1879 :
1880 : "memory", "cc"
1881#if defined(__SSE2__)
1882 , "xmm0", "xmm1"
1883#endif
1884 );
1885}
1886
1887void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
1888 uint8* dst_u, uint8* dst_y, int pix) {
1889 asm volatile (
1890 "pcmpeqb %%xmm5,%%xmm5 \n"
1891 "psrlw $0x8,%%xmm5 \n"
1892 "sub %1,%2 \n"
1893 "1: \n"
1894 "movdqu (%0),%%xmm0 \n"
1895 "movdqu 0x10(%0),%%xmm1 \n"
1896 "movdqu (%0,%4,1),%%xmm2 \n"
1897 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1898 "lea 0x20(%0),%0 \n"
1899 "pavgb %%xmm2,%%xmm0 \n"
1900 "pavgb %%xmm3,%%xmm1 \n"
1901 "pand %%xmm5,%%xmm0 \n"
1902 "pand %%xmm5,%%xmm1 \n"
1903 "packuswb %%xmm1,%%xmm0 \n"
1904 "movdqa %%xmm0,%%xmm1 \n"
1905 "pand %%xmm5,%%xmm0 \n"
1906 "packuswb %%xmm0,%%xmm0 \n"
1907 "psrlw $0x8,%%xmm1 \n"
1908 "packuswb %%xmm1,%%xmm1 \n"
1909 "movq %%xmm0,(%1) \n"
1910 "movq %%xmm1,(%1,%2) \n"
1911 "lea 0x8(%1),%1 \n"
1912 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001913 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001914 : "+r"(src_uyvy), // %0
1915 "+r"(dst_u), // %1
1916 "+r"(dst_y), // %2
1917 "+r"(pix) // %3
1918 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
1919 : "memory", "cc"
1920#if defined(__SSE2__)
1921 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1922#endif
1923 );
1924}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001925#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001926
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001927#ifdef HAS_ARGBBLENDROW_SSE2
1928void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
1929 uint32 pixel = 0;
1930 asm volatile (
1931 "pcmpeqb %%xmm4,%%xmm4 \n"
fbarchard@google.com2217ced2012-03-09 22:44:57 +00001932 "pcmpeqb %%xmm5,%%xmm5 \n"
1933 "pslld $24,%%xmm5 \n"
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001934 "sub %0,%1 \n"
1935 "mov (%0),%3 \n"
1936 "sub $0x1,%2 \n"
fbarchard@google.com976423f2012-03-08 18:39:39 +00001937 "jle 8f \n" // last1
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001938 "cmp $0xff000000,%3 \n"
1939 "jae 2f \n" // opaqueloop
1940 "cmp $0xffffff,%3 \n"
fbarchard@google.com976423f2012-03-08 18:39:39 +00001941 "ja 3f \n" // translucentloop
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001942
1943 // transparentloop
1944 "1: \n"
1945 "sub $0x1,%2 \n"
1946 "lea 0x4(%0),%0 \n"
fbarchard@google.com976423f2012-03-08 18:39:39 +00001947 "jle 8f \n" // last1
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001948 "mov (%0),%3 \n"
1949 "cmp $0xffffff,%3 \n"
1950 "jbe 1b \n" // transparentloop
1951 "cmp $0xff000000,%3 \n"
fbarchard@google.com976423f2012-03-08 18:39:39 +00001952 "jb 3f \n" // translucentloop
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001953
1954 // opaqueloop
1955 "2: \n"
1956 "mov %3,(%0,%1,1) \n"
1957 "lea 0x4(%0),%0 \n"
1958 "sub $0x1,%2 \n"
fbarchard@google.com976423f2012-03-08 18:39:39 +00001959 "jle 8f \n" // last1
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001960 "mov (%0),%3 \n"
1961 "cmp $0xff000000,%3 \n"
1962 "jae 2b \n" // opaqueloop
1963 "cmp $0xffffff,%3 \n"
1964 "jbe 1b \n" // transparentloop
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001965
fbarchard@google.com976423f2012-03-08 18:39:39 +00001966 // translucentloop
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001967 "3: \n"
fbarchard@google.com2217ced2012-03-09 22:44:57 +00001968 "movq (%0),%%xmm0 \n"
1969 "movq (%0,%1,1),%%xmm1 \n"
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001970 "punpcklbw %%xmm0,%%xmm0 \n"
1971 "punpcklbw %%xmm1,%%xmm1 \n"
1972 "pshuflw $0xff,%%xmm0,%%xmm2 \n"
fbarchard@google.com2217ced2012-03-09 22:44:57 +00001973 "pshufhw $0xff,%%xmm2,%%xmm2 \n"
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00001974 "movdqa %%xmm2,%%xmm3 \n"
1975 "pxor %%xmm4,%%xmm3 \n"
1976 "pmulhuw %%xmm2,%%xmm0 \n"
1977 "pmulhuw %%xmm3,%%xmm1 \n"
fbarchard@google.com976423f2012-03-08 18:39:39 +00001978 "paddusw %%xmm1,%%xmm0 \n"
1979 "psrlw $0x8,%%xmm0 \n"
1980 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com2217ced2012-03-09 22:44:57 +00001981 "por %%xmm5,%%xmm0 \n"
1982 "movq %%xmm0,(%0,%1,1) \n"
fbarchard@google.com976423f2012-03-08 18:39:39 +00001983 "lea 0x8(%0),%0 \n"
1984 "sub $0x2,%2 \n"
1985 "jle 8f \n" // last1
1986 "mov (%0),%3 \n"
1987 "cmp $0xffffff,%3 \n"
1988 "jbe 1b \n" // transparentloop
1989 "cmp $0xff000000,%3 \n"
1990 "jb 3b \n" // translucentloop
1991 "jmp 2b \n" // opaqueloop
1992
1993 // last1
1994 "8: \n"
1995 "add $0x1,%2 \n" // 1 pixel left?
1996 "cmp $0x1,%2 \n"
1997 "jl 9f \n" // done
1998 "mov (%0),%3 \n"
1999 "movd %3,%%xmm0 \n"
2000 "mov (%0,%1,1),%3 \n"
2001 "movd %3,%%xmm1 \n"
2002 "punpcklbw %%xmm0,%%xmm0 \n"
2003 "punpcklbw %%xmm1,%%xmm1 \n"
2004 "pshuflw $0xff,%%xmm0,%%xmm2 \n"
2005 "movdqa %%xmm2,%%xmm3 \n"
2006 "pxor %%xmm4,%%xmm3 \n"
2007 "pmulhuw %%xmm2,%%xmm0 \n"
2008 "pmulhuw %%xmm3,%%xmm1 \n"
2009 "paddusw %%xmm1,%%xmm0 \n"
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002010 "psrlw $0x8,%%xmm0 \n"
2011 "packuswb %%xmm0,%%xmm0 \n"
2012 "movd %%xmm0,%3 \n"
2013 "mov %3,(%0,%1,1) \n"
2014
2015 // done
2016 "9: \n"
2017 : "+r"(src_argb), // %0
2018 "+r"(dst_argb), // %1
2019 "+r"(width), // %2
2020 "+r"(pixel) // %3
2021 :
2022 : "memory", "cc"
2023#if defined(__SSE2__)
fbarchard@google.com2217ced2012-03-09 22:44:57 +00002024 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002025#endif
2026 );
2027}
2028#endif // HAS_ARGBBLENDROW_SSE2
2029
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002030#endif // defined(__x86_64__) || defined(__i386__)
2031
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002032#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002033} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002034} // namespace libyuv
2035#endif