/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com2b9c2102012-03-22 22:36:44 +000011#include "source/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// Storage qualifier used for the constant tables in this file.
// GCC 4.2 on OSX has link error when passing static or const to inline.
// On Apple targets the qualifier is therefore empty; everywhere else the
// tables are declared "static const".
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000101// Shuffle table for converting ARGB to RGB24.
102CONST uvec8 kShuffleMaskARGBToRGB24 = {
103 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
104};
105
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000106// Shuffle table for converting ARGB to RAW.
107CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000108 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000109};
110
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000112 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000113 "pcmpeqb %%xmm5,%%xmm5 \n"
114 "pslld $0x18,%%xmm5 \n"
115 "1: \n"
116 "movq (%0),%%xmm0 \n"
117 "lea 0x8(%0),%0 \n"
118 "punpcklbw %%xmm0,%%xmm0 \n"
119 "movdqa %%xmm0,%%xmm1 \n"
120 "punpcklwd %%xmm0,%%xmm0 \n"
121 "punpckhwd %%xmm1,%%xmm1 \n"
122 "por %%xmm5,%%xmm0 \n"
123 "por %%xmm5,%%xmm1 \n"
124 "movdqa %%xmm0,(%1) \n"
125 "movdqa %%xmm1,0x10(%1) \n"
126 "lea 0x20(%1),%1 \n"
127 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000128 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000129 : "+r"(src_y), // %0
130 "+r"(dst_argb), // %1
131 "+r"(pix) // %2
132 :
133 : "memory", "cc"
134#if defined(__SSE2__)
135 , "xmm0", "xmm1", "xmm5"
136#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000137 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000138}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000139
140void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000141 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000142 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000143 "sub %0,%1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000144 "1: \n"
145 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000147 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000148 "movdqa %%xmm0,(%0,%1,1) \n"
149 "lea 0x10(%0),%0 \n"
150 "jg 1b \n"
151
fbarchard@google.comb6149762011-11-07 21:58:52 +0000152 : "+r"(src_abgr), // %0
153 "+r"(dst_argb), // %1
154 "+r"(pix) // %2
155 : "m"(kShuffleMaskABGRToARGB) // %3
156 : "memory", "cc"
157#if defined(__SSE2__)
158 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000159#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000160 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000161}
162
163void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000164 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000165 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000166 "sub %0,%1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000167 "1: \n"
168 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000169 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000170 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000171 "movdqa %%xmm0,(%0,%1,1) \n"
172 "lea 0x10(%0),%0 \n"
173 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000174 : "+r"(src_bgra), // %0
175 "+r"(dst_argb), // %1
176 "+r"(pix) // %2
177 : "m"(kShuffleMaskBGRAToARGB) // %3
178 : "memory", "cc"
179#if defined(__SSE2__)
180 , "xmm0", "xmm5"
181#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000182 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000183}
184
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000185void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000186 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000187 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
188 "pslld $0x18,%%xmm5 \n"
189 "movdqa %3,%%xmm4 \n"
190 "1: \n"
191 "movdqu (%0),%%xmm0 \n"
192 "movdqu 0x10(%0),%%xmm1 \n"
193 "movdqu 0x20(%0),%%xmm3 \n"
194 "lea 0x30(%0),%0 \n"
195 "movdqa %%xmm3,%%xmm2 \n"
196 "palignr $0x8,%%xmm1,%%xmm2 \n"
197 "pshufb %%xmm4,%%xmm2 \n"
198 "por %%xmm5,%%xmm2 \n"
199 "palignr $0xc,%%xmm0,%%xmm1 \n"
200 "pshufb %%xmm4,%%xmm0 \n"
201 "movdqa %%xmm2,0x20(%1) \n"
202 "por %%xmm5,%%xmm0 \n"
203 "pshufb %%xmm4,%%xmm1 \n"
204 "movdqa %%xmm0,(%1) \n"
205 "por %%xmm5,%%xmm1 \n"
206 "palignr $0x4,%%xmm3,%%xmm3 \n"
207 "pshufb %%xmm4,%%xmm3 \n"
208 "movdqa %%xmm1,0x10(%1) \n"
209 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000210 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000211 "movdqa %%xmm3,0x30(%1) \n"
212 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000213 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000214 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000215 "+r"(dst_argb), // %1
216 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000217 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000218 : "memory", "cc"
219#if defined(__SSE2__)
220 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
221#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000222 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000223}
224
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000225void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000226 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000227 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
228 "pslld $0x18,%%xmm5 \n"
229 "movdqa %3,%%xmm4 \n"
230 "1: \n"
231 "movdqu (%0),%%xmm0 \n"
232 "movdqu 0x10(%0),%%xmm1 \n"
233 "movdqu 0x20(%0),%%xmm3 \n"
234 "lea 0x30(%0),%0 \n"
235 "movdqa %%xmm3,%%xmm2 \n"
236 "palignr $0x8,%%xmm1,%%xmm2 \n"
237 "pshufb %%xmm4,%%xmm2 \n"
238 "por %%xmm5,%%xmm2 \n"
239 "palignr $0xc,%%xmm0,%%xmm1 \n"
240 "pshufb %%xmm4,%%xmm0 \n"
241 "movdqa %%xmm2,0x20(%1) \n"
242 "por %%xmm5,%%xmm0 \n"
243 "pshufb %%xmm4,%%xmm1 \n"
244 "movdqa %%xmm0,(%1) \n"
245 "por %%xmm5,%%xmm1 \n"
246 "palignr $0x4,%%xmm3,%%xmm3 \n"
247 "pshufb %%xmm4,%%xmm3 \n"
248 "movdqa %%xmm1,0x10(%1) \n"
249 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000250 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000251 "movdqa %%xmm3,0x30(%1) \n"
252 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000253 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000254 : "+r"(src_raw), // %0
255 "+r"(dst_argb), // %1
256 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000257 : "m"(kShuffleMaskRAWToARGB) // %3
258 : "memory", "cc"
259#if defined(__SSE2__)
260 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
261#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000262 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000263}
264
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000265void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000266 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000267 "mov $0x1080108,%%eax \n"
268 "movd %%eax,%%xmm5 \n"
269 "pshufd $0x0,%%xmm5,%%xmm5 \n"
270 "mov $0x20082008,%%eax \n"
271 "movd %%eax,%%xmm6 \n"
272 "pshufd $0x0,%%xmm6,%%xmm6 \n"
273 "pcmpeqb %%xmm3,%%xmm3 \n"
274 "psllw $0xb,%%xmm3 \n"
275 "pcmpeqb %%xmm4,%%xmm4 \n"
276 "psllw $0xa,%%xmm4 \n"
277 "psrlw $0x5,%%xmm4 \n"
278 "pcmpeqb %%xmm7,%%xmm7 \n"
279 "psllw $0x8,%%xmm7 \n"
280 "sub %0,%1 \n"
281 "sub %0,%1 \n"
282 "1: \n"
283 "movdqu (%0),%%xmm0 \n"
284 "movdqa %%xmm0,%%xmm1 \n"
285 "movdqa %%xmm0,%%xmm2 \n"
286 "pand %%xmm3,%%xmm1 \n"
287 "psllw $0xb,%%xmm2 \n"
288 "pmulhuw %%xmm5,%%xmm1 \n"
289 "pmulhuw %%xmm5,%%xmm2 \n"
290 "psllw $0x8,%%xmm1 \n"
291 "por %%xmm2,%%xmm1 \n"
292 "pand %%xmm4,%%xmm0 \n"
293 "pmulhuw %%xmm6,%%xmm0 \n"
294 "por %%xmm7,%%xmm0 \n"
295 "movdqa %%xmm1,%%xmm2 \n"
296 "punpcklbw %%xmm0,%%xmm1 \n"
297 "punpckhbw %%xmm0,%%xmm2 \n"
298 "movdqa %%xmm1,(%1,%0,2) \n"
299 "movdqa %%xmm2,0x10(%1,%0,2) \n"
300 "lea 0x10(%0),%0 \n"
301 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000302 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000303 : "+r"(src), // %0
304 "+r"(dst), // %1
305 "+r"(pix) // %2
306 :
307 : "memory", "cc", "eax"
308#if defined(__SSE2__)
309 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
310#endif
311 );
312}
313
314void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000315 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000316 "mov $0x1080108,%%eax \n"
317 "movd %%eax,%%xmm5 \n"
318 "pshufd $0x0,%%xmm5,%%xmm5 \n"
319 "mov $0x42004200,%%eax \n"
320 "movd %%eax,%%xmm6 \n"
321 "pshufd $0x0,%%xmm6,%%xmm6 \n"
322 "pcmpeqb %%xmm3,%%xmm3 \n"
323 "psllw $0xb,%%xmm3 \n"
324 "movdqa %%xmm3,%%xmm4 \n"
325 "psrlw $0x6,%%xmm4 \n"
326 "pcmpeqb %%xmm7,%%xmm7 \n"
327 "psllw $0x8,%%xmm7 \n"
328 "sub %0,%1 \n"
329 "sub %0,%1 \n"
330 "1: \n"
331 "movdqu (%0),%%xmm0 \n"
332 "movdqa %%xmm0,%%xmm1 \n"
333 "movdqa %%xmm0,%%xmm2 \n"
334 "psllw $0x1,%%xmm1 \n"
335 "psllw $0xb,%%xmm2 \n"
336 "pand %%xmm3,%%xmm1 \n"
337 "pmulhuw %%xmm5,%%xmm2 \n"
338 "pmulhuw %%xmm5,%%xmm1 \n"
339 "psllw $0x8,%%xmm1 \n"
340 "por %%xmm2,%%xmm1 \n"
341 "movdqa %%xmm0,%%xmm2 \n"
342 "pand %%xmm4,%%xmm0 \n"
343 "psraw $0x8,%%xmm2 \n"
344 "pmulhuw %%xmm6,%%xmm0 \n"
345 "pand %%xmm7,%%xmm2 \n"
346 "por %%xmm2,%%xmm0 \n"
347 "movdqa %%xmm1,%%xmm2 \n"
348 "punpcklbw %%xmm0,%%xmm1 \n"
349 "punpckhbw %%xmm0,%%xmm2 \n"
350 "movdqa %%xmm1,(%1,%0,2) \n"
351 "movdqa %%xmm2,0x10(%1,%0,2) \n"
352 "lea 0x10(%0),%0 \n"
353 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000354 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000355 : "+r"(src), // %0
356 "+r"(dst), // %1
357 "+r"(pix) // %2
358 :
359 : "memory", "cc", "eax"
360#if defined(__SSE2__)
361 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
362#endif
363 );
364}
365
366void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000367 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000368 "mov $0xf0f0f0f,%%eax \n"
369 "movd %%eax,%%xmm4 \n"
370 "pshufd $0x0,%%xmm4,%%xmm4 \n"
371 "movdqa %%xmm4,%%xmm5 \n"
372 "pslld $0x4,%%xmm5 \n"
373 "sub %0,%1 \n"
374 "sub %0,%1 \n"
375 "1: \n"
376 "movdqu (%0),%%xmm0 \n"
377 "movdqa %%xmm0,%%xmm2 \n"
378 "pand %%xmm4,%%xmm0 \n"
379 "pand %%xmm5,%%xmm2 \n"
380 "movdqa %%xmm0,%%xmm1 \n"
381 "movdqa %%xmm2,%%xmm3 \n"
382 "psllw $0x4,%%xmm1 \n"
383 "psrlw $0x4,%%xmm3 \n"
384 "por %%xmm1,%%xmm0 \n"
385 "por %%xmm3,%%xmm2 \n"
386 "movdqa %%xmm0,%%xmm1 \n"
387 "punpcklbw %%xmm2,%%xmm0 \n"
388 "punpckhbw %%xmm2,%%xmm1 \n"
389 "movdqa %%xmm0,(%1,%0,2) \n"
390 "movdqa %%xmm1,0x10(%1,%0,2) \n"
391 "lea 0x10(%0),%0 \n"
392 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000393 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000394 : "+r"(src), // %0
395 "+r"(dst), // %1
396 "+r"(pix) // %2
397 :
398 : "memory", "cc", "eax"
399#if defined(__SSE2__)
400 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
401#endif
402 );
403}
404
405void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000406 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000407 "movdqa %3,%%xmm6 \n"
408 "1: \n"
409 "movdqa (%0),%%xmm0 \n"
410 "movdqa 0x10(%0),%%xmm1 \n"
411 "movdqa 0x20(%0),%%xmm2 \n"
412 "movdqa 0x30(%0),%%xmm3 \n"
413 "lea 0x40(%0),%0 \n"
414 "pshufb %%xmm6,%%xmm0 \n"
415 "pshufb %%xmm6,%%xmm1 \n"
416 "pshufb %%xmm6,%%xmm2 \n"
417 "pshufb %%xmm6,%%xmm3 \n"
418 "movdqa %%xmm1,%%xmm4 \n"
419 "psrldq $0x4,%%xmm1 \n"
420 "pslldq $0xc,%%xmm4 \n"
421 "movdqa %%xmm2,%%xmm5 \n"
422 "por %%xmm4,%%xmm0 \n"
423 "pslldq $0x8,%%xmm5 \n"
424 "movdqa %%xmm0,(%1) \n"
425 "por %%xmm5,%%xmm1 \n"
426 "psrldq $0x8,%%xmm2 \n"
427 "pslldq $0x4,%%xmm3 \n"
428 "por %%xmm3,%%xmm2 \n"
429 "movdqa %%xmm1,0x10(%1) \n"
430 "movdqa %%xmm2,0x20(%1) \n"
431 "lea 0x30(%1),%1 \n"
432 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000433 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000434 : "+r"(src), // %0
435 "+r"(dst), // %1
436 "+r"(pix) // %2
437 : "m"(kShuffleMaskARGBToRGB24) // %3
438 : "memory", "cc"
439#if defined(__SSE2__)
440 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
441#endif
442 );
443}
444
445void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000446 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000447 "movdqa %3,%%xmm6 \n"
448 "1: \n"
449 "movdqa (%0),%%xmm0 \n"
450 "movdqa 0x10(%0),%%xmm1 \n"
451 "movdqa 0x20(%0),%%xmm2 \n"
452 "movdqa 0x30(%0),%%xmm3 \n"
453 "lea 0x40(%0),%0 \n"
454 "pshufb %%xmm6,%%xmm0 \n"
455 "pshufb %%xmm6,%%xmm1 \n"
456 "pshufb %%xmm6,%%xmm2 \n"
457 "pshufb %%xmm6,%%xmm3 \n"
458 "movdqa %%xmm1,%%xmm4 \n"
459 "psrldq $0x4,%%xmm1 \n"
460 "pslldq $0xc,%%xmm4 \n"
461 "movdqa %%xmm2,%%xmm5 \n"
462 "por %%xmm4,%%xmm0 \n"
463 "pslldq $0x8,%%xmm5 \n"
464 "movdqa %%xmm0,(%1) \n"
465 "por %%xmm5,%%xmm1 \n"
466 "psrldq $0x8,%%xmm2 \n"
467 "pslldq $0x4,%%xmm3 \n"
468 "por %%xmm3,%%xmm2 \n"
469 "movdqa %%xmm1,0x10(%1) \n"
470 "movdqa %%xmm2,0x20(%1) \n"
471 "lea 0x30(%1),%1 \n"
472 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000473 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000474 : "+r"(src), // %0
475 "+r"(dst), // %1
476 "+r"(pix) // %2
477 : "m"(kShuffleMaskARGBToRAW) // %3
478 : "memory", "cc"
479#if defined(__SSE2__)
480 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
481#endif
482 );
483}
484
485void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000486 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000487 "pcmpeqb %%xmm3,%%xmm3 \n"
488 "psrld $0x1b,%%xmm3 \n"
489 "pcmpeqb %%xmm4,%%xmm4 \n"
490 "psrld $0x1a,%%xmm4 \n"
491 "pslld $0x5,%%xmm4 \n"
492 "pcmpeqb %%xmm5,%%xmm5 \n"
493 "pslld $0xb,%%xmm5 \n"
494 "1: \n"
495 "movdqa (%0),%%xmm0 \n"
496 "movdqa %%xmm0,%%xmm1 \n"
497 "movdqa %%xmm0,%%xmm2 \n"
498 "pslld $0x8,%%xmm0 \n"
499 "psrld $0x3,%%xmm1 \n"
500 "psrld $0x5,%%xmm2 \n"
501 "psrad $0x10,%%xmm0 \n"
502 "pand %%xmm3,%%xmm1 \n"
503 "pand %%xmm4,%%xmm2 \n"
504 "pand %%xmm5,%%xmm0 \n"
505 "por %%xmm2,%%xmm1 \n"
506 "por %%xmm1,%%xmm0 \n"
507 "packssdw %%xmm0,%%xmm0 \n"
508 "lea 0x10(%0),%0 \n"
509 "movq %%xmm0,(%1) \n"
510 "lea 0x8(%1),%1 \n"
511 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000512 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000513 : "+r"(src), // %0
514 "+r"(dst), // %1
515 "+r"(pix) // %2
516 :
517 : "memory", "cc"
518#if defined(__SSE2__)
519 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
520#endif
521 );
522}
523
524void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000525 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000526 "pcmpeqb %%xmm4,%%xmm4 \n"
527 "psrld $0x1b,%%xmm4 \n"
528 "movdqa %%xmm4,%%xmm5 \n"
529 "pslld $0x5,%%xmm5 \n"
530 "movdqa %%xmm4,%%xmm6 \n"
531 "pslld $0xa,%%xmm6 \n"
532 "pcmpeqb %%xmm7,%%xmm7 \n"
533 "pslld $0xf,%%xmm7 \n"
534 "1: \n"
535 "movdqa (%0),%%xmm0 \n"
536 "movdqa %%xmm0,%%xmm1 \n"
537 "movdqa %%xmm0,%%xmm2 \n"
538 "movdqa %%xmm0,%%xmm3 \n"
539 "psrad $0x10,%%xmm0 \n"
540 "psrld $0x3,%%xmm1 \n"
541 "psrld $0x6,%%xmm2 \n"
542 "psrld $0x9,%%xmm3 \n"
543 "pand %%xmm7,%%xmm0 \n"
544 "pand %%xmm4,%%xmm1 \n"
545 "pand %%xmm5,%%xmm2 \n"
546 "pand %%xmm6,%%xmm3 \n"
547 "por %%xmm1,%%xmm0 \n"
548 "por %%xmm3,%%xmm2 \n"
549 "por %%xmm2,%%xmm0 \n"
550 "packssdw %%xmm0,%%xmm0 \n"
551 "lea 0x10(%0),%0 \n"
552 "movq %%xmm0,(%1) \n"
553 "lea 0x8(%1),%1 \n"
554 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000555 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000556 : "+r"(src), // %0
557 "+r"(dst), // %1
558 "+r"(pix) // %2
559 :
560 : "memory", "cc"
561#if defined(__SSE2__)
562 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
563#endif
564 );
565}
566
567void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000568 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000569 "pcmpeqb %%xmm4,%%xmm4 \n"
570 "psllw $0xc,%%xmm4 \n"
571 "movdqa %%xmm4,%%xmm3 \n"
572 "psrlw $0x8,%%xmm3 \n"
573 "1: \n"
574 "movdqa (%0),%%xmm0 \n"
575 "movdqa %%xmm0,%%xmm1 \n"
576 "pand %%xmm3,%%xmm0 \n"
577 "pand %%xmm4,%%xmm1 \n"
578 "psrlq $0x4,%%xmm0 \n"
579 "psrlq $0x8,%%xmm1 \n"
580 "por %%xmm1,%%xmm0 \n"
581 "packuswb %%xmm0,%%xmm0 \n"
582 "lea 0x10(%0),%0 \n"
583 "movq %%xmm0,(%1) \n"
584 "lea 0x8(%1),%1 \n"
585 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000586 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000587 : "+r"(src), // %0
588 "+r"(dst), // %1
589 "+r"(pix) // %2
590 :
591 : "memory", "cc"
592#if defined(__SSE2__)
593 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
594#endif
595 );
596}
597
fbarchard@google.comb6149762011-11-07 21:58:52 +0000598void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000599 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000600 "movdqa %4,%%xmm5 \n"
601 "movdqa %3,%%xmm4 \n"
602 "1: \n"
603 "movdqa (%0),%%xmm0 \n"
604 "movdqa 0x10(%0),%%xmm1 \n"
605 "movdqa 0x20(%0),%%xmm2 \n"
606 "movdqa 0x30(%0),%%xmm3 \n"
607 "pmaddubsw %%xmm4,%%xmm0 \n"
608 "pmaddubsw %%xmm4,%%xmm1 \n"
609 "pmaddubsw %%xmm4,%%xmm2 \n"
610 "pmaddubsw %%xmm4,%%xmm3 \n"
611 "lea 0x40(%0),%0 \n"
612 "phaddw %%xmm1,%%xmm0 \n"
613 "phaddw %%xmm3,%%xmm2 \n"
614 "psrlw $0x7,%%xmm0 \n"
615 "psrlw $0x7,%%xmm2 \n"
616 "packuswb %%xmm2,%%xmm0 \n"
617 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000618 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000619 "movdqa %%xmm0,(%1) \n"
620 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000621 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000622 : "+r"(src_argb), // %0
623 "+r"(dst_y), // %1
624 "+r"(pix) // %2
625 : "m"(kARGBToY), // %3
626 "m"(kAddY16) // %4
627 : "memory", "cc"
628#if defined(__SSE2__)
629 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
630#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000631 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000632}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000633
634void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000635 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000636 "movdqa %4,%%xmm5 \n"
637 "movdqa %3,%%xmm4 \n"
638 "1: \n"
639 "movdqu (%0),%%xmm0 \n"
640 "movdqu 0x10(%0),%%xmm1 \n"
641 "movdqu 0x20(%0),%%xmm2 \n"
642 "movdqu 0x30(%0),%%xmm3 \n"
643 "pmaddubsw %%xmm4,%%xmm0 \n"
644 "pmaddubsw %%xmm4,%%xmm1 \n"
645 "pmaddubsw %%xmm4,%%xmm2 \n"
646 "pmaddubsw %%xmm4,%%xmm3 \n"
647 "lea 0x40(%0),%0 \n"
648 "phaddw %%xmm1,%%xmm0 \n"
649 "phaddw %%xmm3,%%xmm2 \n"
650 "psrlw $0x7,%%xmm0 \n"
651 "psrlw $0x7,%%xmm2 \n"
652 "packuswb %%xmm2,%%xmm0 \n"
653 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000654 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000655 "movdqu %%xmm0,(%1) \n"
656 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000657 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000658 : "+r"(src_argb), // %0
659 "+r"(dst_y), // %1
660 "+r"(pix) // %2
661 : "m"(kARGBToY), // %3
662 "m"(kAddY16) // %4
663 : "memory", "cc"
664#if defined(__SSE2__)
665 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
666#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000667 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000668}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000669
fbarchard@google.com714050a2012-02-17 22:59:56 +0000670// TODO(fbarchard): pass xmm constants to single block of assembly.
671// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
672// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
673// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
674// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000675void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
676 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000677 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000678 "movdqa %0,%%xmm4 \n"
679 "movdqa %1,%%xmm3 \n"
680 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000681 :
682 : "m"(kARGBToU), // %0
683 "m"(kARGBToV), // %1
684 "m"(kAddUV128) // %2
685 :
686#if defined(__SSE2__)
687 "xmm3", "xmm4", "xmm5"
688#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000689 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000690 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000691 "sub %1,%2 \n"
692 "1: \n"
693 "movdqa (%0),%%xmm0 \n"
694 "movdqa 0x10(%0),%%xmm1 \n"
695 "movdqa 0x20(%0),%%xmm2 \n"
696 "movdqa 0x30(%0),%%xmm6 \n"
697 "pavgb (%0,%4,1),%%xmm0 \n"
698 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
699 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
700 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
701 "lea 0x40(%0),%0 \n"
702 "movdqa %%xmm0,%%xmm7 \n"
703 "shufps $0x88,%%xmm1,%%xmm0 \n"
704 "shufps $0xdd,%%xmm1,%%xmm7 \n"
705 "pavgb %%xmm7,%%xmm0 \n"
706 "movdqa %%xmm2,%%xmm7 \n"
707 "shufps $0x88,%%xmm6,%%xmm2 \n"
708 "shufps $0xdd,%%xmm6,%%xmm7 \n"
709 "pavgb %%xmm7,%%xmm2 \n"
710 "movdqa %%xmm0,%%xmm1 \n"
711 "movdqa %%xmm2,%%xmm6 \n"
712 "pmaddubsw %%xmm4,%%xmm0 \n"
713 "pmaddubsw %%xmm4,%%xmm2 \n"
714 "pmaddubsw %%xmm3,%%xmm1 \n"
715 "pmaddubsw %%xmm3,%%xmm6 \n"
716 "phaddw %%xmm2,%%xmm0 \n"
717 "phaddw %%xmm6,%%xmm1 \n"
718 "psraw $0x8,%%xmm0 \n"
719 "psraw $0x8,%%xmm1 \n"
720 "packsswb %%xmm1,%%xmm0 \n"
721 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000722 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000723 "movlps %%xmm0,(%1) \n"
724 "movhps %%xmm0,(%1,%2,1) \n"
725 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000726 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000727 : "+r"(src_argb0), // %0
728 "+r"(dst_u), // %1
729 "+r"(dst_v), // %2
730 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000731 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000732 : "memory", "cc"
733#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000734 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000735#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000736 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000737}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000738
739void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
740 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000741 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000742 "movdqa %0,%%xmm4 \n"
743 "movdqa %1,%%xmm3 \n"
744 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000745 :
746 : "m"(kARGBToU), // %0
747 "m"(kARGBToV), // %1
748 "m"(kAddUV128) // %2
749 :
750#if defined(__SSE2__)
751 "xmm3", "xmm4", "xmm5"
752#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000753 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000754 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000755 "sub %1,%2 \n"
756 "1: \n"
757 "movdqu (%0),%%xmm0 \n"
758 "movdqu 0x10(%0),%%xmm1 \n"
759 "movdqu 0x20(%0),%%xmm2 \n"
760 "movdqu 0x30(%0),%%xmm6 \n"
761 "movdqu (%0,%4,1),%%xmm7 \n"
762 "pavgb %%xmm7,%%xmm0 \n"
763 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
764 "pavgb %%xmm7,%%xmm1 \n"
765 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
766 "pavgb %%xmm7,%%xmm2 \n"
767 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
768 "pavgb %%xmm7,%%xmm6 \n"
769 "lea 0x40(%0),%0 \n"
770 "movdqa %%xmm0,%%xmm7 \n"
771 "shufps $0x88,%%xmm1,%%xmm0 \n"
772 "shufps $0xdd,%%xmm1,%%xmm7 \n"
773 "pavgb %%xmm7,%%xmm0 \n"
774 "movdqa %%xmm2,%%xmm7 \n"
775 "shufps $0x88,%%xmm6,%%xmm2 \n"
776 "shufps $0xdd,%%xmm6,%%xmm7 \n"
777 "pavgb %%xmm7,%%xmm2 \n"
778 "movdqa %%xmm0,%%xmm1 \n"
779 "movdqa %%xmm2,%%xmm6 \n"
780 "pmaddubsw %%xmm4,%%xmm0 \n"
781 "pmaddubsw %%xmm4,%%xmm2 \n"
782 "pmaddubsw %%xmm3,%%xmm1 \n"
783 "pmaddubsw %%xmm3,%%xmm6 \n"
784 "phaddw %%xmm2,%%xmm0 \n"
785 "phaddw %%xmm6,%%xmm1 \n"
786 "psraw $0x8,%%xmm0 \n"
787 "psraw $0x8,%%xmm1 \n"
788 "packsswb %%xmm1,%%xmm0 \n"
789 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000790 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000791 "movlps %%xmm0,(%1) \n"
792 "movhps %%xmm0,(%1,%2,1) \n"
793 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000794 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000795 : "+r"(src_argb0), // %0
796 "+r"(dst_u), // %1
797 "+r"(dst_v), // %2
798 "+rm"(width) // %3
799 : "r"(static_cast<intptr_t>(src_stride_argb))
800 : "memory", "cc"
801#if defined(__SSE2__)
802 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
803#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000804 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000805}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000806
// Converts a row of 4-byte pixels at src_bgra into one luma byte per pixel at
// dst_y, using the kBGRAToY coefficient vector and the kAddY16 offset vector
// (both defined elsewhere in this file).
//
// Each loop iteration reads 64 source bytes (16 pixels) and writes 16 output
// bytes. `pix` is decremented by 16 per iteration and the loop repeats while
// the result is > 0, so a count that is not a multiple of 16 is effectively
// rounded up to the next multiple of 16.
// All loads and the store use movdqa, which faults on addresses that are not
// 16-byte aligned; see the _Unaligned variant for arbitrary pointers.
fbarchard@google.com714050a2012-02-17 22:59:56 +0000807void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000808 asm volatile (
 // xmm5 = offset vector added after packing, xmm4 = per-channel coefficients.
fbarchard@google.com714050a2012-02-17 22:59:56 +0000809 "movdqa %4,%%xmm5 \n"
810 "movdqa %3,%%xmm4 \n"
811 "1: \n"
812 "movdqa (%0),%%xmm0 \n"
813 "movdqa 0x10(%0),%%xmm1 \n"
814 "movdqa 0x20(%0),%%xmm2 \n"
815 "movdqa 0x30(%0),%%xmm3 \n"
 // Unsigned pixel bytes times signed coefficients, adjacent pairs summed to
 // 16-bit words (two words per pixel).
816 "pmaddubsw %%xmm4,%%xmm0 \n"
817 "pmaddubsw %%xmm4,%%xmm1 \n"
818 "pmaddubsw %%xmm4,%%xmm2 \n"
819 "pmaddubsw %%xmm4,%%xmm3 \n"
820 "lea 0x40(%0),%0 \n"
 // Horizontal add folds the two words of each pixel into one sum per pixel.
821 "phaddw %%xmm1,%%xmm0 \n"
822 "phaddw %%xmm3,%%xmm2 \n"
 // Drop 7 fraction bits, then narrow 16 words to 16 unsigned bytes.
823 "psrlw $0x7,%%xmm0 \n"
824 "psrlw $0x7,%%xmm2 \n"
825 "packuswb %%xmm2,%%xmm0 \n"
826 "paddb %%xmm5,%%xmm0 \n"
 // The flags set by this sub are consumed by the jg below; the movdqa and lea
 // in between do not modify flags.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000827 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000828 "movdqa %%xmm0,(%1) \n"
829 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000830 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000831 : "+r"(src_bgra), // %0
832 "+r"(dst_y), // %1
833 "+r"(pix) // %2
834 : "m"(kBGRAToY), // %3
835 "m"(kAddY16) // %4
836 : "memory", "cc"
837#if defined(__SSE2__)
838 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000839#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000840 );
841}
842
// Same computation as BGRAToYRow_SSSE3, but pixel loads and the luma store use
// movdqu, so src_bgra and dst_y need not be 16-byte aligned. The two constant
// vectors (kBGRAToY, kAddY16) are still loaded with movdqa.
//
// Each iteration reads 64 source bytes (16 pixels) and writes 16 luma bytes;
// the loop repeats while `pix - 16` is > 0, so counts are effectively rounded
// up to a multiple of 16.
843void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000844 asm volatile (
 // xmm5 = offset vector added after packing, xmm4 = per-channel coefficients.
fbarchard@google.com714050a2012-02-17 22:59:56 +0000845 "movdqa %4,%%xmm5 \n"
846 "movdqa %3,%%xmm4 \n"
847 "1: \n"
848 "movdqu (%0),%%xmm0 \n"
849 "movdqu 0x10(%0),%%xmm1 \n"
850 "movdqu 0x20(%0),%%xmm2 \n"
851 "movdqu 0x30(%0),%%xmm3 \n"
 // Weighted sum per pixel: multiply-add bytes to words, then fold word pairs.
852 "pmaddubsw %%xmm4,%%xmm0 \n"
853 "pmaddubsw %%xmm4,%%xmm1 \n"
854 "pmaddubsw %%xmm4,%%xmm2 \n"
855 "pmaddubsw %%xmm4,%%xmm3 \n"
856 "lea 0x40(%0),%0 \n"
857 "phaddw %%xmm1,%%xmm0 \n"
858 "phaddw %%xmm3,%%xmm2 \n"
 // Drop 7 fraction bits, narrow to bytes, add the offset vector.
859 "psrlw $0x7,%%xmm0 \n"
860 "psrlw $0x7,%%xmm2 \n"
861 "packuswb %%xmm2,%%xmm0 \n"
862 "paddb %%xmm5,%%xmm0 \n"
 // Flags from this sub drive the jg below (movdqu/lea leave flags untouched).
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000863 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000864 "movdqu %%xmm0,(%1) \n"
865 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000866 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000867 : "+r"(src_bgra), // %0
868 "+r"(dst_y), // %1
869 "+r"(pix) // %2
870 : "m"(kBGRAToY), // %3
871 "m"(kAddY16) // %4
872 : "memory", "cc"
873#if defined(__SSE2__)
874 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
875#endif
876 );
877}
878
879void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
880 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000881 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000882 "movdqa %0,%%xmm4 \n"
883 "movdqa %1,%%xmm3 \n"
884 "movdqa %2,%%xmm5 \n"
885 :
886 : "m"(kBGRAToU), // %0
887 "m"(kBGRAToV), // %1
888 "m"(kAddUV128) // %2
889 :
890#if defined(__SSE2__)
891 "xmm3", "xmm4", "xmm5"
892#endif
893 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000894 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000895 "sub %1,%2 \n"
896 "1: \n"
897 "movdqa (%0),%%xmm0 \n"
898 "movdqa 0x10(%0),%%xmm1 \n"
899 "movdqa 0x20(%0),%%xmm2 \n"
900 "movdqa 0x30(%0),%%xmm6 \n"
901 "pavgb (%0,%4,1),%%xmm0 \n"
902 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
903 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
904 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
905 "lea 0x40(%0),%0 \n"
906 "movdqa %%xmm0,%%xmm7 \n"
907 "shufps $0x88,%%xmm1,%%xmm0 \n"
908 "shufps $0xdd,%%xmm1,%%xmm7 \n"
909 "pavgb %%xmm7,%%xmm0 \n"
910 "movdqa %%xmm2,%%xmm7 \n"
911 "shufps $0x88,%%xmm6,%%xmm2 \n"
912 "shufps $0xdd,%%xmm6,%%xmm7 \n"
913 "pavgb %%xmm7,%%xmm2 \n"
914 "movdqa %%xmm0,%%xmm1 \n"
915 "movdqa %%xmm2,%%xmm6 \n"
916 "pmaddubsw %%xmm4,%%xmm0 \n"
917 "pmaddubsw %%xmm4,%%xmm2 \n"
918 "pmaddubsw %%xmm3,%%xmm1 \n"
919 "pmaddubsw %%xmm3,%%xmm6 \n"
920 "phaddw %%xmm2,%%xmm0 \n"
921 "phaddw %%xmm6,%%xmm1 \n"
922 "psraw $0x8,%%xmm0 \n"
923 "psraw $0x8,%%xmm1 \n"
924 "packsswb %%xmm1,%%xmm0 \n"
925 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000926 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000927 "movlps %%xmm0,(%1) \n"
928 "movhps %%xmm0,(%1,%2,1) \n"
929 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000930 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000931 : "+r"(src_bgra0), // %0
932 "+r"(dst_u), // %1
933 "+r"(dst_v), // %2
934 "+rm"(width) // %3
935 : "r"(static_cast<intptr_t>(src_stride_bgra))
936 : "memory", "cc"
937#if defined(__SSE2__)
938 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
939#endif
940 );
941}
942
943void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
944 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000945 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000946 "movdqa %0,%%xmm4 \n"
947 "movdqa %1,%%xmm3 \n"
948 "movdqa %2,%%xmm5 \n"
949 :
950 : "m"(kBGRAToU), // %0
951 "m"(kBGRAToV), // %1
952 "m"(kAddUV128) // %2
953 :
954#if defined(__SSE2__)
955 "xmm3", "xmm4", "xmm5"
956#endif
957 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000958 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000959 "sub %1,%2 \n"
960 "1: \n"
961 "movdqu (%0),%%xmm0 \n"
962 "movdqu 0x10(%0),%%xmm1 \n"
963 "movdqu 0x20(%0),%%xmm2 \n"
964 "movdqu 0x30(%0),%%xmm6 \n"
965 "movdqu (%0,%4,1),%%xmm7 \n"
966 "pavgb %%xmm7,%%xmm0 \n"
967 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
968 "pavgb %%xmm7,%%xmm1 \n"
969 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
970 "pavgb %%xmm7,%%xmm2 \n"
971 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
972 "pavgb %%xmm7,%%xmm6 \n"
973 "lea 0x40(%0),%0 \n"
974 "movdqa %%xmm0,%%xmm7 \n"
975 "shufps $0x88,%%xmm1,%%xmm0 \n"
976 "shufps $0xdd,%%xmm1,%%xmm7 \n"
977 "pavgb %%xmm7,%%xmm0 \n"
978 "movdqa %%xmm2,%%xmm7 \n"
979 "shufps $0x88,%%xmm6,%%xmm2 \n"
980 "shufps $0xdd,%%xmm6,%%xmm7 \n"
981 "pavgb %%xmm7,%%xmm2 \n"
982 "movdqa %%xmm0,%%xmm1 \n"
983 "movdqa %%xmm2,%%xmm6 \n"
984 "pmaddubsw %%xmm4,%%xmm0 \n"
985 "pmaddubsw %%xmm4,%%xmm2 \n"
986 "pmaddubsw %%xmm3,%%xmm1 \n"
987 "pmaddubsw %%xmm3,%%xmm6 \n"
988 "phaddw %%xmm2,%%xmm0 \n"
989 "phaddw %%xmm6,%%xmm1 \n"
990 "psraw $0x8,%%xmm0 \n"
991 "psraw $0x8,%%xmm1 \n"
992 "packsswb %%xmm1,%%xmm0 \n"
993 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000994 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000995 "movlps %%xmm0,(%1) \n"
996 "movhps %%xmm0,(%1,%2,1) \n"
997 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000998 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000999 : "+r"(src_bgra0), // %0
1000 "+r"(dst_u), // %1
1001 "+r"(dst_v), // %2
1002 "+rm"(width) // %3
1003 : "r"(static_cast<intptr_t>(src_stride_bgra))
1004 : "memory", "cc"
1005#if defined(__SSE2__)
1006 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1007#endif
1008 );
1009}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001010
// Converts a row of 4-byte pixels at src_abgr into one luma byte per pixel at
// dst_y, using the kABGRToY coefficient vector and the kAddY16 offset vector
// (both defined elsewhere in this file).
//
// Each loop iteration reads 64 source bytes (16 pixels) and writes 16 output
// bytes; the loop repeats while `pix - 16` is > 0, so counts are effectively
// rounded up to a multiple of 16.
// All loads and the store use movdqa, which faults on addresses that are not
// 16-byte aligned; see the _Unaligned variant for arbitrary pointers.
1011void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001012 asm volatile (
 // xmm5 = offset vector added after packing, xmm4 = per-channel coefficients.
fbarchard@google.com714050a2012-02-17 22:59:56 +00001013 "movdqa %4,%%xmm5 \n"
1014 "movdqa %3,%%xmm4 \n"
1015 "1: \n"
1016 "movdqa (%0),%%xmm0 \n"
1017 "movdqa 0x10(%0),%%xmm1 \n"
1018 "movdqa 0x20(%0),%%xmm2 \n"
1019 "movdqa 0x30(%0),%%xmm3 \n"
 // Weighted sum per pixel: multiply-add bytes to words, then fold word pairs.
1020 "pmaddubsw %%xmm4,%%xmm0 \n"
1021 "pmaddubsw %%xmm4,%%xmm1 \n"
1022 "pmaddubsw %%xmm4,%%xmm2 \n"
1023 "pmaddubsw %%xmm4,%%xmm3 \n"
1024 "lea 0x40(%0),%0 \n"
1025 "phaddw %%xmm1,%%xmm0 \n"
1026 "phaddw %%xmm3,%%xmm2 \n"
 // Drop 7 fraction bits, narrow to bytes, add the offset vector.
1027 "psrlw $0x7,%%xmm0 \n"
1028 "psrlw $0x7,%%xmm2 \n"
1029 "packuswb %%xmm2,%%xmm0 \n"
1030 "paddb %%xmm5,%%xmm0 \n"
 // Flags from this sub drive the jg below (movdqa/lea leave flags untouched).
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001031 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001032 "movdqa %%xmm0,(%1) \n"
1033 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001034 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001035 : "+r"(src_abgr), // %0
1036 "+r"(dst_y), // %1
1037 "+r"(pix) // %2
1038 : "m"(kABGRToY), // %3
1039 "m"(kAddY16) // %4
1040 : "memory", "cc"
1041#if defined(__SSE2__)
1042 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1043#endif
1044 );
1045}
1046
// Same computation as ABGRToYRow_SSSE3, but pixel loads and the luma store use
// movdqu, so src_abgr and dst_y need not be 16-byte aligned. The two constant
// vectors (kABGRToY, kAddY16) are still loaded with movdqa.
//
// Each iteration reads 64 source bytes (16 pixels) and writes 16 luma bytes;
// the loop repeats while `pix - 16` is > 0, so counts are effectively rounded
// up to a multiple of 16.
1047void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001048 asm volatile (
 // xmm5 = offset vector added after packing, xmm4 = per-channel coefficients.
fbarchard@google.com714050a2012-02-17 22:59:56 +00001049 "movdqa %4,%%xmm5 \n"
1050 "movdqa %3,%%xmm4 \n"
1051 "1: \n"
1052 "movdqu (%0),%%xmm0 \n"
1053 "movdqu 0x10(%0),%%xmm1 \n"
1054 "movdqu 0x20(%0),%%xmm2 \n"
1055 "movdqu 0x30(%0),%%xmm3 \n"
 // Weighted sum per pixel: multiply-add bytes to words, then fold word pairs.
1056 "pmaddubsw %%xmm4,%%xmm0 \n"
1057 "pmaddubsw %%xmm4,%%xmm1 \n"
1058 "pmaddubsw %%xmm4,%%xmm2 \n"
1059 "pmaddubsw %%xmm4,%%xmm3 \n"
1060 "lea 0x40(%0),%0 \n"
1061 "phaddw %%xmm1,%%xmm0 \n"
1062 "phaddw %%xmm3,%%xmm2 \n"
 // Drop 7 fraction bits, narrow to bytes, add the offset vector.
1063 "psrlw $0x7,%%xmm0 \n"
1064 "psrlw $0x7,%%xmm2 \n"
1065 "packuswb %%xmm2,%%xmm0 \n"
1066 "paddb %%xmm5,%%xmm0 \n"
 // Flags from this sub drive the jg below (movdqu/lea leave flags untouched).
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001067 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001068 "movdqu %%xmm0,(%1) \n"
1069 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001070 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001071 : "+r"(src_abgr), // %0
1072 "+r"(dst_y), // %1
1073 "+r"(pix) // %2
1074 : "m"(kABGRToY), // %3
1075 "m"(kAddY16) // %4
1076 : "memory", "cc"
1077#if defined(__SSE2__)
1078 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1079#endif
1080 );
1081}
1082
1083void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1084 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001085 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001086 "movdqa %0,%%xmm4 \n"
1087 "movdqa %1,%%xmm3 \n"
1088 "movdqa %2,%%xmm5 \n"
1089 :
1090 : "m"(kABGRToU), // %0
1091 "m"(kABGRToV), // %1
1092 "m"(kAddUV128) // %2
1093 :
1094#if defined(__SSE2__)
1095 "xmm3", "xmm4", "xmm5"
1096#endif
1097 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001098 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001099 "sub %1,%2 \n"
1100 "1: \n"
1101 "movdqa (%0),%%xmm0 \n"
1102 "movdqa 0x10(%0),%%xmm1 \n"
1103 "movdqa 0x20(%0),%%xmm2 \n"
1104 "movdqa 0x30(%0),%%xmm6 \n"
1105 "pavgb (%0,%4,1),%%xmm0 \n"
1106 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1107 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1108 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1109 "lea 0x40(%0),%0 \n"
1110 "movdqa %%xmm0,%%xmm7 \n"
1111 "shufps $0x88,%%xmm1,%%xmm0 \n"
1112 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1113 "pavgb %%xmm7,%%xmm0 \n"
1114 "movdqa %%xmm2,%%xmm7 \n"
1115 "shufps $0x88,%%xmm6,%%xmm2 \n"
1116 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1117 "pavgb %%xmm7,%%xmm2 \n"
1118 "movdqa %%xmm0,%%xmm1 \n"
1119 "movdqa %%xmm2,%%xmm6 \n"
1120 "pmaddubsw %%xmm4,%%xmm0 \n"
1121 "pmaddubsw %%xmm4,%%xmm2 \n"
1122 "pmaddubsw %%xmm3,%%xmm1 \n"
1123 "pmaddubsw %%xmm3,%%xmm6 \n"
1124 "phaddw %%xmm2,%%xmm0 \n"
1125 "phaddw %%xmm6,%%xmm1 \n"
1126 "psraw $0x8,%%xmm0 \n"
1127 "psraw $0x8,%%xmm1 \n"
1128 "packsswb %%xmm1,%%xmm0 \n"
1129 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001130 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001131 "movlps %%xmm0,(%1) \n"
1132 "movhps %%xmm0,(%1,%2,1) \n"
1133 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001134 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001135 : "+r"(src_abgr0), // %0
1136 "+r"(dst_u), // %1
1137 "+r"(dst_v), // %2
1138 "+rm"(width) // %3
1139 : "r"(static_cast<intptr_t>(src_stride_abgr))
1140 : "memory", "cc"
1141#if defined(__SSE2__)
1142 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1143#endif
1144 );
1145}
1146
1147void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1148 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001149 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001150 "movdqa %0,%%xmm4 \n"
1151 "movdqa %1,%%xmm3 \n"
1152 "movdqa %2,%%xmm5 \n"
1153 :
1154 : "m"(kABGRToU), // %0
1155 "m"(kABGRToV), // %1
1156 "m"(kAddUV128) // %2
1157 :
1158#if defined(__SSE2__)
1159 "xmm3", "xmm4", "xmm5"
1160#endif
1161 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001162 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001163 "sub %1,%2 \n"
1164 "1: \n"
1165 "movdqu (%0),%%xmm0 \n"
1166 "movdqu 0x10(%0),%%xmm1 \n"
1167 "movdqu 0x20(%0),%%xmm2 \n"
1168 "movdqu 0x30(%0),%%xmm6 \n"
1169 "movdqu (%0,%4,1),%%xmm7 \n"
1170 "pavgb %%xmm7,%%xmm0 \n"
1171 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1172 "pavgb %%xmm7,%%xmm1 \n"
1173 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1174 "pavgb %%xmm7,%%xmm2 \n"
1175 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1176 "pavgb %%xmm7,%%xmm6 \n"
1177 "lea 0x40(%0),%0 \n"
1178 "movdqa %%xmm0,%%xmm7 \n"
1179 "shufps $0x88,%%xmm1,%%xmm0 \n"
1180 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1181 "pavgb %%xmm7,%%xmm0 \n"
1182 "movdqa %%xmm2,%%xmm7 \n"
1183 "shufps $0x88,%%xmm6,%%xmm2 \n"
1184 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1185 "pavgb %%xmm7,%%xmm2 \n"
1186 "movdqa %%xmm0,%%xmm1 \n"
1187 "movdqa %%xmm2,%%xmm6 \n"
1188 "pmaddubsw %%xmm4,%%xmm0 \n"
1189 "pmaddubsw %%xmm4,%%xmm2 \n"
1190 "pmaddubsw %%xmm3,%%xmm1 \n"
1191 "pmaddubsw %%xmm3,%%xmm6 \n"
1192 "phaddw %%xmm2,%%xmm0 \n"
1193 "phaddw %%xmm6,%%xmm1 \n"
1194 "psraw $0x8,%%xmm0 \n"
1195 "psraw $0x8,%%xmm1 \n"
1196 "packsswb %%xmm1,%%xmm0 \n"
1197 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001198 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001199 "movlps %%xmm0,(%1) \n"
1200 "movhps %%xmm0,(%1,%2,1) \n"
1201 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001202 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001203 : "+r"(src_abgr0), // %0
1204 "+r"(dst_u), // %1
1205 "+r"(dst_v), // %2
1206 "+rm"(width) // %3
1207 : "r"(static_cast<intptr_t>(src_stride_abgr))
1208 : "memory", "cc"
1209#if defined(__SSE2__)
1210 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1211#endif
1212 );
1213}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001214
1215#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001216
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001217#ifdef HAS_I420TOARGBROW_SSSE3
// Fixed-point YUV-to-RGB coefficients, scaled by 64 (6 fraction bits) so that
// they fit in signed bytes. Every expression macro is parenthesized so it can
// be used safely inside a larger expression.
//
// U contribution to blue / green / red.
#define UB 127 /* 2.018 * 64 = 129 exceeds the int8 range, so clamp to 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

// V contribution to blue / green / red.
#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the value each channel's coefficients produce for U = V = 128.
// YUVTORGB multiplies the raw unsigned U/V bytes by the coefficients and then
// subtracts this bias, which is equivalent to centering U and V on 128 first.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

// Luma gain, same 6-bit fixed-point scale.
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

// OMITFP decorates the YUV-to-RGB row functions. On Apple and x86-64 builds it
// expands to nothing; elsewhere it asks GCC to omit the frame pointer for that
// function (presumably to free a register for the six asm operands on 32-bit
// x86 -- confirm before removing).
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001238
// Constant table shared by the YUV-to-RGB row functions below. Those functions
// pass &kYuvConstants.kUVToB to their asm in a register and address the
// members by fixed byte offsets (0, 16, 32, 48, 64, 80, 96, 112), so the member
// order and sizes here must not change without updating the asm.
// NOTE(review): the offsets assume vec8 and vec16 are each 16 bytes and that
// SIMD_ALIGNED gives 16-byte alignment; both are declared outside this chunk.
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001239struct {
1240 vec8 kUVToB;
1241 vec8 kUVToG;
1242 vec8 kUVToR;
1243 vec16 kUVBiasB;
1244 vec16 kUVBiasG;
1245 vec16 kUVBiasR;
1246 vec16 kYSub16;
1247 vec16 kYToRgb;
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001248} CONST SIMD_ALIGNED(kYuvConstants) = {
 // Interleaved (U, V) coefficient pairs, matching the interleaved UV bytes the
 // asm feeds to pmaddubsw.
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001249 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1250 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1251 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
 // Per-channel bias subtracted from the UV products.
1252 { BB, BB, BB, BB, BB, BB, BB, BB },
1253 { BG, BG, BG, BG, BG, BG, BG, BG },
1254 { BR, BR, BR, BR, BR, BR, BR, BR },
 // Luma offset (subtracted from Y) and luma gain (multiplied afterwards).
1255 { 16, 16, 16, 16, 16, 16, 16, 16 },
1256 { YG, YG, YG, YG, YG, YG, YG, YG }
1257};
1258
1259// Convert 8 pixels
// Asm fragment expanded inside the I420To*Row functions. It expects:
//   %0 = Y pointer (8 bytes read, then advanced by 8)
//   %1 = U pointer (4 bytes read, then advanced by 4)
//   %2 = byte offset from the U pointer to the V data (4 bytes read)
//   %5 = address of kYuvConstants.kUVToB (members addressed at +0 .. +112)
//   xmm4 = all zeros (used to widen Y bytes to words)
// Each U/V pair is duplicated so that it applies to two horizontally adjacent
// pixels. On exit the low 8 bytes of xmm0, xmm1 and xmm2 hold the saturated
// B, G and R bytes of the 8 pixels; xmm3 is clobbered.
// Only the comment lines above the #define are safe here: a // comment inside
// the macro body would swallow the line continuation.
fbarchard@google.comb6149762011-11-07 21:58:52 +00001260#define YUVTORGB \
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001261 "movd (%1),%%xmm0 \n" \
1262 "movd (%1,%2,1),%%xmm1 \n" \
1263 "lea 0x4(%1),%1 \n" \
1264 "punpcklbw %%xmm1,%%xmm0 \n" \
1265 "punpcklwd %%xmm0,%%xmm0 \n" \
1266 "movdqa %%xmm0,%%xmm1 \n" \
1267 "movdqa %%xmm0,%%xmm2 \n" \
1268 "pmaddubsw (%5),%%xmm0 \n" \
1269 "pmaddubsw 16(%5),%%xmm1 \n" \
1270 "pmaddubsw 32(%5),%%xmm2 \n" \
1271 "psubw 48(%5),%%xmm0 \n" \
1272 "psubw 64(%5),%%xmm1 \n" \
1273 "psubw 80(%5),%%xmm2 \n" \
1274 "movq (%0),%%xmm3 \n" \
1275 "lea 0x8(%0),%0 \n" \
1276 "punpcklbw %%xmm4,%%xmm3 \n" \
1277 "psubsw 96(%5),%%xmm3 \n" \
1278 "pmullw 112(%5),%%xmm3 \n" \
1279 "paddsw %%xmm3,%%xmm0 \n" \
1280 "paddsw %%xmm3,%%xmm1 \n" \
1281 "paddsw %%xmm3,%%xmm2 \n" \
1282 "psraw $0x6,%%xmm0 \n" \
1283 "psraw $0x6,%%xmm1 \n" \
1284 "psraw $0x6,%%xmm2 \n" \
1285 "packuswb %%xmm0,%%xmm0 \n" \
1286 "packuswb %%xmm1,%%xmm1 \n" \
1287 "packuswb %%xmm2,%%xmm2 \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001288
// Converts one row of planar Y/U/V (one U and one V byte per two pixels) to
// 4-byte pixels stored in memory as B, G, R, 0xff.
//
// Each iteration consumes 8 Y bytes, 4 U bytes and 4 V bytes and writes 32
// bytes to rgb_buf; the loop repeats while `width - 8` is > 0, so widths are
// effectively rounded up to a multiple of 8.
// The stores use movdqa, so rgb_buf must be 16-byte aligned; see the
// _Unaligned variant otherwise.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001289void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
1290 const uint8* u_buf,
1291 const uint8* v_buf,
1292 uint8* rgb_buf,
1293 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001294 asm volatile (
 // v_buf becomes an offset from u_buf; xmm5 = all ones (alpha), xmm4 = zero.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001295 "sub %1,%2 \n"
1296 "pcmpeqb %%xmm5,%%xmm5 \n"
1297 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001298 "1: \n"
1299 YUVTORGB
 // Weave B,G (xmm0) with R,0xff (xmm2) into 8 four-byte pixels.
1300 "punpcklbw %%xmm1,%%xmm0 \n"
1301 "punpcklbw %%xmm5,%%xmm2 \n"
1302 "movdqa %%xmm0,%%xmm1 \n"
1303 "punpcklwd %%xmm2,%%xmm0 \n"
1304 "punpckhwd %%xmm2,%%xmm1 \n"
1305 "movdqa %%xmm0,(%3) \n"
1306 "movdqa %%xmm1,0x10(%3) \n"
1307 "lea 0x20(%3),%3 \n"
1308 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001309 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001310 : "+r"(y_buf), // %0
1311 "+r"(u_buf), // %1
1312 "+r"(v_buf), // %2
1313 "+r"(rgb_buf), // %3
1314 "+rm"(width) // %4
1315 : "r"(&kYuvConstants.kUVToB) // %5
1316 : "memory", "cc"
1317#if defined(__SSE2__)
1318 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1319#endif
1320 );
1321}
1322
// Converts one row of planar Y/U/V (one U and one V byte per two pixels) to
// 4-byte pixels stored in memory as 0xff, R, G, B.
//
// Each iteration consumes 8 Y bytes, 4 U bytes and 4 V bytes and writes 32
// bytes to rgb_buf; the loop repeats while `width - 8` is > 0, so widths are
// effectively rounded up to a multiple of 8.
// The stores use movdqa, so rgb_buf must be 16-byte aligned; see the
// _Unaligned variant otherwise.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001323void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
1324 const uint8* u_buf,
1325 const uint8* v_buf,
1326 uint8* rgb_buf,
1327 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001328 asm volatile (
 // v_buf becomes an offset from u_buf; xmm4 = zero for widening Y.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001329 "sub %1,%2 \n"
1330 "pcmpeqb %%xmm5,%%xmm5 \n"
1331 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001332 "1: \n"
1333 YUVTORGB
 // xmm5 is overwritten by the weave below, so the all-ones alpha source is
 // regenerated on every iteration.
1334 "pcmpeqb %%xmm5,%%xmm5 \n"
 // Weave 0xff,R (xmm5) with G,B (xmm1) into 8 four-byte pixels.
1335 "punpcklbw %%xmm0,%%xmm1 \n"
1336 "punpcklbw %%xmm2,%%xmm5 \n"
1337 "movdqa %%xmm5,%%xmm0 \n"
1338 "punpcklwd %%xmm1,%%xmm5 \n"
1339 "punpckhwd %%xmm1,%%xmm0 \n"
1340 "movdqa %%xmm5,(%3) \n"
1341 "movdqa %%xmm0,0x10(%3) \n"
1342 "lea 0x20(%3),%3 \n"
1343 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001344 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001345 : "+r"(y_buf), // %0
1346 "+r"(u_buf), // %1
1347 "+r"(v_buf), // %2
1348 "+r"(rgb_buf), // %3
1349 "+rm"(width) // %4
1350 : "r"(&kYuvConstants.kUVToB) // %5
1351 : "memory", "cc"
1352#if defined(__SSE2__)
1353 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1354#endif
1355 );
1356}
1357
// Converts one row of planar Y/U/V (one U and one V byte per two pixels) to
// 4-byte pixels stored in memory as R, G, B, 0xff.
//
// Each iteration consumes 8 Y bytes, 4 U bytes and 4 V bytes and writes 32
// bytes to rgb_buf; the loop repeats while `width - 8` is > 0, so widths are
// effectively rounded up to a multiple of 8.
// The stores use movdqa, so rgb_buf must be 16-byte aligned; see the
// _Unaligned variant otherwise.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001358void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
1359 const uint8* u_buf,
1360 const uint8* v_buf,
1361 uint8* rgb_buf,
1362 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001363 asm volatile (
 // v_buf becomes an offset from u_buf; xmm5 = all ones (alpha), xmm4 = zero.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001364 "sub %1,%2 \n"
1365 "pcmpeqb %%xmm5,%%xmm5 \n"
1366 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001367 "1: \n"
1368 YUVTORGB
 // Weave R,G (xmm2) with B,0xff (xmm0) into 8 four-byte pixels.
1369 "punpcklbw %%xmm1,%%xmm2 \n"
1370 "punpcklbw %%xmm5,%%xmm0 \n"
1371 "movdqa %%xmm2,%%xmm1 \n"
1372 "punpcklwd %%xmm0,%%xmm2 \n"
1373 "punpckhwd %%xmm0,%%xmm1 \n"
1374 "movdqa %%xmm2,(%3) \n"
1375 "movdqa %%xmm1,0x10(%3) \n"
1376 "lea 0x20(%3),%3 \n"
1377 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001378 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001379 : "+r"(y_buf), // %0
1380 "+r"(u_buf), // %1
1381 "+r"(v_buf), // %2
1382 "+r"(rgb_buf), // %3
1383 "+rm"(width) // %4
1384 : "r"(&kYuvConstants.kUVToB) // %5
1385 : "memory", "cc"
1386#if defined(__SSE2__)
1387 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1388#endif
1389 );
1390}
1391
// Same conversion as I420ToARGBRow_SSSE3 (output bytes B, G, R, 0xff per
// pixel), but the two 16-byte stores use movdqu, so rgb_buf need not be
// 16-byte aligned.
//
// Each iteration consumes 8 Y bytes, 4 U bytes and 4 V bytes and writes 32
// bytes; the loop repeats while `width - 8` is > 0.
fbarchard@google.com952a5072012-03-30 18:10:50 +00001392void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1393 const uint8* u_buf,
1394 const uint8* v_buf,
1395 uint8* rgb_buf,
1396 int width) {
1397 asm volatile (
 // v_buf becomes an offset from u_buf; xmm5 = all ones (alpha), xmm4 = zero.
1398 "sub %1,%2 \n"
1399 "pcmpeqb %%xmm5,%%xmm5 \n"
1400 "pxor %%xmm4,%%xmm4 \n"
1401 "1: \n"
1402 YUVTORGB
 // Weave B,G (xmm0) with R,0xff (xmm2) into 8 four-byte pixels.
1403 "punpcklbw %%xmm1,%%xmm0 \n"
1404 "punpcklbw %%xmm5,%%xmm2 \n"
1405 "movdqa %%xmm0,%%xmm1 \n"
1406 "punpcklwd %%xmm2,%%xmm0 \n"
1407 "punpckhwd %%xmm2,%%xmm1 \n"
1408 "movdqu %%xmm0,(%3) \n"
1409 "movdqu %%xmm1,0x10(%3) \n"
1410 "lea 0x20(%3),%3 \n"
1411 "sub $0x8,%4 \n"
1412 "jg 1b \n"
1413 : "+r"(y_buf), // %0
1414 "+r"(u_buf), // %1
1415 "+r"(v_buf), // %2
1416 "+r"(rgb_buf), // %3
1417 "+rm"(width) // %4
1418 : "r"(&kYuvConstants.kUVToB) // %5
1419 : "memory", "cc"
1420#if defined(__SSE2__)
1421 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1422#endif
1423 );
1424}
1425
// Same conversion as I420ToBGRARow_SSSE3 (output bytes 0xff, R, G, B per
// pixel), but the two 16-byte stores use movdqu, so rgb_buf need not be
// 16-byte aligned.
//
// Each iteration consumes 8 Y bytes, 4 U bytes and 4 V bytes and writes 32
// bytes; the loop repeats while `width - 8` is > 0.
1426void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
1427 const uint8* u_buf,
1428 const uint8* v_buf,
1429 uint8* rgb_buf,
1430 int width) {
1431 asm volatile (
 // v_buf becomes an offset from u_buf; xmm4 = zero for widening Y.
1432 "sub %1,%2 \n"
1433 "pcmpeqb %%xmm5,%%xmm5 \n"
1434 "pxor %%xmm4,%%xmm4 \n"
1435 "1: \n"
1436 YUVTORGB
 // xmm5 is overwritten by the weave below, so the all-ones alpha source is
 // regenerated on every iteration.
1437 "pcmpeqb %%xmm5,%%xmm5 \n"
 // Weave 0xff,R (xmm5) with G,B (xmm1) into 8 four-byte pixels.
1438 "punpcklbw %%xmm0,%%xmm1 \n"
1439 "punpcklbw %%xmm2,%%xmm5 \n"
1440 "movdqa %%xmm5,%%xmm0 \n"
1441 "punpcklwd %%xmm1,%%xmm5 \n"
1442 "punpckhwd %%xmm1,%%xmm0 \n"
1443 "movdqu %%xmm5,(%3) \n"
1444 "movdqu %%xmm0,0x10(%3) \n"
1445 "lea 0x20(%3),%3 \n"
1446 "sub $0x8,%4 \n"
1447 "jg 1b \n"
1448 : "+r"(y_buf), // %0
1449 "+r"(u_buf), // %1
1450 "+r"(v_buf), // %2
1451 "+r"(rgb_buf), // %3
1452 "+rm"(width) // %4
1453 : "r"(&kYuvConstants.kUVToB) // %5
1454 : "memory", "cc"
1455#if defined(__SSE2__)
1456 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1457#endif
1458 );
1459}
1460
// Same conversion as I420ToABGRRow_SSSE3 (output bytes R, G, B, 0xff per
// pixel), but the two 16-byte stores use movdqu, so rgb_buf need not be
// 16-byte aligned.
//
// Each iteration consumes 8 Y bytes, 4 U bytes and 4 V bytes and writes 32
// bytes; the loop repeats while `width - 8` is > 0.
1461void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
1462 const uint8* u_buf,
1463 const uint8* v_buf,
1464 uint8* rgb_buf,
1465 int width) {
1466 asm volatile (
 // v_buf becomes an offset from u_buf; xmm5 = all ones (alpha), xmm4 = zero.
1467 "sub %1,%2 \n"
1468 "pcmpeqb %%xmm5,%%xmm5 \n"
1469 "pxor %%xmm4,%%xmm4 \n"
1470 "1: \n"
1471 YUVTORGB
 // Weave R,G (xmm2) with B,0xff (xmm0) into 8 four-byte pixels.
1472 "punpcklbw %%xmm1,%%xmm2 \n"
1473 "punpcklbw %%xmm5,%%xmm0 \n"
1474 "movdqa %%xmm2,%%xmm1 \n"
1475 "punpcklwd %%xmm0,%%xmm2 \n"
1476 "punpckhwd %%xmm0,%%xmm1 \n"
1477 "movdqu %%xmm2,(%3) \n"
1478 "movdqu %%xmm1,0x10(%3) \n"
1479 "lea 0x20(%3),%3 \n"
1480 "sub $0x8,%4 \n"
1481 "jg 1b \n"
1482 : "+r"(y_buf), // %0
1483 "+r"(u_buf), // %1
1484 "+r"(v_buf), // %2
1485 "+r"(rgb_buf), // %3
1486 "+rm"(width) // %4
1487 : "r"(&kYuvConstants.kUVToB) // %5
1488 : "memory", "cc"
1489#if defined(__SSE2__)
1490 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1491#endif
1492 );
1493}
1494
// Converts one row of planar Y/U/V with one U and one V byte per pixel to
// 4-byte pixels stored in memory as B, G, R, 0xff.
//
// Unlike the I420 variants this does not duplicate the UV pairs (there is no
// punpcklwd of the UV register onto itself), so each iteration handles only 4
// pixels: 4 Y, 4 U and 4 V bytes in, 16 bytes out. The loop repeats while
// `width - 4` is > 0, so widths are effectively rounded up to a multiple of 4.
// The store uses movdqa, so rgb_buf must be 16-byte aligned.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001495void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1496 const uint8* u_buf,
1497 const uint8* v_buf,
1498 uint8* rgb_buf,
1499 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001500 asm volatile (
 // v_buf becomes an offset from u_buf; xmm5 = all ones (alpha), xmm4 = zero.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001501 "sub %1,%2 \n"
1502 "pcmpeqb %%xmm5,%%xmm5 \n"
1503 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001504 "1: \n"
 // Load and interleave 4 U and 4 V bytes, then apply the B/G/R coefficients
 // and subtract the per-channel bias (table members at %5 + 0 .. + 80).
1505 "movd (%1),%%xmm0 \n"
1506 "movd (%1,%2,1),%%xmm1 \n"
1507 "lea 0x4(%1),%1 \n"
1508 "punpcklbw %%xmm1,%%xmm0 \n"
1509 "movdqa %%xmm0,%%xmm1 \n"
1510 "movdqa %%xmm0,%%xmm2 \n"
1511 "pmaddubsw (%5),%%xmm0 \n"
1512 "pmaddubsw 16(%5),%%xmm1 \n"
1513 "pmaddubsw 32(%5),%%xmm2 \n"
1514 "psubw 48(%5),%%xmm0 \n"
1515 "psubw 64(%5),%%xmm1 \n"
1516 "psubw 80(%5),%%xmm2 \n"
 // Load 4 Y bytes, widen to words, subtract the luma offset, apply the gain.
1517 "movd (%0),%%xmm3 \n"
1518 "lea 0x4(%0),%0 \n"
1519 "punpcklbw %%xmm4,%%xmm3 \n"
1520 "psubsw 96(%5),%%xmm3 \n"
1521 "pmullw 112(%5),%%xmm3 \n"
 // Add luma to each channel, drop the 6 fraction bits, saturate to bytes.
1522 "paddsw %%xmm3,%%xmm0 \n"
1523 "paddsw %%xmm3,%%xmm1 \n"
1524 "paddsw %%xmm3,%%xmm2 \n"
1525 "psraw $0x6,%%xmm0 \n"
1526 "psraw $0x6,%%xmm1 \n"
1527 "psraw $0x6,%%xmm2 \n"
1528 "packuswb %%xmm0,%%xmm0 \n"
1529 "packuswb %%xmm1,%%xmm1 \n"
1530 "packuswb %%xmm2,%%xmm2 \n"
 // Weave B,G with R,0xff into 4 four-byte pixels.
1531 "punpcklbw %%xmm1,%%xmm0 \n"
1532 "punpcklbw %%xmm5,%%xmm2 \n"
1533 "punpcklwd %%xmm2,%%xmm0 \n"
 // Flags from this sub drive the jg below (movdqa/lea leave flags untouched).
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001534 "sub $0x4,%4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001535 "movdqa %%xmm0,(%3) \n"
1536 "lea 0x10(%3),%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001537 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001538 : "+r"(y_buf), // %0
1539 "+r"(u_buf), // %1
1540 "+r"(v_buf), // %2
1541 "+r"(rgb_buf), // %3
1542 "+rm"(width) // %4
1543 : "r"(&kYuvConstants.kUVToB) // %5
1544 : "memory", "cc"
1545#if defined(__SSE2__)
1546 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1547#endif
1548 );
1549}
1550#endif
1551
1552#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of luma bytes into grey 4-byte pixels: the scaled luma value
// is written to the three low bytes of each pixel and the top byte is forced
// to 0xff.
//
// Each iteration reads 8 bytes from y_buf and writes 32 bytes to rgb_buf; the
// loop repeats while `width - 8` is > 0, so widths are effectively rounded up
// to a multiple of 8. The stores use movdqa, so rgb_buf must be 16-byte
// aligned. eax is used as scratch for building the constants.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001553void YToARGBRow_SSE2(const uint8* y_buf,
1554 uint8* rgb_buf,
1555 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001556 asm volatile (
 // xmm4 = 0xff000000 in every dword (alpha mask).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001557 "pcmpeqb %%xmm4,%%xmm4 \n"
1558 "pslld $0x18,%%xmm4 \n"
 // xmm3 = 0x1000 in every word: 16 << 8, matching the y * 0x101 form the
 // luma takes after punpcklbw with itself below.
1559 "mov $0x10001000,%%eax \n"
1560 "movd %%eax,%%xmm3 \n"
1561 "pshufd $0x0,%%xmm3,%%xmm3 \n"
 // xmm2 = 0x012a (298) in every word: the multiplier for pmulhuw.
1562 "mov $0x012a012a,%%eax \n"
1563 "movd %%eax,%%xmm2 \n"
1564 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001565 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001566 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001567 "movq (%0),%%xmm0 \n"
1568 "lea 0x8(%0),%0 \n"
1569 "punpcklbw %%xmm0,%%xmm0 \n"
 // Unsigned saturating subtract, so luma below 16 clamps to 0.
1570 "psubusw %%xmm3,%%xmm0 \n"
1571 "pmulhuw %%xmm2,%%xmm0 \n"
1572 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001573
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001574 // Step 2: Weave into ARGB
 // Two rounds of self-unpack replicate each grey byte four times; por then
 // overwrites the top byte of every pixel with 0xff.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001575 "punpcklbw %%xmm0,%%xmm0 \n"
1576 "movdqa %%xmm0,%%xmm1 \n"
1577 "punpcklwd %%xmm0,%%xmm0 \n"
1578 "punpckhwd %%xmm1,%%xmm1 \n"
1579 "por %%xmm4,%%xmm0 \n"
1580 "por %%xmm4,%%xmm1 \n"
1581 "movdqa %%xmm0,(%1) \n"
1582 "movdqa %%xmm1,16(%1) \n"
1583 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001584
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001585 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001586 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00001587 : "+r"(y_buf), // %0
1588 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00001589 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001590 :
1591 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001592#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001593 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001594#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001595 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001596}
fbarchard@google.comb6149762011-11-07 21:58:52 +00001597#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001598
fbarchard@google.com42831e02012-01-21 02:54:17 +00001599#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001600// Shuffle table for reversing the bytes.
// Used as the pshufb control mask in MirrorRow_SSSE3: output byte i takes
// input byte 15 - i.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001601CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001602 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1603};
1604
// Writes the bytes of `src` to `dst` in reverse order, 16 bytes per iteration,
// using a pshufb byte reversal.
//
// The source is read from its end backwards (src + remaining - 16) while dst
// is written forwards. The loop repeats while `width - 16` is > 0. Both the
// load and the store use movdqa, so the addresses involved must be 16-byte
// aligned (which in practice also means width should be a multiple of 16).
fbarchard@google.com42831e02012-01-21 02:54:17 +00001605void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 // Widened to pointer size because it is used as an index register below.
fbarchard@google.com12d04832011-11-21 23:54:38 +00001606 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001607 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001608 "movdqa %3,%%xmm5 \n"
 // Bias src by -16 so that (src + temp_width) addresses the last 16 bytes of
 // the part of the row that has not been mirrored yet.
1609 "lea -0x10(%0),%0 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001610 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001611 "movdqa (%0,%2),%%xmm0 \n"
1612 "pshufb %%xmm5,%%xmm0 \n"
 // Flags from this sub drive the jg below (movdqa/lea leave flags untouched).
1613 "sub $0x10,%2 \n"
1614 "movdqa %%xmm0,(%1) \n"
1615 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001616 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00001617 : "+r"(src), // %0
1618 "+r"(dst), // %1
1619 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001620 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001621 : "memory", "cc"
1622#if defined(__SSE2__)
1623 , "xmm0", "xmm5"
1624#endif
1625 );
1626}
1627#endif
1628
fbarchard@google.com42831e02012-01-21 02:54:17 +00001629#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of `src` to `dst` in reverse order, 16 bytes per iteration,
// using only SSE2 instructions (no pshufb).
//
// The source is read from its end backwards while dst is written forwards; the
// loop repeats while `width - 16` is > 0. Loads and stores use movdqu, so no
// alignment is required.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001630void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
 // Widened to pointer size because it is used as an index register below.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001631 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001632 asm volatile (
 // Bias src by -16 so that (src + temp_width) addresses the last 16 bytes of
 // the part of the row that has not been mirrored yet.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001633 "lea -0x10(%0),%0 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001634 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001635 "movdqu (%0,%2),%%xmm0 \n"
 // Byte reversal in three stages: swap the two bytes of every word ...
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001636 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001637 "psllw $0x8,%%xmm0 \n"
1638 "psrlw $0x8,%%xmm1 \n"
1639 "por %%xmm1,%%xmm0 \n"
 // ... reverse the four words inside each 64-bit half ...
1640 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
1641 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
 // ... and swap the two 64-bit halves.
1642 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
1643 "sub $0x10,%2 \n"
1644 "movdqu %%xmm0,(%1) \n"
1645 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001646 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001647 : "+r"(src), // %0
1648 "+r"(dst), // %1
1649 "+r"(temp_width) // %2
1650 :
1651 : "memory", "cc"
1652#if defined(__SSE2__)
1653 , "xmm0", "xmm1"
1654#endif
1655 );
1656}
1657#endif
1658
fbarchard@google.com16a96642012-03-02 22:38:09 +00001659#ifdef HAS_MIRRORROW_UV_SSSE3
1660// Shuffle table for reversing the bytes of UV channels.
// Used as the pshufb control mask in MirrorRowUV_SSSE3: the low 8 output bytes
// are the even-indexed input bytes in reverse order, the high 8 output bytes
// are the odd-indexed input bytes in reverse order.
1661CONST uvec8 kShuffleMirrorUV = {
1662 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
1663};
// De-interleaves and mirrors a row of byte pairs: even-indexed source bytes go
// to dst_u and odd-indexed source bytes go to dst_v, both in reverse order.
//
// `width` counts pairs. Each iteration reads 16 source bytes (8 pairs) walking
// backwards from the end of the row, and writes 8 bytes to each destination
// walking forwards; the loop repeats while `width - 8` is > 0. The source load
// uses movdqa, so the addresses read must be 16-byte aligned.
1664void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
1665 int width) {
 // Widened to pointer size because it is used in address arithmetic below.
1666 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001667 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00001668 "movdqa %4,%%xmm1 \n"
 // Point src at the last 16 bytes of the row: src + 2 * width - 16.
1669 "lea -16(%0,%3,2),%0 \n"
 // Turn dst_v into an offset from dst_u so one pointer advances both rows.
1670 "sub %1,%2 \n"
1671 "1: \n"
1672 "movdqa (%0),%%xmm0 \n"
1673 "lea -16(%0),%0 \n"
1674 "pshufb %%xmm1,%%xmm0 \n"
 // Flags from this sub drive the jg below (the stores/lea leave flags alone).
1675 "sub $8,%3 \n"
1676 "movlpd %%xmm0,(%1) \n"
1677 "movhpd %%xmm0,(%1,%2) \n"
1678 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001679 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001680 : "+r"(src), // %0
1681 "+r"(dst_u), // %1
1682 "+r"(dst_v), // %2
1683 "+r"(temp_width) // %3
1684 : "m"(kShuffleMirrorUV) // %4
1685 : "memory", "cc"
1686#if defined(__SSE2__)
1687 , "xmm0", "xmm1"
1688#endif
1689 );
1690}
1691#endif
1692
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001693#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of byte pairs: even-indexed bytes of src_uv go to
// dst_u, odd-indexed bytes go to dst_v.
//
// `pix` counts pairs. Each iteration reads 32 source bytes and writes 16 bytes
// to each destination; the loop repeats while `pix - 16` is > 0, so counts are
// effectively rounded up to a multiple of 16. All loads and stores use movdqa,
// so src_uv, dst_u and dst_v must be 16-byte aligned.
1694void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001695 asm volatile (
 // xmm5 = 0x00ff in every word: mask that keeps the even-indexed bytes.
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001696 "pcmpeqb %%xmm5,%%xmm5 \n"
1697 "psrlw $0x8,%%xmm5 \n"
 // Turn dst_v into an offset from dst_u so one pointer advances both rows.
1698 "sub %1,%2 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001699 "1: \n"
1700 "movdqa (%0),%%xmm0 \n"
1701 "movdqa 0x10(%0),%%xmm1 \n"
1702 "lea 0x20(%0),%0 \n"
1703 "movdqa %%xmm0,%%xmm2 \n"
1704 "movdqa %%xmm1,%%xmm3 \n"
 // Even bytes: mask and pack.
1705 "pand %%xmm5,%%xmm0 \n"
1706 "pand %%xmm5,%%xmm1 \n"
1707 "packuswb %%xmm1,%%xmm0 \n"
 // Odd bytes: shift down and pack.
1708 "psrlw $0x8,%%xmm2 \n"
1709 "psrlw $0x8,%%xmm3 \n"
1710 "packuswb %%xmm3,%%xmm2 \n"
1711 "movdqa %%xmm0,(%1) \n"
1712 "movdqa %%xmm2,(%1,%2) \n"
1713 "lea 0x10(%1),%1 \n"
1714 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001715 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001716 : "+r"(src_uv), // %0
1717 "+r"(dst_u), // %1
1718 "+r"(dst_v), // %2
1719 "+r"(pix) // %3
1720 :
1721 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001722#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001723 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001724#endif
1725 );
1726}
1727#endif
1728
fbarchard@google.com19932f82012-02-16 22:19:14 +00001729#ifdef HAS_COPYROW_SSE2
1730void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001731 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001732 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00001733 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001734 "1: \n"
1735 "movdqa (%0),%%xmm0 \n"
1736 "movdqa 0x10(%0),%%xmm1 \n"
1737 "movdqa %%xmm0,(%0,%1) \n"
1738 "movdqa %%xmm1,0x10(%0,%1) \n"
1739 "lea 0x20(%0),%0 \n"
1740 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001741 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001742 : "+r"(src), // %0
1743 "+r"(dst), // %1
1744 "+r"(count) // %2
1745 :
1746 : "memory", "cc"
1747#if defined(__SSE2__)
1748 , "xmm0", "xmm1"
1749#endif
1750 );
1751}
1752#endif // HAS_COPYROW_SSE2
1753
1754#ifdef HAS_COPYROW_X86
1755void CopyRow_X86(const uint8* src, uint8* dst, int width) {
1756 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001757 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00001758 "shr $0x2,%2 \n"
1759 "rep movsl \n"
1760 : "+S"(src), // %0
1761 "+D"(dst), // %1
1762 "+c"(width_tmp) // %2
1763 :
1764 : "memory", "cc"
1765 );
1766}
1767#endif
1768
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001769#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001770void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001771 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001772 "pcmpeqb %%xmm5,%%xmm5 \n"
1773 "psrlw $0x8,%%xmm5 \n"
1774 "1: \n"
1775 "movdqa (%0),%%xmm0 \n"
1776 "movdqa 0x10(%0),%%xmm1 \n"
1777 "lea 0x20(%0),%0 \n"
1778 "pand %%xmm5,%%xmm0 \n"
1779 "pand %%xmm5,%%xmm1 \n"
1780 "packuswb %%xmm1,%%xmm0 \n"
1781 "movdqa %%xmm0,(%1) \n"
1782 "lea 0x10(%1),%1 \n"
1783 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001784 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001785 : "+r"(src_yuy2), // %0
1786 "+r"(dst_y), // %1
1787 "+r"(pix) // %2
1788 :
1789 : "memory", "cc"
1790#if defined(__SSE2__)
1791 , "xmm0", "xmm1", "xmm5"
1792#endif
1793 );
1794}
1795
1796void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
1797 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001798 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001799 "pcmpeqb %%xmm5,%%xmm5 \n"
1800 "psrlw $0x8,%%xmm5 \n"
1801 "sub %1,%2 \n"
1802 "1: \n"
1803 "movdqa (%0),%%xmm0 \n"
1804 "movdqa 0x10(%0),%%xmm1 \n"
1805 "movdqa (%0,%4,1),%%xmm2 \n"
1806 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1807 "lea 0x20(%0),%0 \n"
1808 "pavgb %%xmm2,%%xmm0 \n"
1809 "pavgb %%xmm3,%%xmm1 \n"
1810 "psrlw $0x8,%%xmm0 \n"
1811 "psrlw $0x8,%%xmm1 \n"
1812 "packuswb %%xmm1,%%xmm0 \n"
1813 "movdqa %%xmm0,%%xmm1 \n"
1814 "pand %%xmm5,%%xmm0 \n"
1815 "packuswb %%xmm0,%%xmm0 \n"
1816 "psrlw $0x8,%%xmm1 \n"
1817 "packuswb %%xmm1,%%xmm1 \n"
1818 "movq %%xmm0,(%1) \n"
1819 "movq %%xmm1,(%1,%2) \n"
1820 "lea 0x8(%1),%1 \n"
1821 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001822 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001823 : "+r"(src_yuy2), // %0
1824 "+r"(dst_u), // %1
1825 "+r"(dst_y), // %2
1826 "+r"(pix) // %3
1827 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1828 : "memory", "cc"
1829#if defined(__SSE2__)
1830 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1831#endif
1832 );
1833}
1834
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00001835
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001836void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
1837 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001838 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001839 "pcmpeqb %%xmm5,%%xmm5 \n"
1840 "psrlw $0x8,%%xmm5 \n"
1841 "1: \n"
1842 "movdqu (%0),%%xmm0 \n"
1843 "movdqu 0x10(%0),%%xmm1 \n"
1844 "lea 0x20(%0),%0 \n"
1845 "pand %%xmm5,%%xmm0 \n"
1846 "pand %%xmm5,%%xmm1 \n"
1847 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001848 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001849 "movdqu %%xmm0,(%1) \n"
1850 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001851 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001852 : "+r"(src_yuy2), // %0
1853 "+r"(dst_y), // %1
1854 "+r"(pix) // %2
1855 :
1856 : "memory", "cc"
1857#if defined(__SSE2__)
1858 , "xmm0", "xmm1", "xmm5"
1859#endif
1860 );
1861}
1862
1863void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
1864 int stride_yuy2,
1865 uint8* dst_u, uint8* dst_y,
1866 int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001867 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001868 "pcmpeqb %%xmm5,%%xmm5 \n"
1869 "psrlw $0x8,%%xmm5 \n"
1870 "sub %1,%2 \n"
1871 "1: \n"
1872 "movdqu (%0),%%xmm0 \n"
1873 "movdqu 0x10(%0),%%xmm1 \n"
1874 "movdqu (%0,%4,1),%%xmm2 \n"
1875 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1876 "lea 0x20(%0),%0 \n"
1877 "pavgb %%xmm2,%%xmm0 \n"
1878 "pavgb %%xmm3,%%xmm1 \n"
1879 "psrlw $0x8,%%xmm0 \n"
1880 "psrlw $0x8,%%xmm1 \n"
1881 "packuswb %%xmm1,%%xmm0 \n"
1882 "movdqa %%xmm0,%%xmm1 \n"
1883 "pand %%xmm5,%%xmm0 \n"
1884 "packuswb %%xmm0,%%xmm0 \n"
1885 "psrlw $0x8,%%xmm1 \n"
1886 "packuswb %%xmm1,%%xmm1 \n"
1887 "movq %%xmm0,(%1) \n"
1888 "movq %%xmm1,(%1,%2) \n"
1889 "lea 0x8(%1),%1 \n"
1890 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001891 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001892 : "+r"(src_yuy2), // %0
1893 "+r"(dst_u), // %1
1894 "+r"(dst_y), // %2
1895 "+r"(pix) // %3
1896 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1897 : "memory", "cc"
1898#if defined(__SSE2__)
1899 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1900#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001901 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001902}
1903
1904void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001905 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001906 "1: \n"
1907 "movdqa (%0),%%xmm0 \n"
1908 "movdqa 0x10(%0),%%xmm1 \n"
1909 "lea 0x20(%0),%0 \n"
1910 "psrlw $0x8,%%xmm0 \n"
1911 "psrlw $0x8,%%xmm1 \n"
1912 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001913 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001914 "movdqa %%xmm0,(%1) \n"
1915 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001916 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001917 : "+r"(src_uyvy), // %0
1918 "+r"(dst_y), // %1
1919 "+r"(pix) // %2
1920 :
1921 : "memory", "cc"
1922#if defined(__SSE2__)
1923 , "xmm0", "xmm1"
1924#endif
1925 );
1926}
1927
1928void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
1929 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001930 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001931 "pcmpeqb %%xmm5,%%xmm5 \n"
1932 "psrlw $0x8,%%xmm5 \n"
1933 "sub %1,%2 \n"
1934 "1: \n"
1935 "movdqa (%0),%%xmm0 \n"
1936 "movdqa 0x10(%0),%%xmm1 \n"
1937 "movdqa (%0,%4,1),%%xmm2 \n"
1938 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1939 "lea 0x20(%0),%0 \n"
1940 "pavgb %%xmm2,%%xmm0 \n"
1941 "pavgb %%xmm3,%%xmm1 \n"
1942 "pand %%xmm5,%%xmm0 \n"
1943 "pand %%xmm5,%%xmm1 \n"
1944 "packuswb %%xmm1,%%xmm0 \n"
1945 "movdqa %%xmm0,%%xmm1 \n"
1946 "pand %%xmm5,%%xmm0 \n"
1947 "packuswb %%xmm0,%%xmm0 \n"
1948 "psrlw $0x8,%%xmm1 \n"
1949 "packuswb %%xmm1,%%xmm1 \n"
1950 "movq %%xmm0,(%1) \n"
1951 "movq %%xmm1,(%1,%2) \n"
1952 "lea 0x8(%1),%1 \n"
1953 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001954 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001955 : "+r"(src_uyvy), // %0
1956 "+r"(dst_u), // %1
1957 "+r"(dst_y), // %2
1958 "+r"(pix) // %3
1959 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
1960 : "memory", "cc"
1961#if defined(__SSE2__)
1962 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1963#endif
1964 );
1965}
1966
1967void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
1968 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001969 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001970 "1: \n"
1971 "movdqu (%0),%%xmm0 \n"
1972 "movdqu 0x10(%0),%%xmm1 \n"
1973 "lea 0x20(%0),%0 \n"
1974 "psrlw $0x8,%%xmm0 \n"
1975 "psrlw $0x8,%%xmm1 \n"
1976 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001977 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001978 "movdqu %%xmm0,(%1) \n"
1979 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001980 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001981 : "+r"(src_uyvy), // %0
1982 "+r"(dst_y), // %1
1983 "+r"(pix) // %2
1984 :
1985 : "memory", "cc"
1986#if defined(__SSE2__)
1987 , "xmm0", "xmm1"
1988#endif
1989 );
1990}
1991
1992void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
1993 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001994 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001995 "pcmpeqb %%xmm5,%%xmm5 \n"
1996 "psrlw $0x8,%%xmm5 \n"
1997 "sub %1,%2 \n"
1998 "1: \n"
1999 "movdqu (%0),%%xmm0 \n"
2000 "movdqu 0x10(%0),%%xmm1 \n"
2001 "movdqu (%0,%4,1),%%xmm2 \n"
2002 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2003 "lea 0x20(%0),%0 \n"
2004 "pavgb %%xmm2,%%xmm0 \n"
2005 "pavgb %%xmm3,%%xmm1 \n"
2006 "pand %%xmm5,%%xmm0 \n"
2007 "pand %%xmm5,%%xmm1 \n"
2008 "packuswb %%xmm1,%%xmm0 \n"
2009 "movdqa %%xmm0,%%xmm1 \n"
2010 "pand %%xmm5,%%xmm0 \n"
2011 "packuswb %%xmm0,%%xmm0 \n"
2012 "psrlw $0x8,%%xmm1 \n"
2013 "packuswb %%xmm1,%%xmm1 \n"
2014 "movq %%xmm0,(%1) \n"
2015 "movq %%xmm1,(%1,%2) \n"
2016 "lea 0x8(%1),%1 \n"
2017 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002018 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002019 : "+r"(src_uyvy), // %0
2020 "+r"(dst_u), // %1
2021 "+r"(dst_y), // %2
2022 "+r"(pix) // %3
2023 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2024 : "memory", "cc"
2025#if defined(__SSE2__)
2026 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2027#endif
2028 );
2029}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002030#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002031
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002032#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002033// Blend 8 pixels at a time.
2034// src_argb0 unaligned.
2035// src_argb1 and dst_argb aligned to 16 bytes.
2036// width must be multiple of 4 pixels.
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002037void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.comc757f302012-04-03 00:49:16 +00002038 uint8* dst_argb, int width) {
2039 asm volatile (
2040 "pcmpeqb %%xmm7,%%xmm7 \n"
2041 "psrlw $0xf,%%xmm7 \n"
2042 "pcmpeqb %%xmm6,%%xmm6 \n"
2043 "psrlw $0x8,%%xmm6 \n"
2044 "pcmpeqb %%xmm5,%%xmm5 \n"
2045 "psllw $0x8,%%xmm5 \n"
2046 "pcmpeqb %%xmm4,%%xmm4 \n"
2047 "pslld $0x18,%%xmm4 \n"
2048
2049 // 8 pixel loop
2050 "1: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002051 "movdqu (%0),%%xmm3 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002052 "movdqa %%xmm3,%%xmm0 \n"
2053 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002054 "movdqu (%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002055 "psrlw $0x8,%%xmm3 \n"
2056 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2057 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2058 "pand %%xmm6,%%xmm2 \n"
2059 "paddw %%xmm7,%%xmm3 \n"
2060 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002061 "movdqu (%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002062 "psrlw $0x8,%%xmm1 \n"
2063 "por %%xmm4,%%xmm0 \n"
2064 "pmullw %%xmm3,%%xmm1 \n"
2065 "movdqu 0x10(%0),%%xmm3 \n"
2066 "lea 0x20(%0),%0 \n"
2067 "psrlw $0x8,%%xmm2 \n"
2068 "paddusb %%xmm2,%%xmm0 \n"
2069 "pand %%xmm5,%%xmm1 \n"
2070 "paddusb %%xmm1,%%xmm0 \n"
2071 "sub $0x4,%3 \n"
2072 "movdqa %%xmm0,(%2) \n"
2073 "jle 9f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002074 "movdqa %%xmm3,%%xmm0 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002075 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002076 "movdqu 0x10(%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002077 "psrlw $0x8,%%xmm3 \n"
2078 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2079 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2080 "pand %%xmm6,%%xmm2 \n"
2081 "paddw %%xmm7,%%xmm3 \n"
2082 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002083 "movdqu 0x10(%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002084 "lea 0x20(%1),%1 \n"
2085 "psrlw $0x8,%%xmm1 \n"
2086 "por %%xmm4,%%xmm0 \n"
2087 "pmullw %%xmm3,%%xmm1 \n"
2088 "psrlw $0x8,%%xmm2 \n"
2089 "paddusb %%xmm2,%%xmm0 \n"
2090 "pand %%xmm5,%%xmm1 \n"
2091 "paddusb %%xmm1,%%xmm0 \n"
2092 "sub $0x4,%3 \n"
2093 "movdqa %%xmm0,0x10(%2) \n"
2094 "lea 0x20(%2),%2 \n"
2095 "jg 1b \n"
2096 "9: \n"
2097 : "+r"(src_argb0), // %0
2098 "+r"(src_argb1), // %1
2099 "+r"(dst_argb), // %2
2100 "+r"(width) // %3
2101 :
2102 : "memory", "cc"
2103#if defined(__SSE2__)
2104 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2105#endif
2106 );
2107}
2108
2109// Blend 1 pixel at a time, unaligned
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002110void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com96af8702012-04-06 18:22:27 +00002111 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002112 asm volatile (
2113 "pcmpeqb %%xmm7,%%xmm7 \n"
2114 "psrlw $0xf,%%xmm7 \n"
2115 "pcmpeqb %%xmm6,%%xmm6 \n"
2116 "psrlw $0x8,%%xmm6 \n"
2117 "pcmpeqb %%xmm5,%%xmm5 \n"
2118 "psllw $0x8,%%xmm5 \n"
2119 "pcmpeqb %%xmm4,%%xmm4 \n"
2120 "pslld $0x18,%%xmm4 \n"
2121
2122 // 1 pixel loop
2123 "1: \n"
2124 "movd (%0),%%xmm3 \n"
2125 "lea 0x4(%0),%0 \n"
2126 "movdqa %%xmm3,%%xmm0 \n"
2127 "pxor %%xmm4,%%xmm3 \n"
2128 "movd (%1),%%xmm2 \n"
2129 "psrlw $0x8,%%xmm3 \n"
2130 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2131 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2132 "pand %%xmm6,%%xmm2 \n"
2133 "paddw %%xmm7,%%xmm3 \n"
2134 "pmullw %%xmm3,%%xmm2 \n"
2135 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002136 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002137 "psrlw $0x8,%%xmm1 \n"
2138 "por %%xmm4,%%xmm0 \n"
2139 "pmullw %%xmm3,%%xmm1 \n"
2140 "psrlw $0x8,%%xmm2 \n"
2141 "paddusb %%xmm2,%%xmm0 \n"
2142 "pand %%xmm5,%%xmm1 \n"
2143 "paddusb %%xmm1,%%xmm0 \n"
2144 "sub $0x1,%3 \n"
2145 "movd %%xmm0,(%2) \n"
2146 "lea 0x4(%2),%2 \n"
2147 "jg 1b \n"
2148 : "+r"(src_argb0), // %0
2149 "+r"(src_argb1), // %1
2150 "+r"(dst_argb), // %2
2151 "+r"(width) // %3
2152 :
2153 : "memory", "cc"
2154#if defined(__SSE2__)
2155 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2156#endif
2157 );
2158}
fbarchard@google.comc757f302012-04-03 00:49:16 +00002159#endif // HAS_ARGBBLENDROW_SSE2
2160
fbarchard@google.com96af8702012-04-06 18:22:27 +00002161#ifdef HAS_ARGBBLENDROW_SSSE3
2162// Shuffle table for reversing the bytes.
2163CONST uvec8 kShuffleAlpha = {
2164 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2165 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2166};
2167void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002168 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002169 asm volatile (
2170 "pcmpeqb %%xmm7,%%xmm7 \n"
2171 "psrlw $0xf,%%xmm7 \n"
2172 "pcmpeqb %%xmm6,%%xmm6 \n"
2173 "psrlw $0x8,%%xmm6 \n"
2174 "pcmpeqb %%xmm5,%%xmm5 \n"
2175 "psllw $0x8,%%xmm5 \n"
2176 "pcmpeqb %%xmm4,%%xmm4 \n"
2177 "pslld $0x18,%%xmm4 \n"
2178
2179 // 8 pixel loop
2180 "1: \n"
2181 "movdqu (%0),%%xmm3 \n"
2182 "movdqa %%xmm3,%%xmm0 \n"
2183 "pxor %%xmm4,%%xmm3 \n"
2184 "pshufb %4,%%xmm3 \n"
2185 "movdqu (%1),%%xmm2 \n"
2186 "pand %%xmm6,%%xmm2 \n"
2187 "paddw %%xmm7,%%xmm3 \n"
2188 "pmullw %%xmm3,%%xmm2 \n"
2189 "movdqu (%1),%%xmm1 \n"
2190 "psrlw $0x8,%%xmm1 \n"
2191 "por %%xmm4,%%xmm0 \n"
2192 "pmullw %%xmm3,%%xmm1 \n"
2193 "movdqu 0x10(%0),%%xmm3 \n"
2194 "lea 0x20(%0),%0 \n"
2195 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002196 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002197 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002198 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002199 "sub $0x4,%3 \n"
2200 "movdqa %%xmm0,(%2) \n"
2201 "jle 9f \n"
2202 "movdqa %%xmm3,%%xmm0 \n"
2203 "pxor %%xmm4,%%xmm3 \n"
2204 "movdqu 0x10(%1),%%xmm2 \n"
2205 "pshufb %4,%%xmm3 \n"
2206 "pand %%xmm6,%%xmm2 \n"
2207 "paddw %%xmm7,%%xmm3 \n"
2208 "pmullw %%xmm3,%%xmm2 \n"
2209 "movdqu 0x10(%1),%%xmm1 \n"
2210 "lea 0x20(%1),%1 \n"
2211 "psrlw $0x8,%%xmm1 \n"
2212 "por %%xmm4,%%xmm0 \n"
2213 "pmullw %%xmm3,%%xmm1 \n"
2214 "psrlw $0x8,%%xmm2 \n"
2215 "paddusb %%xmm2,%%xmm0 \n"
2216 "pand %%xmm5,%%xmm1 \n"
2217 "paddusb %%xmm1,%%xmm0 \n"
2218 "sub $0x4,%3 \n"
2219 "movdqa %%xmm0,0x10(%2) \n"
2220 "lea 0x20(%2),%2 \n"
2221 "jg 1b \n"
2222 "9: \n"
2223 : "+r"(src_argb0), // %0
2224 "+r"(src_argb1), // %1
2225 "+r"(dst_argb), // %2
2226 "+r"(width) // %3
2227 : "m"(kShuffleAlpha) // %4
2228 : "memory", "cc"
2229#if defined(__SSE2__)
2230 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2231#endif
2232 );
2233}
2234#endif // HAS_ARGBBLENDROW_SSSE3
2235
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002236#ifdef HAS_ARGBATTENUATE_SSE2
2237// Attenuate 4 pixels at a time.
2238// aligned to 16 bytes
2239void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2240 asm volatile (
2241 "sub %0,%1 \n"
2242 "pcmpeqb %%xmm4,%%xmm4 \n"
2243 "pslld $0x18,%%xmm4 \n"
2244 "pcmpeqb %%xmm5,%%xmm5 \n"
2245 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002246
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002247 // 4 pixel loop
2248 "1: \n"
2249 "movdqa (%0),%%xmm0 \n"
2250 "punpcklbw %%xmm0,%%xmm0 \n"
2251 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2252 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2253 "pmulhuw %%xmm2,%%xmm0 \n"
2254 "movdqa (%0),%%xmm1 \n"
2255 "punpckhbw %%xmm1,%%xmm1 \n"
2256 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2257 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2258 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002259 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002260 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002261 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002262 "psrlw $0x8,%%xmm1 \n"
2263 "packuswb %%xmm1,%%xmm0 \n"
2264 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002265 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002266 "sub $0x4,%2 \n"
2267 "movdqa %%xmm0,(%0,%1,1) \n"
2268 "lea 0x10(%0),%0 \n"
2269 "jg 1b \n"
2270 : "+r"(src_argb), // %0
2271 "+r"(dst_argb), // %1
2272 "+r"(width) // %2
2273 :
2274 : "memory", "cc"
2275#if defined(__SSE2__)
2276 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2277#endif
2278 );
2279}
2280#endif // HAS_ARGBATTENUATE_SSE2
2281
fbarchard@google.com810cd912012-04-20 20:15:27 +00002282#ifdef HAS_ARGBATTENUATE_SSSE3
2283// Shuffle table duplicating alpha
2284CONST uvec8 kShuffleAlpha0 = {
2285 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2286};
2287CONST uvec8 kShuffleAlpha1 = {
2288 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2289 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2290};
2291// Attenuate 4 pixels at a time.
2292// aligned to 16 bytes
2293void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2294 asm volatile (
2295 "sub %0,%1 \n"
2296 "pcmpeqb %%xmm3,%%xmm3 \n"
2297 "pslld $0x18,%%xmm3 \n"
2298 "movdqa %3,%%xmm4 \n"
2299 "movdqa %4,%%xmm5 \n"
2300
2301 // 4 pixel loop
2302 "1: \n"
2303 "movdqa (%0),%%xmm0 \n"
2304 "pshufb %%xmm4,%%xmm0 \n"
2305 "movdqa (%0),%%xmm1 \n"
2306 "punpcklbw %%xmm1,%%xmm1 \n"
2307 "pmulhuw %%xmm1,%%xmm0 \n"
2308 "movdqa (%0),%%xmm1 \n"
2309 "pshufb %%xmm5,%%xmm1 \n"
2310 "movdqa (%0),%%xmm2 \n"
2311 "punpckhbw %%xmm2,%%xmm2 \n"
2312 "pmulhuw %%xmm2,%%xmm1 \n"
2313 "movdqa (%0),%%xmm2 \n"
2314 "pand %%xmm3,%%xmm2 \n"
2315 "psrlw $0x8,%%xmm0 \n"
2316 "psrlw $0x8,%%xmm1 \n"
2317 "packuswb %%xmm1,%%xmm0 \n"
2318 "por %%xmm2,%%xmm0 \n"
2319 "sub $0x4,%2 \n"
2320 "movdqa %%xmm0,(%0,%1,1) \n"
2321 "lea 0x10(%0),%0 \n"
2322 "jg 1b \n"
2323 : "+r"(src_argb), // %0
2324 "+r"(dst_argb), // %1
2325 "+r"(width) // %2
2326 : "m"(kShuffleAlpha0), // %3
2327 "m"(kShuffleAlpha1) // %4
2328 : "memory", "cc"
2329#if defined(__SSE2__)
2330 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2331#endif
2332 );
2333}
2334#endif // HAS_ARGBATTENUATE_SSSE3
2335
2336#ifdef HAS_ARGBUNATTENUATE_SSE2
2337// Divide source RGB by alpha and store to destination.
2338// b = (b * 255 + (a / 2)) / a;
2339// g = (g * 255 + (a / 2)) / a;
2340// r = (r * 255 + (a / 2)) / a;
2341// Reciprocal method is off by 1 on some values. ie 125
2342// 8.16 fixed point inverse table
2343#define T(a) 0x10000 / a
2344CONST uint32 fixed_invtbl8[256] = {
2345 0x100, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
2346 T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
2347 T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
2348 T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
2349 T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
2350 T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
2351 T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
2352 T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
2353 T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
2354 T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
2355 T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
2356 T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
2357 T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
2358 T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
2359 T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
2360 T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
2361 T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
2362 T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
2363 T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
2364 T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
2365 T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
2366 T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
2367 T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
2368 T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
2369 T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
2370 T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
2371 T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
2372 T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
2373 T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
2374 T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
2375 T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
2376 T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x100 };
2377#undef T
2378
2379// Unattenuate 4 pixels at a time.
2380// aligned to 16 bytes
2381void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2382 int width) {
2383 uintptr_t alpha = 0;
2384 asm volatile (
2385 "sub %0,%1 \n"
2386 "pcmpeqb %%xmm4,%%xmm4 \n"
2387 "pslld $0x18,%%xmm4 \n"
2388
2389 // 4 pixel loop
2390 "1: \n"
2391 "movdqa (%0),%%xmm0 \n"
2392 "movzb 0x3(%0),%3 \n"
2393 "punpcklbw %%xmm0,%%xmm0 \n"
2394 "movd 0x0(%4,%3,4),%%xmm2 \n"
2395 "movzb 0x7(%0),%3 \n"
2396 "movd 0x0(%4,%3,4),%%xmm3 \n"
2397 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2398 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2399 "movlhps %%xmm3,%%xmm2 \n"
2400 "pmulhuw %%xmm2,%%xmm0 \n"
2401 "movdqa (%0),%%xmm1 \n"
2402 "movzb 0xb(%0),%3 \n"
2403 "punpckhbw %%xmm1,%%xmm1 \n"
2404 "movd 0x0(%4,%3,4),%%xmm2 \n"
2405 "movzb 0xf(%0),%3 \n"
2406 "movd 0x0(%4,%3,4),%%xmm3 \n"
2407 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2408 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2409 "movlhps %%xmm3,%%xmm2 \n"
2410 "pmulhuw %%xmm2,%%xmm1 \n"
2411 "movdqa (%0),%%xmm2 \n"
2412 "pand %%xmm4,%%xmm2 \n"
2413 "packuswb %%xmm1,%%xmm0 \n"
2414 "por %%xmm2,%%xmm0 \n"
2415 "sub $0x4,%2 \n"
2416 "movdqa %%xmm0,(%0,%1,1) \n"
2417 "lea 0x10(%0),%0 \n"
2418 "jg 1b \n"
2419 : "+r"(src_argb), // %0
2420 "+r"(dst_argb), // %1
2421 "+r"(width), // %2
2422 "+r"(alpha) // %3
2423 : "r"(fixed_invtbl8) // %4
2424 : "memory", "cc"
2425#if defined(__SSE2__)
2426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2427#endif
2428 );
2429}
2430#endif // HAS_ARGBUNATTENUATE_SSE2
2431
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002432#endif // defined(__x86_64__) || defined(__i386__)
2433
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002434#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002435} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002436} // namespace libyuv
2437#endif