blob: 340dd651fd3c85f910e91ff4bb79b95236d2771d [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
fbarchard@google.com2b9c2102012-03-22 22:36:44 +000011#include "source/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// GCC 4.2 on OSX fails at link time when a static or const object is handed
// to inline assembly, so Apple builds define the tables without qualifiers.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#if defined(__APPLE__)
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000101// Shuffle table for converting ARGB to RGB24.
102CONST uvec8 kShuffleMaskARGBToRGB24 = {
103 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
104};
105
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000106// Shuffle table for converting ARGB to RAW.
107CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000108 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000109};
110
fbarchard@google.comffaea7e2012-05-18 19:43:59 +0000111// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
112CONST vec8 kARGBToGray = {
113 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
114};
115
fbarchard@google.comb6149762011-11-07 21:58:52 +0000116void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000117 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000118 "pcmpeqb %%xmm5,%%xmm5 \n"
119 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000120 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000121 "1: \n"
122 "movq (%0),%%xmm0 \n"
123 "lea 0x8(%0),%0 \n"
124 "punpcklbw %%xmm0,%%xmm0 \n"
125 "movdqa %%xmm0,%%xmm1 \n"
126 "punpcklwd %%xmm0,%%xmm0 \n"
127 "punpckhwd %%xmm1,%%xmm1 \n"
128 "por %%xmm5,%%xmm0 \n"
129 "por %%xmm5,%%xmm1 \n"
130 "movdqa %%xmm0,(%1) \n"
131 "movdqa %%xmm1,0x10(%1) \n"
132 "lea 0x20(%1),%1 \n"
133 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000134 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000135 : "+r"(src_y), // %0
136 "+r"(dst_argb), // %1
137 "+r"(pix) // %2
138 :
139 : "memory", "cc"
140#if defined(__SSE2__)
141 , "xmm0", "xmm1", "xmm5"
142#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000143 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000144}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000145
146void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000147 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000148 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000149 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000150 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000151 "1: \n"
152 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000153 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000154 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000155 "movdqa %%xmm0,(%0,%1,1) \n"
156 "lea 0x10(%0),%0 \n"
157 "jg 1b \n"
158
fbarchard@google.comb6149762011-11-07 21:58:52 +0000159 : "+r"(src_abgr), // %0
160 "+r"(dst_argb), // %1
161 "+r"(pix) // %2
162 : "m"(kShuffleMaskABGRToARGB) // %3
163 : "memory", "cc"
164#if defined(__SSE2__)
165 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000166#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000167 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000168}
169
170void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000171 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000172 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000173 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000174 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000175 "1: \n"
176 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000177 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000178 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000179 "movdqa %%xmm0,(%0,%1,1) \n"
180 "lea 0x10(%0),%0 \n"
181 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000182 : "+r"(src_bgra), // %0
183 "+r"(dst_argb), // %1
184 "+r"(pix) // %2
185 : "m"(kShuffleMaskBGRAToARGB) // %3
186 : "memory", "cc"
187#if defined(__SSE2__)
188 , "xmm0", "xmm5"
189#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000190 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000191}
192
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000193void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000194 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000195 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
196 "pslld $0x18,%%xmm5 \n"
197 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000198 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000199 "1: \n"
200 "movdqu (%0),%%xmm0 \n"
201 "movdqu 0x10(%0),%%xmm1 \n"
202 "movdqu 0x20(%0),%%xmm3 \n"
203 "lea 0x30(%0),%0 \n"
204 "movdqa %%xmm3,%%xmm2 \n"
205 "palignr $0x8,%%xmm1,%%xmm2 \n"
206 "pshufb %%xmm4,%%xmm2 \n"
207 "por %%xmm5,%%xmm2 \n"
208 "palignr $0xc,%%xmm0,%%xmm1 \n"
209 "pshufb %%xmm4,%%xmm0 \n"
210 "movdqa %%xmm2,0x20(%1) \n"
211 "por %%xmm5,%%xmm0 \n"
212 "pshufb %%xmm4,%%xmm1 \n"
213 "movdqa %%xmm0,(%1) \n"
214 "por %%xmm5,%%xmm1 \n"
215 "palignr $0x4,%%xmm3,%%xmm3 \n"
216 "pshufb %%xmm4,%%xmm3 \n"
217 "movdqa %%xmm1,0x10(%1) \n"
218 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000219 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000220 "movdqa %%xmm3,0x30(%1) \n"
221 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000222 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000223 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000224 "+r"(dst_argb), // %1
225 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000226 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000227 : "memory", "cc"
228#if defined(__SSE2__)
229 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
230#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000231 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000232}
233
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000234void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000235 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000236 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
237 "pslld $0x18,%%xmm5 \n"
238 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000239 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000240 "1: \n"
241 "movdqu (%0),%%xmm0 \n"
242 "movdqu 0x10(%0),%%xmm1 \n"
243 "movdqu 0x20(%0),%%xmm3 \n"
244 "lea 0x30(%0),%0 \n"
245 "movdqa %%xmm3,%%xmm2 \n"
246 "palignr $0x8,%%xmm1,%%xmm2 \n"
247 "pshufb %%xmm4,%%xmm2 \n"
248 "por %%xmm5,%%xmm2 \n"
249 "palignr $0xc,%%xmm0,%%xmm1 \n"
250 "pshufb %%xmm4,%%xmm0 \n"
251 "movdqa %%xmm2,0x20(%1) \n"
252 "por %%xmm5,%%xmm0 \n"
253 "pshufb %%xmm4,%%xmm1 \n"
254 "movdqa %%xmm0,(%1) \n"
255 "por %%xmm5,%%xmm1 \n"
256 "palignr $0x4,%%xmm3,%%xmm3 \n"
257 "pshufb %%xmm4,%%xmm3 \n"
258 "movdqa %%xmm1,0x10(%1) \n"
259 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000260 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000261 "movdqa %%xmm3,0x30(%1) \n"
262 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000263 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000264 : "+r"(src_raw), // %0
265 "+r"(dst_argb), // %1
266 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000267 : "m"(kShuffleMaskRAWToARGB) // %3
268 : "memory", "cc"
269#if defined(__SSE2__)
270 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
271#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000272 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000273}
274
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000275void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000276 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000277 "mov $0x1080108,%%eax \n"
278 "movd %%eax,%%xmm5 \n"
279 "pshufd $0x0,%%xmm5,%%xmm5 \n"
280 "mov $0x20082008,%%eax \n"
281 "movd %%eax,%%xmm6 \n"
282 "pshufd $0x0,%%xmm6,%%xmm6 \n"
283 "pcmpeqb %%xmm3,%%xmm3 \n"
284 "psllw $0xb,%%xmm3 \n"
285 "pcmpeqb %%xmm4,%%xmm4 \n"
286 "psllw $0xa,%%xmm4 \n"
287 "psrlw $0x5,%%xmm4 \n"
288 "pcmpeqb %%xmm7,%%xmm7 \n"
289 "psllw $0x8,%%xmm7 \n"
290 "sub %0,%1 \n"
291 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000292 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000293 "1: \n"
294 "movdqu (%0),%%xmm0 \n"
295 "movdqa %%xmm0,%%xmm1 \n"
296 "movdqa %%xmm0,%%xmm2 \n"
297 "pand %%xmm3,%%xmm1 \n"
298 "psllw $0xb,%%xmm2 \n"
299 "pmulhuw %%xmm5,%%xmm1 \n"
300 "pmulhuw %%xmm5,%%xmm2 \n"
301 "psllw $0x8,%%xmm1 \n"
302 "por %%xmm2,%%xmm1 \n"
303 "pand %%xmm4,%%xmm0 \n"
304 "pmulhuw %%xmm6,%%xmm0 \n"
305 "por %%xmm7,%%xmm0 \n"
306 "movdqa %%xmm1,%%xmm2 \n"
307 "punpcklbw %%xmm0,%%xmm1 \n"
308 "punpckhbw %%xmm0,%%xmm2 \n"
309 "movdqa %%xmm1,(%1,%0,2) \n"
310 "movdqa %%xmm2,0x10(%1,%0,2) \n"
311 "lea 0x10(%0),%0 \n"
312 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000313 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000314 : "+r"(src), // %0
315 "+r"(dst), // %1
316 "+r"(pix) // %2
317 :
318 : "memory", "cc", "eax"
319#if defined(__SSE2__)
320 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
321#endif
322 );
323}
324
325void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000326 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000327 "mov $0x1080108,%%eax \n"
328 "movd %%eax,%%xmm5 \n"
329 "pshufd $0x0,%%xmm5,%%xmm5 \n"
330 "mov $0x42004200,%%eax \n"
331 "movd %%eax,%%xmm6 \n"
332 "pshufd $0x0,%%xmm6,%%xmm6 \n"
333 "pcmpeqb %%xmm3,%%xmm3 \n"
334 "psllw $0xb,%%xmm3 \n"
335 "movdqa %%xmm3,%%xmm4 \n"
336 "psrlw $0x6,%%xmm4 \n"
337 "pcmpeqb %%xmm7,%%xmm7 \n"
338 "psllw $0x8,%%xmm7 \n"
339 "sub %0,%1 \n"
340 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000341 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000342 "1: \n"
343 "movdqu (%0),%%xmm0 \n"
344 "movdqa %%xmm0,%%xmm1 \n"
345 "movdqa %%xmm0,%%xmm2 \n"
346 "psllw $0x1,%%xmm1 \n"
347 "psllw $0xb,%%xmm2 \n"
348 "pand %%xmm3,%%xmm1 \n"
349 "pmulhuw %%xmm5,%%xmm2 \n"
350 "pmulhuw %%xmm5,%%xmm1 \n"
351 "psllw $0x8,%%xmm1 \n"
352 "por %%xmm2,%%xmm1 \n"
353 "movdqa %%xmm0,%%xmm2 \n"
354 "pand %%xmm4,%%xmm0 \n"
355 "psraw $0x8,%%xmm2 \n"
356 "pmulhuw %%xmm6,%%xmm0 \n"
357 "pand %%xmm7,%%xmm2 \n"
358 "por %%xmm2,%%xmm0 \n"
359 "movdqa %%xmm1,%%xmm2 \n"
360 "punpcklbw %%xmm0,%%xmm1 \n"
361 "punpckhbw %%xmm0,%%xmm2 \n"
362 "movdqa %%xmm1,(%1,%0,2) \n"
363 "movdqa %%xmm2,0x10(%1,%0,2) \n"
364 "lea 0x10(%0),%0 \n"
365 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000366 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000367 : "+r"(src), // %0
368 "+r"(dst), // %1
369 "+r"(pix) // %2
370 :
371 : "memory", "cc", "eax"
372#if defined(__SSE2__)
373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
374#endif
375 );
376}
377
378void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000379 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000380 "mov $0xf0f0f0f,%%eax \n"
381 "movd %%eax,%%xmm4 \n"
382 "pshufd $0x0,%%xmm4,%%xmm4 \n"
383 "movdqa %%xmm4,%%xmm5 \n"
384 "pslld $0x4,%%xmm5 \n"
385 "sub %0,%1 \n"
386 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000387 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000388 "1: \n"
389 "movdqu (%0),%%xmm0 \n"
390 "movdqa %%xmm0,%%xmm2 \n"
391 "pand %%xmm4,%%xmm0 \n"
392 "pand %%xmm5,%%xmm2 \n"
393 "movdqa %%xmm0,%%xmm1 \n"
394 "movdqa %%xmm2,%%xmm3 \n"
395 "psllw $0x4,%%xmm1 \n"
396 "psrlw $0x4,%%xmm3 \n"
397 "por %%xmm1,%%xmm0 \n"
398 "por %%xmm3,%%xmm2 \n"
399 "movdqa %%xmm0,%%xmm1 \n"
400 "punpcklbw %%xmm2,%%xmm0 \n"
401 "punpckhbw %%xmm2,%%xmm1 \n"
402 "movdqa %%xmm0,(%1,%0,2) \n"
403 "movdqa %%xmm1,0x10(%1,%0,2) \n"
404 "lea 0x10(%0),%0 \n"
405 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000406 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000407 : "+r"(src), // %0
408 "+r"(dst), // %1
409 "+r"(pix) // %2
410 :
411 : "memory", "cc", "eax"
412#if defined(__SSE2__)
413 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
414#endif
415 );
416}
417
418void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000419 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000420 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000421 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000422 "1: \n"
423 "movdqa (%0),%%xmm0 \n"
424 "movdqa 0x10(%0),%%xmm1 \n"
425 "movdqa 0x20(%0),%%xmm2 \n"
426 "movdqa 0x30(%0),%%xmm3 \n"
427 "lea 0x40(%0),%0 \n"
428 "pshufb %%xmm6,%%xmm0 \n"
429 "pshufb %%xmm6,%%xmm1 \n"
430 "pshufb %%xmm6,%%xmm2 \n"
431 "pshufb %%xmm6,%%xmm3 \n"
432 "movdqa %%xmm1,%%xmm4 \n"
433 "psrldq $0x4,%%xmm1 \n"
434 "pslldq $0xc,%%xmm4 \n"
435 "movdqa %%xmm2,%%xmm5 \n"
436 "por %%xmm4,%%xmm0 \n"
437 "pslldq $0x8,%%xmm5 \n"
438 "movdqa %%xmm0,(%1) \n"
439 "por %%xmm5,%%xmm1 \n"
440 "psrldq $0x8,%%xmm2 \n"
441 "pslldq $0x4,%%xmm3 \n"
442 "por %%xmm3,%%xmm2 \n"
443 "movdqa %%xmm1,0x10(%1) \n"
444 "movdqa %%xmm2,0x20(%1) \n"
445 "lea 0x30(%1),%1 \n"
446 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000447 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000448 : "+r"(src), // %0
449 "+r"(dst), // %1
450 "+r"(pix) // %2
451 : "m"(kShuffleMaskARGBToRGB24) // %3
452 : "memory", "cc"
453#if defined(__SSE2__)
454 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
455#endif
456 );
457}
458
459void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000460 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000461 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000462 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000463 "1: \n"
464 "movdqa (%0),%%xmm0 \n"
465 "movdqa 0x10(%0),%%xmm1 \n"
466 "movdqa 0x20(%0),%%xmm2 \n"
467 "movdqa 0x30(%0),%%xmm3 \n"
468 "lea 0x40(%0),%0 \n"
469 "pshufb %%xmm6,%%xmm0 \n"
470 "pshufb %%xmm6,%%xmm1 \n"
471 "pshufb %%xmm6,%%xmm2 \n"
472 "pshufb %%xmm6,%%xmm3 \n"
473 "movdqa %%xmm1,%%xmm4 \n"
474 "psrldq $0x4,%%xmm1 \n"
475 "pslldq $0xc,%%xmm4 \n"
476 "movdqa %%xmm2,%%xmm5 \n"
477 "por %%xmm4,%%xmm0 \n"
478 "pslldq $0x8,%%xmm5 \n"
479 "movdqa %%xmm0,(%1) \n"
480 "por %%xmm5,%%xmm1 \n"
481 "psrldq $0x8,%%xmm2 \n"
482 "pslldq $0x4,%%xmm3 \n"
483 "por %%xmm3,%%xmm2 \n"
484 "movdqa %%xmm1,0x10(%1) \n"
485 "movdqa %%xmm2,0x20(%1) \n"
486 "lea 0x30(%1),%1 \n"
487 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000488 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000489 : "+r"(src), // %0
490 "+r"(dst), // %1
491 "+r"(pix) // %2
492 : "m"(kShuffleMaskARGBToRAW) // %3
493 : "memory", "cc"
494#if defined(__SSE2__)
495 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
496#endif
497 );
498}
499
500void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000501 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000502 "pcmpeqb %%xmm3,%%xmm3 \n"
503 "psrld $0x1b,%%xmm3 \n"
504 "pcmpeqb %%xmm4,%%xmm4 \n"
505 "psrld $0x1a,%%xmm4 \n"
506 "pslld $0x5,%%xmm4 \n"
507 "pcmpeqb %%xmm5,%%xmm5 \n"
508 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000509 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000510 "1: \n"
511 "movdqa (%0),%%xmm0 \n"
512 "movdqa %%xmm0,%%xmm1 \n"
513 "movdqa %%xmm0,%%xmm2 \n"
514 "pslld $0x8,%%xmm0 \n"
515 "psrld $0x3,%%xmm1 \n"
516 "psrld $0x5,%%xmm2 \n"
517 "psrad $0x10,%%xmm0 \n"
518 "pand %%xmm3,%%xmm1 \n"
519 "pand %%xmm4,%%xmm2 \n"
520 "pand %%xmm5,%%xmm0 \n"
521 "por %%xmm2,%%xmm1 \n"
522 "por %%xmm1,%%xmm0 \n"
523 "packssdw %%xmm0,%%xmm0 \n"
524 "lea 0x10(%0),%0 \n"
525 "movq %%xmm0,(%1) \n"
526 "lea 0x8(%1),%1 \n"
527 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000528 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000529 : "+r"(src), // %0
530 "+r"(dst), // %1
531 "+r"(pix) // %2
532 :
533 : "memory", "cc"
534#if defined(__SSE2__)
535 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
536#endif
537 );
538}
539
540void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000541 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000542 "pcmpeqb %%xmm4,%%xmm4 \n"
543 "psrld $0x1b,%%xmm4 \n"
544 "movdqa %%xmm4,%%xmm5 \n"
545 "pslld $0x5,%%xmm5 \n"
546 "movdqa %%xmm4,%%xmm6 \n"
547 "pslld $0xa,%%xmm6 \n"
548 "pcmpeqb %%xmm7,%%xmm7 \n"
549 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000550 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000551 "1: \n"
552 "movdqa (%0),%%xmm0 \n"
553 "movdqa %%xmm0,%%xmm1 \n"
554 "movdqa %%xmm0,%%xmm2 \n"
555 "movdqa %%xmm0,%%xmm3 \n"
556 "psrad $0x10,%%xmm0 \n"
557 "psrld $0x3,%%xmm1 \n"
558 "psrld $0x6,%%xmm2 \n"
559 "psrld $0x9,%%xmm3 \n"
560 "pand %%xmm7,%%xmm0 \n"
561 "pand %%xmm4,%%xmm1 \n"
562 "pand %%xmm5,%%xmm2 \n"
563 "pand %%xmm6,%%xmm3 \n"
564 "por %%xmm1,%%xmm0 \n"
565 "por %%xmm3,%%xmm2 \n"
566 "por %%xmm2,%%xmm0 \n"
567 "packssdw %%xmm0,%%xmm0 \n"
568 "lea 0x10(%0),%0 \n"
569 "movq %%xmm0,(%1) \n"
570 "lea 0x8(%1),%1 \n"
571 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000572 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000573 : "+r"(src), // %0
574 "+r"(dst), // %1
575 "+r"(pix) // %2
576 :
577 : "memory", "cc"
578#if defined(__SSE2__)
579 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
580#endif
581 );
582}
583
584void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000585 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000586 "pcmpeqb %%xmm4,%%xmm4 \n"
587 "psllw $0xc,%%xmm4 \n"
588 "movdqa %%xmm4,%%xmm3 \n"
589 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000590 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000591 "1: \n"
592 "movdqa (%0),%%xmm0 \n"
593 "movdqa %%xmm0,%%xmm1 \n"
594 "pand %%xmm3,%%xmm0 \n"
595 "pand %%xmm4,%%xmm1 \n"
596 "psrlq $0x4,%%xmm0 \n"
597 "psrlq $0x8,%%xmm1 \n"
598 "por %%xmm1,%%xmm0 \n"
599 "packuswb %%xmm0,%%xmm0 \n"
600 "lea 0x10(%0),%0 \n"
601 "movq %%xmm0,(%1) \n"
602 "lea 0x8(%1),%1 \n"
603 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000604 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000605 : "+r"(src), // %0
606 "+r"(dst), // %1
607 "+r"(pix) // %2
608 :
609 : "memory", "cc"
610#if defined(__SSE2__)
611 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
612#endif
613 );
614}
615
fbarchard@google.comb6149762011-11-07 21:58:52 +0000616void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000617 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000618 "movdqa %4,%%xmm5 \n"
619 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000620 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000621 "1: \n"
622 "movdqa (%0),%%xmm0 \n"
623 "movdqa 0x10(%0),%%xmm1 \n"
624 "movdqa 0x20(%0),%%xmm2 \n"
625 "movdqa 0x30(%0),%%xmm3 \n"
626 "pmaddubsw %%xmm4,%%xmm0 \n"
627 "pmaddubsw %%xmm4,%%xmm1 \n"
628 "pmaddubsw %%xmm4,%%xmm2 \n"
629 "pmaddubsw %%xmm4,%%xmm3 \n"
630 "lea 0x40(%0),%0 \n"
631 "phaddw %%xmm1,%%xmm0 \n"
632 "phaddw %%xmm3,%%xmm2 \n"
633 "psrlw $0x7,%%xmm0 \n"
634 "psrlw $0x7,%%xmm2 \n"
635 "packuswb %%xmm2,%%xmm0 \n"
636 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000637 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000638 "movdqa %%xmm0,(%1) \n"
639 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000640 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000641 : "+r"(src_argb), // %0
642 "+r"(dst_y), // %1
643 "+r"(pix) // %2
644 : "m"(kARGBToY), // %3
645 "m"(kAddY16) // %4
646 : "memory", "cc"
647#if defined(__SSE2__)
648 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
649#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000650 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000651}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000652
653void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000654 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000655 "movdqa %4,%%xmm5 \n"
656 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000657 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000658 "1: \n"
659 "movdqu (%0),%%xmm0 \n"
660 "movdqu 0x10(%0),%%xmm1 \n"
661 "movdqu 0x20(%0),%%xmm2 \n"
662 "movdqu 0x30(%0),%%xmm3 \n"
663 "pmaddubsw %%xmm4,%%xmm0 \n"
664 "pmaddubsw %%xmm4,%%xmm1 \n"
665 "pmaddubsw %%xmm4,%%xmm2 \n"
666 "pmaddubsw %%xmm4,%%xmm3 \n"
667 "lea 0x40(%0),%0 \n"
668 "phaddw %%xmm1,%%xmm0 \n"
669 "phaddw %%xmm3,%%xmm2 \n"
670 "psrlw $0x7,%%xmm0 \n"
671 "psrlw $0x7,%%xmm2 \n"
672 "packuswb %%xmm2,%%xmm0 \n"
673 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000674 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000675 "movdqu %%xmm0,(%1) \n"
676 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000677 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000678 : "+r"(src_argb), // %0
679 "+r"(dst_y), // %1
680 "+r"(pix) // %2
681 : "m"(kARGBToY), // %3
682 "m"(kAddY16) // %4
683 : "memory", "cc"
684#if defined(__SSE2__)
685 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
686#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000687 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000688}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000689
fbarchard@google.com714050a2012-02-17 22:59:56 +0000690// TODO(fbarchard): pass xmm constants to single block of assembly.
691// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
692// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
693// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
694// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000695void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
696 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000697 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000698 "movdqa %0,%%xmm4 \n"
699 "movdqa %1,%%xmm3 \n"
700 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000701 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000702 : "m"(kARGBToU), // %0
703 "m"(kARGBToV), // %1
704 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000705 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000706 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000707 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000708 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000709 "1: \n"
710 "movdqa (%0),%%xmm0 \n"
711 "movdqa 0x10(%0),%%xmm1 \n"
712 "movdqa 0x20(%0),%%xmm2 \n"
713 "movdqa 0x30(%0),%%xmm6 \n"
714 "pavgb (%0,%4,1),%%xmm0 \n"
715 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
716 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
717 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
718 "lea 0x40(%0),%0 \n"
719 "movdqa %%xmm0,%%xmm7 \n"
720 "shufps $0x88,%%xmm1,%%xmm0 \n"
721 "shufps $0xdd,%%xmm1,%%xmm7 \n"
722 "pavgb %%xmm7,%%xmm0 \n"
723 "movdqa %%xmm2,%%xmm7 \n"
724 "shufps $0x88,%%xmm6,%%xmm2 \n"
725 "shufps $0xdd,%%xmm6,%%xmm7 \n"
726 "pavgb %%xmm7,%%xmm2 \n"
727 "movdqa %%xmm0,%%xmm1 \n"
728 "movdqa %%xmm2,%%xmm6 \n"
729 "pmaddubsw %%xmm4,%%xmm0 \n"
730 "pmaddubsw %%xmm4,%%xmm2 \n"
731 "pmaddubsw %%xmm3,%%xmm1 \n"
732 "pmaddubsw %%xmm3,%%xmm6 \n"
733 "phaddw %%xmm2,%%xmm0 \n"
734 "phaddw %%xmm6,%%xmm1 \n"
735 "psraw $0x8,%%xmm0 \n"
736 "psraw $0x8,%%xmm1 \n"
737 "packsswb %%xmm1,%%xmm0 \n"
738 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000739 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000740 "movlps %%xmm0,(%1) \n"
741 "movhps %%xmm0,(%1,%2,1) \n"
742 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000743 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000744 : "+r"(src_argb0), // %0
745 "+r"(dst_u), // %1
746 "+r"(dst_v), // %2
747 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000748 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000749 : "memory", "cc"
750#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000751 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000752#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000753 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000754}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000755
756void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
757 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000758 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000759 "movdqa %0,%%xmm4 \n"
760 "movdqa %1,%%xmm3 \n"
761 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000762 :
763 : "m"(kARGBToU), // %0
764 "m"(kARGBToV), // %1
765 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000766 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000767 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000768 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000769 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000770 "1: \n"
771 "movdqu (%0),%%xmm0 \n"
772 "movdqu 0x10(%0),%%xmm1 \n"
773 "movdqu 0x20(%0),%%xmm2 \n"
774 "movdqu 0x30(%0),%%xmm6 \n"
775 "movdqu (%0,%4,1),%%xmm7 \n"
776 "pavgb %%xmm7,%%xmm0 \n"
777 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
778 "pavgb %%xmm7,%%xmm1 \n"
779 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
780 "pavgb %%xmm7,%%xmm2 \n"
781 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
782 "pavgb %%xmm7,%%xmm6 \n"
783 "lea 0x40(%0),%0 \n"
784 "movdqa %%xmm0,%%xmm7 \n"
785 "shufps $0x88,%%xmm1,%%xmm0 \n"
786 "shufps $0xdd,%%xmm1,%%xmm7 \n"
787 "pavgb %%xmm7,%%xmm0 \n"
788 "movdqa %%xmm2,%%xmm7 \n"
789 "shufps $0x88,%%xmm6,%%xmm2 \n"
790 "shufps $0xdd,%%xmm6,%%xmm7 \n"
791 "pavgb %%xmm7,%%xmm2 \n"
792 "movdqa %%xmm0,%%xmm1 \n"
793 "movdqa %%xmm2,%%xmm6 \n"
794 "pmaddubsw %%xmm4,%%xmm0 \n"
795 "pmaddubsw %%xmm4,%%xmm2 \n"
796 "pmaddubsw %%xmm3,%%xmm1 \n"
797 "pmaddubsw %%xmm3,%%xmm6 \n"
798 "phaddw %%xmm2,%%xmm0 \n"
799 "phaddw %%xmm6,%%xmm1 \n"
800 "psraw $0x8,%%xmm0 \n"
801 "psraw $0x8,%%xmm1 \n"
802 "packsswb %%xmm1,%%xmm0 \n"
803 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000804 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000805 "movlps %%xmm0,(%1) \n"
806 "movhps %%xmm0,(%1,%2,1) \n"
807 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000808 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000809 : "+r"(src_argb0), // %0
810 "+r"(dst_u), // %1
811 "+r"(dst_v), // %2
812 "+rm"(width) // %3
813 : "r"(static_cast<intptr_t>(src_stride_argb))
814 : "memory", "cc"
815#if defined(__SSE2__)
816 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
817#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000818 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000819}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000820
fbarchard@google.com714050a2012-02-17 22:59:56 +0000821void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000822 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000823 "movdqa %4,%%xmm5 \n"
824 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000825 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000826 "1: \n"
827 "movdqa (%0),%%xmm0 \n"
828 "movdqa 0x10(%0),%%xmm1 \n"
829 "movdqa 0x20(%0),%%xmm2 \n"
830 "movdqa 0x30(%0),%%xmm3 \n"
831 "pmaddubsw %%xmm4,%%xmm0 \n"
832 "pmaddubsw %%xmm4,%%xmm1 \n"
833 "pmaddubsw %%xmm4,%%xmm2 \n"
834 "pmaddubsw %%xmm4,%%xmm3 \n"
835 "lea 0x40(%0),%0 \n"
836 "phaddw %%xmm1,%%xmm0 \n"
837 "phaddw %%xmm3,%%xmm2 \n"
838 "psrlw $0x7,%%xmm0 \n"
839 "psrlw $0x7,%%xmm2 \n"
840 "packuswb %%xmm2,%%xmm0 \n"
841 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000842 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000843 "movdqa %%xmm0,(%1) \n"
844 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000845 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000846 : "+r"(src_bgra), // %0
847 "+r"(dst_y), // %1
848 "+r"(pix) // %2
849 : "m"(kBGRAToY), // %3
850 "m"(kAddY16) // %4
851 : "memory", "cc"
852#if defined(__SSE2__)
853 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000854#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000855 );
856}
857
858void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000859 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000860 "movdqa %4,%%xmm5 \n"
861 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000862 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000863 "1: \n"
864 "movdqu (%0),%%xmm0 \n"
865 "movdqu 0x10(%0),%%xmm1 \n"
866 "movdqu 0x20(%0),%%xmm2 \n"
867 "movdqu 0x30(%0),%%xmm3 \n"
868 "pmaddubsw %%xmm4,%%xmm0 \n"
869 "pmaddubsw %%xmm4,%%xmm1 \n"
870 "pmaddubsw %%xmm4,%%xmm2 \n"
871 "pmaddubsw %%xmm4,%%xmm3 \n"
872 "lea 0x40(%0),%0 \n"
873 "phaddw %%xmm1,%%xmm0 \n"
874 "phaddw %%xmm3,%%xmm2 \n"
875 "psrlw $0x7,%%xmm0 \n"
876 "psrlw $0x7,%%xmm2 \n"
877 "packuswb %%xmm2,%%xmm0 \n"
878 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000879 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000880 "movdqu %%xmm0,(%1) \n"
881 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000882 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000883 : "+r"(src_bgra), // %0
884 "+r"(dst_y), // %1
885 "+r"(pix) // %2
886 : "m"(kBGRAToY), // %3
887 "m"(kAddY16) // %4
888 : "memory", "cc"
889#if defined(__SSE2__)
890 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
891#endif
892 );
893}
894
895void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
896 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000897 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000898 "movdqa %0,%%xmm4 \n"
899 "movdqa %1,%%xmm3 \n"
900 "movdqa %2,%%xmm5 \n"
901 :
902 : "m"(kBGRAToU), // %0
903 "m"(kBGRAToV), // %1
904 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000905 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000906 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000907 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000908 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000909 "1: \n"
910 "movdqa (%0),%%xmm0 \n"
911 "movdqa 0x10(%0),%%xmm1 \n"
912 "movdqa 0x20(%0),%%xmm2 \n"
913 "movdqa 0x30(%0),%%xmm6 \n"
914 "pavgb (%0,%4,1),%%xmm0 \n"
915 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
916 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
917 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
918 "lea 0x40(%0),%0 \n"
919 "movdqa %%xmm0,%%xmm7 \n"
920 "shufps $0x88,%%xmm1,%%xmm0 \n"
921 "shufps $0xdd,%%xmm1,%%xmm7 \n"
922 "pavgb %%xmm7,%%xmm0 \n"
923 "movdqa %%xmm2,%%xmm7 \n"
924 "shufps $0x88,%%xmm6,%%xmm2 \n"
925 "shufps $0xdd,%%xmm6,%%xmm7 \n"
926 "pavgb %%xmm7,%%xmm2 \n"
927 "movdqa %%xmm0,%%xmm1 \n"
928 "movdqa %%xmm2,%%xmm6 \n"
929 "pmaddubsw %%xmm4,%%xmm0 \n"
930 "pmaddubsw %%xmm4,%%xmm2 \n"
931 "pmaddubsw %%xmm3,%%xmm1 \n"
932 "pmaddubsw %%xmm3,%%xmm6 \n"
933 "phaddw %%xmm2,%%xmm0 \n"
934 "phaddw %%xmm6,%%xmm1 \n"
935 "psraw $0x8,%%xmm0 \n"
936 "psraw $0x8,%%xmm1 \n"
937 "packsswb %%xmm1,%%xmm0 \n"
938 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000939 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000940 "movlps %%xmm0,(%1) \n"
941 "movhps %%xmm0,(%1,%2,1) \n"
942 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000943 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000944 : "+r"(src_bgra0), // %0
945 "+r"(dst_u), // %1
946 "+r"(dst_v), // %2
947 "+rm"(width) // %3
948 : "r"(static_cast<intptr_t>(src_stride_bgra))
949 : "memory", "cc"
950#if defined(__SSE2__)
951 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
952#endif
953 );
954}
955
956void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
957 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000958 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000959 "movdqa %0,%%xmm4 \n"
960 "movdqa %1,%%xmm3 \n"
961 "movdqa %2,%%xmm5 \n"
962 :
963 : "m"(kBGRAToU), // %0
964 "m"(kBGRAToV), // %1
965 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000966 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000967 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000968 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000969 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000970 "1: \n"
971 "movdqu (%0),%%xmm0 \n"
972 "movdqu 0x10(%0),%%xmm1 \n"
973 "movdqu 0x20(%0),%%xmm2 \n"
974 "movdqu 0x30(%0),%%xmm6 \n"
975 "movdqu (%0,%4,1),%%xmm7 \n"
976 "pavgb %%xmm7,%%xmm0 \n"
977 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
978 "pavgb %%xmm7,%%xmm1 \n"
979 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
980 "pavgb %%xmm7,%%xmm2 \n"
981 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
982 "pavgb %%xmm7,%%xmm6 \n"
983 "lea 0x40(%0),%0 \n"
984 "movdqa %%xmm0,%%xmm7 \n"
985 "shufps $0x88,%%xmm1,%%xmm0 \n"
986 "shufps $0xdd,%%xmm1,%%xmm7 \n"
987 "pavgb %%xmm7,%%xmm0 \n"
988 "movdqa %%xmm2,%%xmm7 \n"
989 "shufps $0x88,%%xmm6,%%xmm2 \n"
990 "shufps $0xdd,%%xmm6,%%xmm7 \n"
991 "pavgb %%xmm7,%%xmm2 \n"
992 "movdqa %%xmm0,%%xmm1 \n"
993 "movdqa %%xmm2,%%xmm6 \n"
994 "pmaddubsw %%xmm4,%%xmm0 \n"
995 "pmaddubsw %%xmm4,%%xmm2 \n"
996 "pmaddubsw %%xmm3,%%xmm1 \n"
997 "pmaddubsw %%xmm3,%%xmm6 \n"
998 "phaddw %%xmm2,%%xmm0 \n"
999 "phaddw %%xmm6,%%xmm1 \n"
1000 "psraw $0x8,%%xmm0 \n"
1001 "psraw $0x8,%%xmm1 \n"
1002 "packsswb %%xmm1,%%xmm0 \n"
1003 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001004 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001005 "movlps %%xmm0,(%1) \n"
1006 "movhps %%xmm0,(%1,%2,1) \n"
1007 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001008 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001009 : "+r"(src_bgra0), // %0
1010 "+r"(dst_u), // %1
1011 "+r"(dst_v), // %2
1012 "+rm"(width) // %3
1013 : "r"(static_cast<intptr_t>(src_stride_bgra))
1014 : "memory", "cc"
1015#if defined(__SSE2__)
1016 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1017#endif
1018 );
1019}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001020
1021void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001022 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001023 "movdqa %4,%%xmm5 \n"
1024 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001025 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001026 "1: \n"
1027 "movdqa (%0),%%xmm0 \n"
1028 "movdqa 0x10(%0),%%xmm1 \n"
1029 "movdqa 0x20(%0),%%xmm2 \n"
1030 "movdqa 0x30(%0),%%xmm3 \n"
1031 "pmaddubsw %%xmm4,%%xmm0 \n"
1032 "pmaddubsw %%xmm4,%%xmm1 \n"
1033 "pmaddubsw %%xmm4,%%xmm2 \n"
1034 "pmaddubsw %%xmm4,%%xmm3 \n"
1035 "lea 0x40(%0),%0 \n"
1036 "phaddw %%xmm1,%%xmm0 \n"
1037 "phaddw %%xmm3,%%xmm2 \n"
1038 "psrlw $0x7,%%xmm0 \n"
1039 "psrlw $0x7,%%xmm2 \n"
1040 "packuswb %%xmm2,%%xmm0 \n"
1041 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001042 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001043 "movdqa %%xmm0,(%1) \n"
1044 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001045 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001046 : "+r"(src_abgr), // %0
1047 "+r"(dst_y), // %1
1048 "+r"(pix) // %2
1049 : "m"(kABGRToY), // %3
1050 "m"(kAddY16) // %4
1051 : "memory", "cc"
1052#if defined(__SSE2__)
1053 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1054#endif
1055 );
1056}
1057
1058void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001059 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001060 "movdqa %4,%%xmm5 \n"
1061 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001062 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001063 "1: \n"
1064 "movdqu (%0),%%xmm0 \n"
1065 "movdqu 0x10(%0),%%xmm1 \n"
1066 "movdqu 0x20(%0),%%xmm2 \n"
1067 "movdqu 0x30(%0),%%xmm3 \n"
1068 "pmaddubsw %%xmm4,%%xmm0 \n"
1069 "pmaddubsw %%xmm4,%%xmm1 \n"
1070 "pmaddubsw %%xmm4,%%xmm2 \n"
1071 "pmaddubsw %%xmm4,%%xmm3 \n"
1072 "lea 0x40(%0),%0 \n"
1073 "phaddw %%xmm1,%%xmm0 \n"
1074 "phaddw %%xmm3,%%xmm2 \n"
1075 "psrlw $0x7,%%xmm0 \n"
1076 "psrlw $0x7,%%xmm2 \n"
1077 "packuswb %%xmm2,%%xmm0 \n"
1078 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001079 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001080 "movdqu %%xmm0,(%1) \n"
1081 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001082 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001083 : "+r"(src_abgr), // %0
1084 "+r"(dst_y), // %1
1085 "+r"(pix) // %2
1086 : "m"(kABGRToY), // %3
1087 "m"(kAddY16) // %4
1088 : "memory", "cc"
1089#if defined(__SSE2__)
1090 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1091#endif
1092 );
1093}
1094
1095void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1096 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001097 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001098 "movdqa %0,%%xmm4 \n"
1099 "movdqa %1,%%xmm3 \n"
1100 "movdqa %2,%%xmm5 \n"
1101 :
1102 : "m"(kABGRToU), // %0
1103 "m"(kABGRToV), // %1
1104 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001105 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001106 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001107 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001108 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001109 "1: \n"
1110 "movdqa (%0),%%xmm0 \n"
1111 "movdqa 0x10(%0),%%xmm1 \n"
1112 "movdqa 0x20(%0),%%xmm2 \n"
1113 "movdqa 0x30(%0),%%xmm6 \n"
1114 "pavgb (%0,%4,1),%%xmm0 \n"
1115 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1116 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1117 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1118 "lea 0x40(%0),%0 \n"
1119 "movdqa %%xmm0,%%xmm7 \n"
1120 "shufps $0x88,%%xmm1,%%xmm0 \n"
1121 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1122 "pavgb %%xmm7,%%xmm0 \n"
1123 "movdqa %%xmm2,%%xmm7 \n"
1124 "shufps $0x88,%%xmm6,%%xmm2 \n"
1125 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1126 "pavgb %%xmm7,%%xmm2 \n"
1127 "movdqa %%xmm0,%%xmm1 \n"
1128 "movdqa %%xmm2,%%xmm6 \n"
1129 "pmaddubsw %%xmm4,%%xmm0 \n"
1130 "pmaddubsw %%xmm4,%%xmm2 \n"
1131 "pmaddubsw %%xmm3,%%xmm1 \n"
1132 "pmaddubsw %%xmm3,%%xmm6 \n"
1133 "phaddw %%xmm2,%%xmm0 \n"
1134 "phaddw %%xmm6,%%xmm1 \n"
1135 "psraw $0x8,%%xmm0 \n"
1136 "psraw $0x8,%%xmm1 \n"
1137 "packsswb %%xmm1,%%xmm0 \n"
1138 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001139 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001140 "movlps %%xmm0,(%1) \n"
1141 "movhps %%xmm0,(%1,%2,1) \n"
1142 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001143 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001144 : "+r"(src_abgr0), // %0
1145 "+r"(dst_u), // %1
1146 "+r"(dst_v), // %2
1147 "+rm"(width) // %3
1148 : "r"(static_cast<intptr_t>(src_stride_abgr))
1149 : "memory", "cc"
1150#if defined(__SSE2__)
1151 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1152#endif
1153 );
1154}
1155
1156void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1157 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001158 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001159 "movdqa %0,%%xmm4 \n"
1160 "movdqa %1,%%xmm3 \n"
1161 "movdqa %2,%%xmm5 \n"
1162 :
1163 : "m"(kABGRToU), // %0
1164 "m"(kABGRToV), // %1
1165 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001166 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001167 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001168 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001169 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001170 "1: \n"
1171 "movdqu (%0),%%xmm0 \n"
1172 "movdqu 0x10(%0),%%xmm1 \n"
1173 "movdqu 0x20(%0),%%xmm2 \n"
1174 "movdqu 0x30(%0),%%xmm6 \n"
1175 "movdqu (%0,%4,1),%%xmm7 \n"
1176 "pavgb %%xmm7,%%xmm0 \n"
1177 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1178 "pavgb %%xmm7,%%xmm1 \n"
1179 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1180 "pavgb %%xmm7,%%xmm2 \n"
1181 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1182 "pavgb %%xmm7,%%xmm6 \n"
1183 "lea 0x40(%0),%0 \n"
1184 "movdqa %%xmm0,%%xmm7 \n"
1185 "shufps $0x88,%%xmm1,%%xmm0 \n"
1186 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1187 "pavgb %%xmm7,%%xmm0 \n"
1188 "movdqa %%xmm2,%%xmm7 \n"
1189 "shufps $0x88,%%xmm6,%%xmm2 \n"
1190 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1191 "pavgb %%xmm7,%%xmm2 \n"
1192 "movdqa %%xmm0,%%xmm1 \n"
1193 "movdqa %%xmm2,%%xmm6 \n"
1194 "pmaddubsw %%xmm4,%%xmm0 \n"
1195 "pmaddubsw %%xmm4,%%xmm2 \n"
1196 "pmaddubsw %%xmm3,%%xmm1 \n"
1197 "pmaddubsw %%xmm3,%%xmm6 \n"
1198 "phaddw %%xmm2,%%xmm0 \n"
1199 "phaddw %%xmm6,%%xmm1 \n"
1200 "psraw $0x8,%%xmm0 \n"
1201 "psraw $0x8,%%xmm1 \n"
1202 "packsswb %%xmm1,%%xmm0 \n"
1203 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001204 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001205 "movlps %%xmm0,(%1) \n"
1206 "movhps %%xmm0,(%1,%2,1) \n"
1207 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001208 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001209 : "+r"(src_abgr0), // %0
1210 "+r"(dst_u), // %1
1211 "+r"(dst_v), // %2
1212 "+rm"(width) // %3
1213 : "r"(static_cast<intptr_t>(src_stride_abgr))
1214 : "memory", "cc"
1215#if defined(__SSE2__)
1216 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1217#endif
1218 );
1219}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001220
1221#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001222
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001223#ifdef HAS_I420TOARGBROW_SSSE3
// YUV to RGB coefficients in 6-bit fixed point (scaled by 64).
// Every replacement is parenthesized so the macros stay correct when they are
// used inside a larger expression.

// U contributions to B, G and R.
#define UB 127 /* 2.018 * 64 = 129, clamped to the int8 maximum of 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

// V contributions to B, G and R.
#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V coefficients of each channel multiplied by 128 and summed.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

// Y contribution shared by all three channels.
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001238
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001239struct {
1240 vec8 kUVToB;
1241 vec8 kUVToG;
1242 vec8 kUVToR;
1243 vec16 kUVBiasB;
1244 vec16 kUVBiasG;
1245 vec16 kUVBiasR;
1246 vec16 kYSub16;
1247 vec16 kYToRgb;
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001248} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001249 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1250 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1251 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1252 { BB, BB, BB, BB, BB, BB, BB, BB },
1253 { BG, BG, BG, BG, BG, BG, BG, BG },
1254 { BR, BR, BR, BR, BR, BR, BR, BR },
1255 { 16, 16, 16, 16, 16, 16, 16, 16 },
1256 { YG, YG, YG, YG, YG, YG, YG, YG }
1257};
1258
// Convert 8 pixels.
// Asm fragment shared by the I420 row functions.  Operand contract:
//   %0  Y pointer, advanced by 8.
//   %1  U pointer, advanced by 4.
//   %2  V pointer minus U pointer, so (%1,%2,1) addresses V.
//   %5  address of kYuvConstants (members read at offsets 0..112).
//   xmm4 must be zero on entry; it widens the Y bytes to words.
// Each U/V byte is duplicated so it covers two horizontally adjacent pixels.
// On exit the low 8 bytes of xmm0, xmm1 and xmm2 hold the B, G and R values;
// xmm3 is used as scratch.
#define YUVTORGB \
    "movd (%1),%%xmm0 \n" \
    "movd (%1,%2,1),%%xmm1 \n" \
    "lea 0x4(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001288
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001289void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
1290 const uint8* u_buf,
1291 const uint8* v_buf,
1292 uint8* rgb_buf,
1293 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001294 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001295 "sub %1,%2 \n"
1296 "pcmpeqb %%xmm5,%%xmm5 \n"
1297 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001298 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001299 "1: \n"
1300 YUVTORGB
1301 "punpcklbw %%xmm1,%%xmm0 \n"
1302 "punpcklbw %%xmm5,%%xmm2 \n"
1303 "movdqa %%xmm0,%%xmm1 \n"
1304 "punpcklwd %%xmm2,%%xmm0 \n"
1305 "punpckhwd %%xmm2,%%xmm1 \n"
1306 "movdqa %%xmm0,(%3) \n"
1307 "movdqa %%xmm1,0x10(%3) \n"
1308 "lea 0x20(%3),%3 \n"
1309 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001310 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001311 : "+r"(y_buf), // %0
1312 "+r"(u_buf), // %1
1313 "+r"(v_buf), // %2
1314 "+r"(rgb_buf), // %3
1315 "+rm"(width) // %4
1316 : "r"(&kYuvConstants.kUVToB) // %5
1317 : "memory", "cc"
1318#if defined(__SSE2__)
1319 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1320#endif
1321 );
1322}
1323
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001324void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
1325 const uint8* u_buf,
1326 const uint8* v_buf,
1327 uint8* rgb_buf,
1328 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001329 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001330 "sub %1,%2 \n"
1331 "pcmpeqb %%xmm5,%%xmm5 \n"
1332 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001333 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001334 "1: \n"
1335 YUVTORGB
1336 "pcmpeqb %%xmm5,%%xmm5 \n"
1337 "punpcklbw %%xmm0,%%xmm1 \n"
1338 "punpcklbw %%xmm2,%%xmm5 \n"
1339 "movdqa %%xmm5,%%xmm0 \n"
1340 "punpcklwd %%xmm1,%%xmm5 \n"
1341 "punpckhwd %%xmm1,%%xmm0 \n"
1342 "movdqa %%xmm5,(%3) \n"
1343 "movdqa %%xmm0,0x10(%3) \n"
1344 "lea 0x20(%3),%3 \n"
1345 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001346 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001347 : "+r"(y_buf), // %0
1348 "+r"(u_buf), // %1
1349 "+r"(v_buf), // %2
1350 "+r"(rgb_buf), // %3
1351 "+rm"(width) // %4
1352 : "r"(&kYuvConstants.kUVToB) // %5
1353 : "memory", "cc"
1354#if defined(__SSE2__)
1355 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1356#endif
1357 );
1358}
1359
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001360void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
1361 const uint8* u_buf,
1362 const uint8* v_buf,
1363 uint8* rgb_buf,
1364 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001365 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001366 "sub %1,%2 \n"
1367 "pcmpeqb %%xmm5,%%xmm5 \n"
1368 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001369 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001370 "1: \n"
1371 YUVTORGB
1372 "punpcklbw %%xmm1,%%xmm2 \n"
1373 "punpcklbw %%xmm5,%%xmm0 \n"
1374 "movdqa %%xmm2,%%xmm1 \n"
1375 "punpcklwd %%xmm0,%%xmm2 \n"
1376 "punpckhwd %%xmm0,%%xmm1 \n"
1377 "movdqa %%xmm2,(%3) \n"
1378 "movdqa %%xmm1,0x10(%3) \n"
1379 "lea 0x20(%3),%3 \n"
1380 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001381 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001382 : "+r"(y_buf), // %0
1383 "+r"(u_buf), // %1
1384 "+r"(v_buf), // %2
1385 "+r"(rgb_buf), // %3
1386 "+rm"(width) // %4
1387 : "r"(&kYuvConstants.kUVToB) // %5
1388 : "memory", "cc"
1389#if defined(__SSE2__)
1390 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1391#endif
1392 );
1393}
1394
fbarchard@google.com952a5072012-03-30 18:10:50 +00001395void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1396 const uint8* u_buf,
1397 const uint8* v_buf,
1398 uint8* rgb_buf,
1399 int width) {
1400 asm volatile (
1401 "sub %1,%2 \n"
1402 "pcmpeqb %%xmm5,%%xmm5 \n"
1403 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001404 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001405 "1: \n"
1406 YUVTORGB
1407 "punpcklbw %%xmm1,%%xmm0 \n"
1408 "punpcklbw %%xmm5,%%xmm2 \n"
1409 "movdqa %%xmm0,%%xmm1 \n"
1410 "punpcklwd %%xmm2,%%xmm0 \n"
1411 "punpckhwd %%xmm2,%%xmm1 \n"
1412 "movdqu %%xmm0,(%3) \n"
1413 "movdqu %%xmm1,0x10(%3) \n"
1414 "lea 0x20(%3),%3 \n"
1415 "sub $0x8,%4 \n"
1416 "jg 1b \n"
1417 : "+r"(y_buf), // %0
1418 "+r"(u_buf), // %1
1419 "+r"(v_buf), // %2
1420 "+r"(rgb_buf), // %3
1421 "+rm"(width) // %4
1422 : "r"(&kYuvConstants.kUVToB) // %5
1423 : "memory", "cc"
1424#if defined(__SSE2__)
1425 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1426#endif
1427 );
1428}
1429
1430void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
1431 const uint8* u_buf,
1432 const uint8* v_buf,
1433 uint8* rgb_buf,
1434 int width) {
1435 asm volatile (
1436 "sub %1,%2 \n"
1437 "pcmpeqb %%xmm5,%%xmm5 \n"
1438 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001439 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001440 "1: \n"
1441 YUVTORGB
1442 "pcmpeqb %%xmm5,%%xmm5 \n"
1443 "punpcklbw %%xmm0,%%xmm1 \n"
1444 "punpcklbw %%xmm2,%%xmm5 \n"
1445 "movdqa %%xmm5,%%xmm0 \n"
1446 "punpcklwd %%xmm1,%%xmm5 \n"
1447 "punpckhwd %%xmm1,%%xmm0 \n"
1448 "movdqu %%xmm5,(%3) \n"
1449 "movdqu %%xmm0,0x10(%3) \n"
1450 "lea 0x20(%3),%3 \n"
1451 "sub $0x8,%4 \n"
1452 "jg 1b \n"
1453 : "+r"(y_buf), // %0
1454 "+r"(u_buf), // %1
1455 "+r"(v_buf), // %2
1456 "+r"(rgb_buf), // %3
1457 "+rm"(width) // %4
1458 : "r"(&kYuvConstants.kUVToB) // %5
1459 : "memory", "cc"
1460#if defined(__SSE2__)
1461 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1462#endif
1463 );
1464}
1465
1466void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
1467 const uint8* u_buf,
1468 const uint8* v_buf,
1469 uint8* rgb_buf,
1470 int width) {
1471 asm volatile (
1472 "sub %1,%2 \n"
1473 "pcmpeqb %%xmm5,%%xmm5 \n"
1474 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001475 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001476 "1: \n"
1477 YUVTORGB
1478 "punpcklbw %%xmm1,%%xmm2 \n"
1479 "punpcklbw %%xmm5,%%xmm0 \n"
1480 "movdqa %%xmm2,%%xmm1 \n"
1481 "punpcklwd %%xmm0,%%xmm2 \n"
1482 "punpckhwd %%xmm0,%%xmm1 \n"
1483 "movdqu %%xmm2,(%3) \n"
1484 "movdqu %%xmm1,0x10(%3) \n"
1485 "lea 0x20(%3),%3 \n"
1486 "sub $0x8,%4 \n"
1487 "jg 1b \n"
1488 : "+r"(y_buf), // %0
1489 "+r"(u_buf), // %1
1490 "+r"(v_buf), // %2
1491 "+r"(rgb_buf), // %3
1492 "+rm"(width) // %4
1493 : "r"(&kYuvConstants.kUVToB) // %5
1494 : "memory", "cc"
1495#if defined(__SSE2__)
1496 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1497#endif
1498 );
1499}
1500
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001501void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1502 const uint8* u_buf,
1503 const uint8* v_buf,
1504 uint8* rgb_buf,
1505 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001506 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001507 "sub %1,%2 \n"
1508 "pcmpeqb %%xmm5,%%xmm5 \n"
1509 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001510 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001511 "1: \n"
1512 "movd (%1),%%xmm0 \n"
1513 "movd (%1,%2,1),%%xmm1 \n"
1514 "lea 0x4(%1),%1 \n"
1515 "punpcklbw %%xmm1,%%xmm0 \n"
1516 "movdqa %%xmm0,%%xmm1 \n"
1517 "movdqa %%xmm0,%%xmm2 \n"
1518 "pmaddubsw (%5),%%xmm0 \n"
1519 "pmaddubsw 16(%5),%%xmm1 \n"
1520 "pmaddubsw 32(%5),%%xmm2 \n"
1521 "psubw 48(%5),%%xmm0 \n"
1522 "psubw 64(%5),%%xmm1 \n"
1523 "psubw 80(%5),%%xmm2 \n"
1524 "movd (%0),%%xmm3 \n"
1525 "lea 0x4(%0),%0 \n"
1526 "punpcklbw %%xmm4,%%xmm3 \n"
1527 "psubsw 96(%5),%%xmm3 \n"
1528 "pmullw 112(%5),%%xmm3 \n"
1529 "paddsw %%xmm3,%%xmm0 \n"
1530 "paddsw %%xmm3,%%xmm1 \n"
1531 "paddsw %%xmm3,%%xmm2 \n"
1532 "psraw $0x6,%%xmm0 \n"
1533 "psraw $0x6,%%xmm1 \n"
1534 "psraw $0x6,%%xmm2 \n"
1535 "packuswb %%xmm0,%%xmm0 \n"
1536 "packuswb %%xmm1,%%xmm1 \n"
1537 "packuswb %%xmm2,%%xmm2 \n"
1538 "punpcklbw %%xmm1,%%xmm0 \n"
1539 "punpcklbw %%xmm5,%%xmm2 \n"
1540 "punpcklwd %%xmm2,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001541 "sub $0x4,%4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001542 "movdqa %%xmm0,(%3) \n"
1543 "lea 0x10(%3),%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001544 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001545 : "+r"(y_buf), // %0
1546 "+r"(u_buf), // %1
1547 "+r"(v_buf), // %2
1548 "+r"(rgb_buf), // %3
1549 "+rm"(width) // %4
1550 : "r"(&kYuvConstants.kUVToB) // %5
1551 : "memory", "cc"
1552#if defined(__SSE2__)
1553 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1554#endif
1555 );
1556}
1557#endif
1558
1559#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001560void YToARGBRow_SSE2(const uint8* y_buf,
1561 uint8* rgb_buf,
1562 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001563 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001564 "pcmpeqb %%xmm4,%%xmm4 \n"
1565 "pslld $0x18,%%xmm4 \n"
1566 "mov $0x10001000,%%eax \n"
1567 "movd %%eax,%%xmm3 \n"
1568 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1569 "mov $0x012a012a,%%eax \n"
1570 "movd %%eax,%%xmm2 \n"
1571 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001572 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001573 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001574 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001575 "movq (%0),%%xmm0 \n"
1576 "lea 0x8(%0),%0 \n"
1577 "punpcklbw %%xmm0,%%xmm0 \n"
1578 "psubusw %%xmm3,%%xmm0 \n"
1579 "pmulhuw %%xmm2,%%xmm0 \n"
1580 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001581
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001582 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001583 "punpcklbw %%xmm0,%%xmm0 \n"
1584 "movdqa %%xmm0,%%xmm1 \n"
1585 "punpcklwd %%xmm0,%%xmm0 \n"
1586 "punpckhwd %%xmm1,%%xmm1 \n"
1587 "por %%xmm4,%%xmm0 \n"
1588 "por %%xmm4,%%xmm1 \n"
1589 "movdqa %%xmm0,(%1) \n"
1590 "movdqa %%xmm1,16(%1) \n"
1591 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001592
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001593 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001594 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00001595 : "+r"(y_buf), // %0
1596 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00001597 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001598 :
1599 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001600#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001601 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001602#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001603 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001604}
fbarchard@google.comb6149762011-11-07 21:58:52 +00001605#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001606
fbarchard@google.com42831e02012-01-21 02:54:17 +00001607#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001608// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001609CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001610 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1611};
1612
fbarchard@google.com42831e02012-01-21 02:54:17 +00001613void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001614 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001615 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001616 "movdqa %3,%%xmm5 \n"
1617 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001618 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001619 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001620 "movdqa (%0,%2),%%xmm0 \n"
1621 "pshufb %%xmm5,%%xmm0 \n"
1622 "sub $0x10,%2 \n"
1623 "movdqa %%xmm0,(%1) \n"
1624 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001625 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00001626 : "+r"(src), // %0
1627 "+r"(dst), // %1
1628 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001629 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001630 : "memory", "cc"
1631#if defined(__SSE2__)
1632 , "xmm0", "xmm5"
1633#endif
1634 );
1635}
1636#endif
1637
#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of |src| to |dst| in reverse order, 16 bytes per pass,
// using only SSE2 instructions. Loads and stores use movdqu, so neither
// pointer needs to be aligned. The loop always runs at least once and repeats
// while width, reduced by 16 per pass, stays positive.
//
// The 16-byte reversal is built in three steps:
//   1. swap the two bytes inside every 16-bit word (psllw/psrlw/por),
//   2. reverse the four words inside each 64-bit half (pshuflw/pshufhw 0x1b),
//   3. swap the two 64-bit halves (pshufd 0x4e).
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  // Pointer-sized copy of width so it can serve as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "lea       -0x10(%0),%0              \n"  // (%0,%2) -> last 16 src bytes
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqu    (%0,%2),%%xmm0            \n"
    "movdqa    %%xmm0,%%xmm1             \n"
    "psllw     $0x8,%%xmm0               \n"
    "psrlw     $0x8,%%xmm1               \n"
    "por       %%xmm1,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0       \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0       \n"
    "pshufd    $0x4e,%%xmm0,%%xmm0       \n"
    "sub       $0x10,%2                  \n"  // sets the flags tested by jg
    "movdqu    %%xmm0,(%1)               \n"
    "lea       0x10(%1),%1               \n"
    "jg        1b                        \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_SSE2
1668
#ifdef HAS_MIRRORROW_UV_SSSE3
// pshufb control mask: the low 8 result bytes are the even source bytes in
// reverse order, the high 8 result bytes are the odd source bytes in reverse
// order.
CONST uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

// Mirrors a row of interleaved byte pairs and splits it into two planes.
// Each pass reads 16 bytes (8 pairs) from the end of |src| moving backwards,
// writes the 8 even bytes (reversed) to |dst_u| and the 8 odd bytes
// (reversed) to |dst_v|. |width| counts pairs and is reduced by 8 per pass;
// the loop always runs at least once and repeats while width stays positive.
// src is loaded with movdqa, so src + 2 * width - 16 must be 16-byte aligned.
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  // Pointer-sized copy of width so it can be used in address arithmetic.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa    %4,%%xmm1                 \n"  // xmm1 = shuffle mask
    "lea       -16(%0,%3,2),%0           \n"  // %0 -> last 16 src bytes
    "sub       %1,%2                     \n"  // %2 = dst_v - dst_u
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "lea       -16(%0),%0                \n"
    "pshufb    %%xmm1,%%xmm0             \n"
    "sub       $8,%3                     \n"  // sets the flags tested by jg
    "movlpd    %%xmm0,(%1)               \n"  // low 8 bytes  -> dst_u
    "movhpd    %%xmm0,(%1,%2)            \n"  // high 8 bytes -> dst_v
    "lea       8(%1),%1                  \n"
    "jg        1b                        \n"
  : "+r"(src),         // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(temp_width)   // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_UV_SSSE3
1703
#ifdef HAS_ADDROW_SSE2
// Adds a row of bytes into a row of 16-bit accumulators:
//   dst[i] = saturate_u16(dst[i] + src[i])
// 16 elements are processed per pass; the loop always runs at least once and
// repeats while width, reduced by 16 per pass, stays positive.
// src is loaded with movdqu (no alignment needed); dst is loaded and stored
// with movdqa and must be 16-byte aligned.
void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
  asm volatile (
    "pxor      %%xmm4,%%xmm4             \n"  // xmm4 = 0 for zero-extension
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqu    (%0),%%xmm2               \n"
    "lea       0x10(%0),%0               \n"
    "movdqa    (%1),%%xmm0               \n"
    "movdqa    0x10(%1),%%xmm1           \n"
    "movdqa    %%xmm2,%%xmm3             \n"
    "punpcklbw %%xmm4,%%xmm2             \n"  // low 8 bytes  -> 8 words
    "punpckhbw %%xmm4,%%xmm3             \n"  // high 8 bytes -> 8 words
    "paddusw   %%xmm2,%%xmm0             \n"
    "paddusw   %%xmm3,%%xmm1             \n"
    "sub       $0x10,%2                  \n"  // sets the flags tested by jg
    "movdqa    %%xmm0,(%1)               \n"
    "movdqa    %%xmm1,0x10(%1)           \n"
    "lea       0x20(%1),%1               \n"
    "jg        1b                        \n"
  : "+r"(src),     // %0
    "+r"(dst),     // %1
    "+r"(width)    // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

// Subtracts a row of bytes from a row of 16-bit accumulators:
//   dst[i] = saturate_u16(dst[i] - src[i])   (clamps at 0)
// Same loop structure and alignment requirements as AddRow_SSE2: src may be
// unaligned, dst must be 16-byte aligned, 16 elements per pass.
void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
  asm volatile (
    "pxor      %%xmm4,%%xmm4             \n"  // xmm4 = 0 for zero-extension
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqu    (%0),%%xmm2               \n"
    "lea       0x10(%0),%0               \n"
    "movdqa    (%1),%%xmm0               \n"
    "movdqa    0x10(%1),%%xmm1           \n"
    "movdqa    %%xmm2,%%xmm3             \n"
    "punpcklbw %%xmm4,%%xmm2             \n"  // low 8 bytes  -> 8 words
    "punpckhbw %%xmm4,%%xmm3             \n"  // high 8 bytes -> 8 words
    "psubusw   %%xmm2,%%xmm0             \n"
    "psubusw   %%xmm3,%%xmm1             \n"
    "sub       $0x10,%2                  \n"  // sets the flags tested by jg
    "movdqa    %%xmm0,(%1)               \n"
    "movdqa    %%xmm1,0x10(%1)           \n"
    "lea       0x20(%1),%1               \n"
    "jg        1b                        \n"
  : "+r"(src),     // %0
    "+r"(dst),     // %1
    "+r"(width)    // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ADDROW_SSE2
1767
#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of byte pairs: even bytes of |src_uv| go to |dst_u|,
// odd bytes go to |dst_v|. Each pass consumes 32 source bytes and writes 16
// bytes to each destination; |pix| is reduced by 16 per pass and the loop
// (which always runs at least once) repeats while it stays positive.
// All loads and stores use movdqa, so src_uv, dst_u and dst_v must be
// 16-byte aligned.
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psrlw     $0x8,%%xmm5               \n"  // xmm5 = 0x00ff in every word
    "sub       %1,%2                     \n"  // %2 = dst_v - dst_u
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "movdqa    0x10(%0),%%xmm1           \n"
    "lea       0x20(%0),%0               \n"
    "movdqa    %%xmm0,%%xmm2             \n"
    "movdqa    %%xmm1,%%xmm3             \n"
    "pand      %%xmm5,%%xmm0             \n"  // keep even bytes
    "pand      %%xmm5,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "psrlw     $0x8,%%xmm2               \n"  // keep odd bytes
    "psrlw     $0x8,%%xmm3               \n"
    "packuswb  %%xmm3,%%xmm2             \n"
    "movdqa    %%xmm0,(%1)               \n"
    "movdqa    %%xmm2,(%1,%2)            \n"
    "lea       0x10(%1),%1               \n"
    "sub       $0x10,%3                  \n"
    "jg        1b                        \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SPLITUV_SSE2
1804
#ifdef HAS_COPYROW_SSE2
// Copies |count| bytes from |src| to |dst|, 32 bytes per pass. The loop
// always runs at least once and repeats while count, reduced by 32 per pass,
// stays positive. Loads and stores use movdqa, so src and dst must both be
// 16-byte aligned.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    "sub       %0,%1                     \n"  // %1 = dst - src
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "movdqa    0x10(%0),%%xmm1           \n"
    "movdqa    %%xmm0,(%0,%1)            \n"
    "movdqa    %%xmm1,0x10(%0,%1)        \n"
    "lea       0x20(%0),%0               \n"
    "sub       $0x20,%2                  \n"
    "jg        1b                        \n"
  : "+r"(src),     // %0
    "+r"(dst),     // %1
    "+r"(count)    // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_COPYROW_SSE2
1829
#ifdef HAS_COPYROW_X86
// Copies width / 4 32-bit words from |src| to |dst| with `rep movsl`.
// Any trailing width % 4 bytes are NOT copied. The operands are pinned to the
// source-index, destination-index and count registers ("S", "D", "c") because
// the string instruction uses them implicitly.
// NOTE(review): a negative width would convert to a very large count;
// callers are presumably expected to pass a non-negative value - confirm.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    "shr       $0x2,%2                   \n"  // bytes -> 32-bit words
    "rep movsl                           \n"
  : "+S"(src),        // %0
    "+D"(dst),        // %1
    "+c"(width_tmp)   // %2
  :
  : "memory", "cc"
  );
}
#endif  // HAS_COPYROW_X86
1844
#ifdef HAS_YUY2TOYROW_SSE2
// All functions in this section consume 32 source bytes (16 two-byte pixels)
// per pass, reduce |pix| by 16 per pass, always run the loop at least once
// and repeat while pix stays positive. The plain variants use movdqa for
// 16-byte loads/stores (pointers must be 16-byte aligned); the _Unaligned
// variants use movdqu instead.

// Extracts the even-numbered bytes (the Y samples of YUY2) from |src_yuy2|
// into |dst_y|, 16 bytes out per pass. Aligned loads and stores.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psrlw     $0x8,%%xmm5               \n"  // xmm5 = 0x00ff in every word
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "movdqa    0x10(%0),%%xmm1           \n"
    "lea       0x20(%0),%0               \n"
    "pand      %%xmm5,%%xmm0             \n"
    "pand      %%xmm5,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "movdqa    %%xmm0,(%1)               \n"
    "lea       0x10(%1),%1               \n"
    "sub       $0x10,%2                  \n"
    "jg        1b                        \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

// Averages two source rows (|src_yuy2| and the row |stride_yuy2| bytes after
// it) with pavgb, takes the odd-numbered bytes (the interleaved chroma of
// YUY2) and splits them: the first of each chroma pair goes to |dst_u|, the
// second goes to the pointer passed as |dst_y|. 8 bytes are written to each
// destination per pass with movq. Source loads are aligned.
// NOTE(review): the last pointer parameter is named dst_y but receives the
// second chroma plane (V); the name is kept unchanged here.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psrlw     $0x8,%%xmm5               \n"  // xmm5 = 0x00ff in every word
    "sub       %1,%2                     \n"  // %2 = second dst - dst_u
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "movdqa    0x10(%0),%%xmm1           \n"
    "movdqa    (%0,%4,1),%%xmm2          \n"
    "movdqa    0x10(%0,%4,1),%%xmm3      \n"
    "lea       0x20(%0),%0               \n"
    "pavgb     %%xmm2,%%xmm0             \n"
    "pavgb     %%xmm3,%%xmm1             \n"
    "psrlw     $0x8,%%xmm0               \n"  // keep odd bytes (chroma)
    "psrlw     $0x8,%%xmm1               \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "movdqa    %%xmm0,%%xmm1             \n"
    "pand      %%xmm5,%%xmm0             \n"  // first of each chroma pair
    "packuswb  %%xmm0,%%xmm0             \n"
    "psrlw     $0x8,%%xmm1               \n"  // second of each chroma pair
    "packuswb  %%xmm1,%%xmm1             \n"
    "movq      %%xmm0,(%1)               \n"
    "movq      %%xmm1,(%1,%2)            \n"
    "lea       0x8(%1),%1                \n"
    "sub       $0x10,%3                  \n"
    "jg        1b                        \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_y),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

// Same operation as YUY2ToYRow_SSE2 but with unaligned (movdqu) loads and
// stores, so neither pointer needs 16-byte alignment.
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psrlw     $0x8,%%xmm5               \n"  // xmm5 = 0x00ff in every word
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqu    (%0),%%xmm0               \n"
    "movdqu    0x10(%0),%%xmm1           \n"
    "lea       0x20(%0),%0               \n"
    "pand      %%xmm5,%%xmm0             \n"
    "pand      %%xmm5,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "sub       $0x10,%2                  \n"  // sets the flags tested by jg
    "movdqu    %%xmm0,(%1)               \n"
    "lea       0x10(%1),%1               \n"
    "jg        1b                        \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

// Same operation as YUY2ToUVRow_SSE2 but with unaligned (movdqu) source
// loads. NOTE(review): dst_y receives the second chroma plane (V).
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_y,
                                int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psrlw     $0x8,%%xmm5               \n"  // xmm5 = 0x00ff in every word
    "sub       %1,%2                     \n"  // %2 = second dst - dst_u
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqu    (%0),%%xmm0               \n"
    "movdqu    0x10(%0),%%xmm1           \n"
    "movdqu    (%0,%4,1),%%xmm2          \n"
    "movdqu    0x10(%0,%4,1),%%xmm3      \n"
    "lea       0x20(%0),%0               \n"
    "pavgb     %%xmm2,%%xmm0             \n"
    "pavgb     %%xmm3,%%xmm1             \n"
    "psrlw     $0x8,%%xmm0               \n"  // keep odd bytes (chroma)
    "psrlw     $0x8,%%xmm1               \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "movdqa    %%xmm0,%%xmm1             \n"
    "pand      %%xmm5,%%xmm0             \n"  // first of each chroma pair
    "packuswb  %%xmm0,%%xmm0             \n"
    "psrlw     $0x8,%%xmm1               \n"  // second of each chroma pair
    "packuswb  %%xmm1,%%xmm1             \n"
    "movq      %%xmm0,(%1)               \n"
    "movq      %%xmm1,(%1,%2)            \n"
    "lea       0x8(%1),%1                \n"
    "sub       $0x10,%3                  \n"
    "jg        1b                        \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_y),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

// Extracts the odd-numbered bytes (the Y samples of UYVY) from |src_uyvy|
// into |dst_y|, 16 bytes out per pass. Aligned loads and stores.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "movdqa    0x10(%0),%%xmm1           \n"
    "lea       0x20(%0),%0               \n"
    "psrlw     $0x8,%%xmm0               \n"  // keep odd bytes
    "psrlw     $0x8,%%xmm1               \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "sub       $0x10,%2                  \n"  // sets the flags tested by jg
    "movdqa    %%xmm0,(%1)               \n"
    "lea       0x10(%1),%1               \n"
    "jg        1b                        \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Averages two source rows (|src_uyvy| and the row |stride_uyvy| bytes after
// it) with pavgb, takes the even-numbered bytes (the interleaved chroma of
// UYVY) and splits them: the first of each chroma pair goes to |dst_u|, the
// second goes to the pointer passed as |dst_y|. 8 bytes are written to each
// destination per pass with movq. Source loads are aligned.
// NOTE(review): the last pointer parameter is named dst_y but receives the
// second chroma plane (V); the name is kept unchanged here.
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psrlw     $0x8,%%xmm5               \n"  // xmm5 = 0x00ff in every word
    "sub       %1,%2                     \n"  // %2 = second dst - dst_u
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "movdqa    0x10(%0),%%xmm1           \n"
    "movdqa    (%0,%4,1),%%xmm2          \n"
    "movdqa    0x10(%0,%4,1),%%xmm3      \n"
    "lea       0x20(%0),%0               \n"
    "pavgb     %%xmm2,%%xmm0             \n"
    "pavgb     %%xmm3,%%xmm1             \n"
    "pand      %%xmm5,%%xmm0             \n"  // keep even bytes (chroma)
    "pand      %%xmm5,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "movdqa    %%xmm0,%%xmm1             \n"
    "pand      %%xmm5,%%xmm0             \n"  // first of each chroma pair
    "packuswb  %%xmm0,%%xmm0             \n"
    "psrlw     $0x8,%%xmm1               \n"  // second of each chroma pair
    "packuswb  %%xmm1,%%xmm1             \n"
    "movq      %%xmm0,(%1)               \n"
    "movq      %%xmm1,(%1,%2)            \n"
    "lea       0x8(%1),%1                \n"
    "sub       $0x10,%3                  \n"
    "jg        1b                        \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_y),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

// Same operation as UYVYToYRow_SSE2 but with unaligned (movdqu) loads and
// stores, so neither pointer needs 16-byte alignment.
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqu    (%0),%%xmm0               \n"
    "movdqu    0x10(%0),%%xmm1           \n"
    "lea       0x20(%0),%0               \n"
    "psrlw     $0x8,%%xmm0               \n"  // keep odd bytes
    "psrlw     $0x8,%%xmm1               \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "sub       $0x10,%2                  \n"  // sets the flags tested by jg
    "movdqu    %%xmm0,(%1)               \n"
    "lea       0x10(%1),%1               \n"
    "jg        1b                        \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Same operation as UYVYToUVRow_SSE2 but with unaligned (movdqu) source
// loads. NOTE(review): dst_y receives the second chroma plane (V).
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psrlw     $0x8,%%xmm5               \n"  // xmm5 = 0x00ff in every word
    "sub       %1,%2                     \n"  // %2 = second dst - dst_u
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqu    (%0),%%xmm0               \n"
    "movdqu    0x10(%0),%%xmm1           \n"
    "movdqu    (%0,%4,1),%%xmm2          \n"
    "movdqu    0x10(%0,%4,1),%%xmm3      \n"
    "lea       0x20(%0),%0               \n"
    "pavgb     %%xmm2,%%xmm0             \n"
    "pavgb     %%xmm3,%%xmm1             \n"
    "pand      %%xmm5,%%xmm0             \n"  // keep even bytes (chroma)
    "pand      %%xmm5,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "movdqa    %%xmm0,%%xmm1             \n"
    "pand      %%xmm5,%%xmm0             \n"  // first of each chroma pair
    "packuswb  %%xmm0,%%xmm0             \n"
    "psrlw     $0x8,%%xmm1               \n"  // second of each chroma pair
    "packuswb  %%xmm1,%%xmm1             \n"
    "movq      %%xmm0,(%1)               \n"
    "movq      %%xmm1,(%1,%2)            \n"
    "lea       0x8(%1),%1                \n"
    "sub       $0x10,%3                  \n"
    "jg        1b                        \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_y),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002115
#ifdef HAS_ARGBBLENDROW_SSE2
// Blends src_argb0 over src_argb1 into dst_argb, 4 pixels per half-pass
// (the loop body is unrolled to 8 pixels with an early exit after 4).
// For every 32-bit pixel, with a0 = the top byte of the src_argb0 pixel:
//   colour bytes: dst = saturate_u8(src0 + ((src1 * (256 - a0)) >> 8))
//   top byte:     dst = 0xff
// NOTE(review): adding src0 unscaled implies src_argb0 is expected to be
// premultiplied by its alpha - confirm against callers.
// src_argb0 and src_argb1 are loaded with movdqu and may be unaligned.
// dst_argb is stored with movdqa and must be 16-byte aligned.
// width is reduced by 4 per half-pass; it should be a multiple of 4.
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                               uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7             \n"
    "psrlw     $0xf,%%xmm7               \n"  // xmm7 = 0x0001 in every word
    "pcmpeqb   %%xmm6,%%xmm6             \n"
    "psrlw     $0x8,%%xmm6               \n"  // xmm6 = 0x00ff in every word
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psllw     $0x8,%%xmm5               \n"  // xmm5 = 0xff00 in every word
    "pcmpeqb   %%xmm4,%%xmm4             \n"
    "pslld     $0x18,%%xmm4              \n"  // xmm4 = 0xff000000 per pixel

    // 8 pixel loop
    ".p2align  4                         \n"
  "1:                                    \n"
    // First 4 pixels.
    "movdqu    (%0),%%xmm3               \n"
    "movdqa    %%xmm3,%%xmm0             \n"
    "pxor      %%xmm4,%%xmm3             \n"  // top byte -> 255 - a0
    "movdqu    (%1),%%xmm2               \n"
    "psrlw     $0x8,%%xmm3               \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3       \n"  // replicate (255 - a0) to both
    "pshuflw   $0xf5,%%xmm3,%%xmm3       \n"  // words of each pixel
    "pand      %%xmm6,%%xmm2             \n"  // even bytes of src1
    "paddw     %%xmm7,%%xmm3             \n"  // 256 - a0
    "pmullw    %%xmm3,%%xmm2             \n"
    "movdqu    (%1),%%xmm1               \n"
    "psrlw     $0x8,%%xmm1               \n"  // odd bytes of src1
    "por       %%xmm4,%%xmm0             \n"  // force top byte of src0 to 0xff
    "pmullw    %%xmm3,%%xmm1             \n"
    "lea       0x20(%0),%0               \n"
    "psrlw     $0x8,%%xmm2               \n"
    "paddusb   %%xmm2,%%xmm0             \n"
    "pand      %%xmm5,%%xmm1             \n"
    "paddusb   %%xmm1,%%xmm0             \n"
    "sub       $0x4,%3                   \n"
    "movdqa    %%xmm0,(%2)               \n"
    "jle       9f                        \n"
    // Second 4 pixels. The src_argb0 load is issued only after the exit test
    // so that no bytes past the end of the row are read when just 4 pixels
    // remained. %0 was already advanced by 0x20, hence the -0x10 offset.
    "movdqu    -0x10(%0),%%xmm3          \n"
    "movdqa    %%xmm3,%%xmm0             \n"
    "pxor      %%xmm4,%%xmm3             \n"
    "movdqu    0x10(%1),%%xmm2           \n"
    "psrlw     $0x8,%%xmm3               \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3       \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3       \n"
    "pand      %%xmm6,%%xmm2             \n"
    "paddw     %%xmm7,%%xmm3             \n"
    "pmullw    %%xmm3,%%xmm2             \n"
    "movdqu    0x10(%1),%%xmm1           \n"
    "lea       0x20(%1),%1               \n"
    "psrlw     $0x8,%%xmm1               \n"
    "por       %%xmm4,%%xmm0             \n"
    "pmullw    %%xmm3,%%xmm1             \n"
    "psrlw     $0x8,%%xmm2               \n"
    "paddusb   %%xmm2,%%xmm0             \n"
    "pand      %%xmm5,%%xmm1             \n"
    "paddusb   %%xmm1,%%xmm0             \n"
    "sub       $0x4,%3                   \n"
    "movdqa    %%xmm0,0x10(%2)           \n"
    "lea       0x20(%2),%2               \n"
    "jg        1b                        \n"
  "9:                                    \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002194
#ifdef HAS_ARGBBLENDROW1_SSE2
// Blends src_argb0 over src_argb1 into dst_argb one 32-bit pixel per pass.
// With a0 = the top byte of the src_argb0 pixel:
//   colour bytes: dst = saturate_u8(src0 + ((src1 * (256 - a0)) >> 8))
//   top byte:     dst = 0xff
// Pixels are moved with movd, so no pointer needs to be aligned. The loop
// always runs at least once and repeats while width, reduced by 1 per pass,
// stays positive.
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7             \n"
    "psrlw     $0xf,%%xmm7               \n"  // xmm7 = 0x0001 in every word
    "pcmpeqb   %%xmm6,%%xmm6             \n"
    "psrlw     $0x8,%%xmm6               \n"  // xmm6 = 0x00ff in every word
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psllw     $0x8,%%xmm5               \n"  // xmm5 = 0xff00 in every word
    "pcmpeqb   %%xmm4,%%xmm4             \n"
    "pslld     $0x18,%%xmm4              \n"  // xmm4 = 0xff000000 per pixel

    // 1 pixel loop
    ".p2align  4                         \n"
  "1:                                    \n"
    "movd      (%0),%%xmm3               \n"
    "lea       0x4(%0),%0                \n"
    "movdqa    %%xmm3,%%xmm0             \n"
    "pxor      %%xmm4,%%xmm3             \n"  // top byte -> 255 - a0
    "movd      (%1),%%xmm2               \n"
    "psrlw     $0x8,%%xmm3               \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3       \n"  // replicate (255 - a0) to both
    "pshuflw   $0xf5,%%xmm3,%%xmm3       \n"  // words of the pixel
    "pand      %%xmm6,%%xmm2             \n"  // even bytes of src1
    "paddw     %%xmm7,%%xmm3             \n"  // 256 - a0
    "pmullw    %%xmm3,%%xmm2             \n"
    "movd      (%1),%%xmm1               \n"
    "lea       0x4(%1),%1                \n"
    "psrlw     $0x8,%%xmm1               \n"  // odd bytes of src1
    "por       %%xmm4,%%xmm0             \n"  // force top byte of src0 to 0xff
    "pmullw    %%xmm3,%%xmm1             \n"
    "psrlw     $0x8,%%xmm2               \n"
    "paddusb   %%xmm2,%%xmm0             \n"
    "pand      %%xmm5,%%xmm1             \n"
    "paddusb   %%xmm1,%%xmm0             \n"
    "sub       $0x1,%3                   \n"  // sets the flags tested by jg
    "movd      %%xmm0,(%2)               \n"
    "lea       0x4(%2),%2                \n"
    "jg        1b                        \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002248
#ifdef HAS_ARGBBLENDROW_SSSE3
// pshufb control mask that copies byte 3 of each 32-bit pixel (its top byte)
// into the low byte of both 16-bit words of that pixel and zeroes the high
// bytes (0x80 selects zero).
CONST uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blends src_argb0 over src_argb1 into dst_argb, 4 pixels per half-pass
// (the loop body is unrolled to 8 pixels with an early exit after 4).
// For every 32-bit pixel, with a0 = the top byte of the src_argb0 pixel:
//   colour bytes: dst = saturate_u8(src0 + ((src1 * (256 - a0)) >> 8))
//   top byte:     dst = 0xff
// src_argb0 and src_argb1 are loaded with movdqu and may be unaligned.
// dst_argb is stored with movdqa and must be 16-byte aligned.
// kShuffleAlpha is used as a direct memory operand of pshufb, which requires
// it to be 16-byte aligned (presumably guaranteed by the uvec8 type).
// width is reduced by 4 per half-pass; it should be a multiple of 4.
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7             \n"
    "psrlw     $0xf,%%xmm7               \n"  // xmm7 = 0x0001 in every word
    "pcmpeqb   %%xmm6,%%xmm6             \n"
    "psrlw     $0x8,%%xmm6               \n"  // xmm6 = 0x00ff in every word
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psllw     $0x8,%%xmm5               \n"  // xmm5 = 0xff00 in every word
    "pcmpeqb   %%xmm4,%%xmm4             \n"
    "pslld     $0x18,%%xmm4              \n"  // xmm4 = 0xff000000 per pixel

    // 8 pixel loop
    ".p2align  4                         \n"
  "1:                                    \n"
    // First 4 pixels.
    "movdqu    (%0),%%xmm3               \n"
    "movdqa    %%xmm3,%%xmm0             \n"
    "pxor      %%xmm4,%%xmm3             \n"  // top byte -> 255 - a0
    "pshufb    %4,%%xmm3                 \n"  // replicate (255 - a0) per word
    "movdqu    (%1),%%xmm2               \n"
    "pand      %%xmm6,%%xmm2             \n"  // even bytes of src1
    "paddw     %%xmm7,%%xmm3             \n"  // 256 - a0
    "pmullw    %%xmm3,%%xmm2             \n"
    "movdqu    (%1),%%xmm1               \n"
    "psrlw     $0x8,%%xmm1               \n"  // odd bytes of src1
    "por       %%xmm4,%%xmm0             \n"  // force top byte of src0 to 0xff
    "pmullw    %%xmm3,%%xmm1             \n"
    "lea       0x20(%0),%0               \n"
    "psrlw     $0x8,%%xmm2               \n"
    "paddusb   %%xmm2,%%xmm0             \n"
    "pand      %%xmm5,%%xmm1             \n"
    "paddusb   %%xmm1,%%xmm0             \n"
    "sub       $0x4,%3                   \n"
    "movdqa    %%xmm0,(%2)               \n"
    "jle       9f                        \n"
    // Second 4 pixels. The src_argb0 load is issued only after the exit test
    // so that no bytes past the end of the row are read when just 4 pixels
    // remained. %0 was already advanced by 0x20, hence the -0x10 offset.
    "movdqu    -0x10(%0),%%xmm3          \n"
    "movdqa    %%xmm3,%%xmm0             \n"
    "pxor      %%xmm4,%%xmm3             \n"
    "movdqu    0x10(%1),%%xmm2           \n"
    "pshufb    %4,%%xmm3                 \n"
    "pand      %%xmm6,%%xmm2             \n"
    "paddw     %%xmm7,%%xmm3             \n"
    "pmullw    %%xmm3,%%xmm2             \n"
    "movdqu    0x10(%1),%%xmm1           \n"
    "lea       0x20(%1),%1               \n"
    "psrlw     $0x8,%%xmm1               \n"
    "por       %%xmm4,%%xmm0             \n"
    "pmullw    %%xmm3,%%xmm1             \n"
    "psrlw     $0x8,%%xmm2               \n"
    "paddusb   %%xmm2,%%xmm0             \n"
    "pand      %%xmm5,%%xmm1             \n"
    "paddusb   %%xmm1,%%xmm0             \n"
    "sub       $0x4,%3                   \n"
    "movdqa    %%xmm0,0x10(%2)           \n"
    "lea       0x20(%2),%2               \n"
    "jg        1b                        \n"
  "9:                                    \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW_SSSE3
2324
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002325
#ifdef HAS_ARGBBLENDROW1_SSSE3
// Blends src_argb0 over src_argb1 into dst_argb one 32-bit pixel per pass.
// With a0 = the top byte of the src_argb0 pixel:
//   colour bytes: dst = saturate_u8(src0 + ((src1 * (256 - a0)) >> 8))
//   top byte:     dst = 0xff
// Pixels are moved with movd, so no pointer needs to be aligned. The loop
// always runs at least once and repeats while width, reduced by 1 per pass,
// stays positive.
// NOTE(review): kShuffleAlpha is not defined in this section; in this file
// it is defined under HAS_ARGBBLENDROW_SSSE3, so both macros presumably need
// to be enabled together - confirm.
void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7             \n"
    "psrlw     $0xf,%%xmm7               \n"  // xmm7 = 0x0001 in every word
    "pcmpeqb   %%xmm6,%%xmm6             \n"
    "psrlw     $0x8,%%xmm6               \n"  // xmm6 = 0x00ff in every word
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psllw     $0x8,%%xmm5               \n"  // xmm5 = 0xff00 in every word
    "pcmpeqb   %%xmm4,%%xmm4             \n"
    "pslld     $0x18,%%xmm4              \n"  // xmm4 = 0xff000000 per pixel

    // 1 pixel loop
    ".p2align  4                         \n"
  "1:                                    \n"
    "movd      (%0),%%xmm3               \n"
    "lea       0x4(%0),%0                \n"
    "movdqa    %%xmm3,%%xmm0             \n"
    "pxor      %%xmm4,%%xmm3             \n"  // top byte -> 255 - a0
    "movd      (%1),%%xmm2               \n"
    "pshufb    %4,%%xmm3                 \n"  // replicate (255 - a0) per word
    "pand      %%xmm6,%%xmm2             \n"  // even bytes of src1
    "paddw     %%xmm7,%%xmm3             \n"  // 256 - a0
    "pmullw    %%xmm3,%%xmm2             \n"
    "movd      (%1),%%xmm1               \n"
    "lea       0x4(%1),%1                \n"
    "psrlw     $0x8,%%xmm1               \n"  // odd bytes of src1
    "por       %%xmm4,%%xmm0             \n"  // force top byte of src0 to 0xff
    "pmullw    %%xmm3,%%xmm1             \n"
    "psrlw     $0x8,%%xmm2               \n"
    "paddusb   %%xmm2,%%xmm0             \n"
    "pand      %%xmm5,%%xmm1             \n"
    "paddusb   %%xmm1,%%xmm0             \n"
    "sub       $0x1,%3                   \n"  // sets the flags tested by jg
    "movd      %%xmm0,(%2)               \n"
    "lea       0x4(%2),%2                \n"
    "jg        1b                        \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW1_SSSE3
2377
#ifdef HAS_ARGBATTENUATE_SSE2
// Multiplies the three low bytes of every 32-bit pixel by that pixel's top
// (alpha) byte and keeps the top byte unchanged. Each byte v and the alpha a
// are widened to v * 0x101 and a * 0x101, multiplied with pmulhuw (high 16
// bits of the product) and shifted right by 8.
// 4 pixels per pass; the loop always runs at least once and repeats while
// width, reduced by 4 per pass, stays positive. Loads and stores use movdqa,
// so src_argb and dst_argb must be 16-byte aligned.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                     \n"  // %1 = dst_argb - src_argb
    "pcmpeqb   %%xmm4,%%xmm4             \n"
    "pslld     $0x18,%%xmm4              \n"  // xmm4 = 0xff000000 per pixel
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "psrld     $0x8,%%xmm5               \n"  // xmm5 = 0x00ffffff per pixel

    // 4 pixel loop
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "punpcklbw %%xmm0,%%xmm0             \n"  // pixels 0-1, bytes doubled
    "pshufhw   $0xff,%%xmm0,%%xmm2       \n"  // broadcast alpha word of
    "pshuflw   $0xff,%%xmm2,%%xmm2       \n"  // each pixel
    "pmulhuw   %%xmm2,%%xmm0             \n"
    "movdqa    (%0),%%xmm1               \n"
    "punpckhbw %%xmm1,%%xmm1             \n"  // pixels 2-3, bytes doubled
    "pshufhw   $0xff,%%xmm1,%%xmm2       \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2       \n"
    "pmulhuw   %%xmm2,%%xmm1             \n"
    "movdqa    (%0),%%xmm2               \n"
    "psrlw     $0x8,%%xmm0               \n"
    "pand      %%xmm4,%%xmm2             \n"  // original alpha bytes
    "psrlw     $0x8,%%xmm1               \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "pand      %%xmm5,%%xmm0             \n"  // drop the computed alpha lane
    "por       %%xmm2,%%xmm0             \n"  // restore original alpha
    "sub       $0x4,%2                   \n"  // sets the flags tested by jg
    "movdqa    %%xmm0,(%0,%1,1)          \n"
    "lea       0x10(%0),%0               \n"
    "jg        1b                        \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATE_SSE2
2424
#ifdef HAS_ARGBATTENUATE_SSSE3
// pshufb control masks that build, for two pixels each, three 16-bit words
// holding the pixel's alpha byte in both halves followed by one zero word
// (128u selects zero). kShuffleAlpha0 covers pixels 0-1 (alpha at bytes 3
// and 7), kShuffleAlpha1 covers pixels 2-3 (alpha at bytes 11 and 15).
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};

// Multiplies the three low bytes of every 32-bit pixel by that pixel's top
// (alpha) byte and keeps the top byte unchanged. The multiplier for the
// alpha lane is zero, and the original alpha is OR-ed back in afterwards.
// 4 pixels per pass; the loop always runs at least once and repeats while
// width, reduced by 4 per pass, stays positive. Loads and stores use movdqa,
// so src_argb and dst_argb must be 16-byte aligned.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                     \n"  // %1 = dst_argb - src_argb
    "pcmpeqb   %%xmm3,%%xmm3             \n"
    "pslld     $0x18,%%xmm3              \n"  // xmm3 = 0xff000000 per pixel
    "movdqa    %3,%%xmm4                 \n"
    "movdqa    %4,%%xmm5                 \n"

    // 4 pixel loop
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "pshufb    %%xmm4,%%xmm0             \n"  // alpha multipliers, pixels 0-1
    "movdqa    (%0),%%xmm1               \n"
    "punpcklbw %%xmm1,%%xmm1             \n"  // pixels 0-1, bytes doubled
    "pmulhuw   %%xmm1,%%xmm0             \n"
    "movdqa    (%0),%%xmm1               \n"
    "pshufb    %%xmm5,%%xmm1             \n"  // alpha multipliers, pixels 2-3
    "movdqa    (%0),%%xmm2               \n"
    "punpckhbw %%xmm2,%%xmm2             \n"  // pixels 2-3, bytes doubled
    "pmulhuw   %%xmm2,%%xmm1             \n"
    "movdqa    (%0),%%xmm2               \n"
    "pand      %%xmm3,%%xmm2             \n"  // original alpha bytes
    "psrlw     $0x8,%%xmm0               \n"
    "psrlw     $0x8,%%xmm1               \n"
    "packuswb  %%xmm1,%%xmm0             \n"
    "por       %%xmm2,%%xmm0             \n"  // restore original alpha
    "sub       $0x4,%2                   \n"  // sets the flags tested by jg
    "movdqa    %%xmm0,(%0,%1,1)          \n"
    "lea       0x10(%0),%0               \n"
    "jg        1b                        \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATE_SSSE3
2479
#ifdef HAS_ARGBUNATTENUATE_SSE2
// Scales the three low bytes of every 32-bit pixel by a per-alpha multiplier
// looked up in fixed_invtbl8 (32-bit entries indexed by the pixel's top
// byte) and keeps the top byte unchanged. Results are saturated to 8 bits by
// packuswb.
// NOTE(review): fixed_invtbl8 is defined outside this section; it presumably
// holds fixed-point reciprocals of alpha so that this undoes attenuation -
// confirm.
// 4 pixels per pass; the loop always runs at least once and repeats while
// width, reduced by 4 per pass, stays positive. Loads and stores use movdqa,
// so src_argb and dst_argb must be 16-byte aligned.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  // Scratch register that receives each pixel's alpha byte as a table index.
  uintptr_t alpha = 0;
  asm volatile (
    "sub       %0,%1                     \n"  // %1 = dst_argb - src_argb
    "pcmpeqb   %%xmm4,%%xmm4             \n"
    "pslld     $0x18,%%xmm4              \n"  // xmm4 = 0xff000000 per pixel

    // 4 pixel loop
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "movzb     0x3(%0),%3                \n"  // alpha of pixel 0
    "punpcklbw %%xmm0,%%xmm0             \n"  // pixels 0-1, bytes doubled
    "movd      0x0(%4,%3,4),%%xmm2       \n"
    "movzb     0x7(%0),%3                \n"  // alpha of pixel 1
    "movd      0x0(%4,%3,4),%%xmm3       \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2       \n"  // multiplier x3, then word 3
    "pshuflw   $0xc0,%%xmm3,%%xmm3       \n"  // (zero after movd) for alpha
    "movlhps   %%xmm3,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0             \n"
    "movdqa    (%0),%%xmm1               \n"
    "movzb     0xb(%0),%3                \n"  // alpha of pixel 2
    "punpckhbw %%xmm1,%%xmm1             \n"  // pixels 2-3, bytes doubled
    "movd      0x0(%4,%3,4),%%xmm2       \n"
    "movzb     0xf(%0),%3                \n"  // alpha of pixel 3
    "movd      0x0(%4,%3,4),%%xmm3       \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2       \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3       \n"
    "movlhps   %%xmm3,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1             \n"
    "movdqa    (%0),%%xmm2               \n"
    "pand      %%xmm4,%%xmm2             \n"  // original alpha bytes
    "packuswb  %%xmm1,%%xmm0             \n"
    "por       %%xmm2,%%xmm0             \n"  // restore original alpha
    "sub       $0x4,%2                   \n"  // sets the flags tested by jg
    "movdqa    %%xmm0,(%0,%1,1)          \n"
    "lea       0x10(%0),%0               \n"
    "jg        1b                        \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width),     // %2
    "+r"(alpha)      // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBUNATTENUATE_SSE2
2534
#ifdef HAS_ARGBGRAYROW_SSSE3
// Converts 8 ARGB pixels (32 bytes) per pass to gray, in place.
// For each pixel the four bytes are multiplied by the per-byte coefficients
// in kARGBToGray and summed (pmaddubsw + phaddw), the sum is shifted right
// by 7, and the resulting gray byte is written to the three low bytes. The
// top (alpha) byte of every pixel is preserved.
// NOTE(review): kARGBToGray is defined outside this section; its values are
// presumably luma weights scaled by 128 - confirm.
// The loop always runs at least once and repeats while width, reduced by 8
// per pass, stays positive. Loads and stores use movdqa, so dst_argb must be
// 16-byte aligned.
void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm4                 \n"  // xmm4 = gray coefficients
    "pcmpeqb   %%xmm5,%%xmm5             \n"
    "pslld     $0x18,%%xmm5              \n"  // xmm5 = 0xff000000 per pixel
    "pcmpeqb   %%xmm3,%%xmm3             \n"
    "psrld     $0x8,%%xmm3               \n"  // xmm3 = 0x00ffffff per pixel

    // 8 pixel loop
    ".p2align  4                         \n"
  "1:                                    \n"
    "movdqa    (%0),%%xmm0               \n"
    "movdqa    0x10(%0),%%xmm1           \n"
    "pmaddubsw %%xmm4,%%xmm0             \n"
    "pmaddubsw %%xmm4,%%xmm1             \n"
    "movdqa    (%0),%%xmm6               \n"
    "movdqa    0x10(%0),%%xmm7           \n"
    "pand      %%xmm5,%%xmm6             \n"  // original alpha, pixels 0-3
    "pand      %%xmm5,%%xmm7             \n"  // original alpha, pixels 4-7
    "phaddw    %%xmm1,%%xmm0             \n"  // one weighted sum per pixel
    "psrlw     $0x7,%%xmm0               \n"
    "packuswb  %%xmm0,%%xmm0             \n"  // 8 gray bytes
    "punpcklbw %%xmm0,%%xmm0             \n"  // each gray byte x2
    "movdqa    %%xmm0,%%xmm1             \n"
    "punpcklwd %%xmm0,%%xmm0             \n"  // gray x4 for pixels 0-3
    "punpckhwd %%xmm1,%%xmm1             \n"  // gray x4 for pixels 4-7
    "pand      %%xmm3,%%xmm0             \n"
    "pand      %%xmm3,%%xmm1             \n"
    "por       %%xmm6,%%xmm0             \n"  // restore original alpha
    "por       %%xmm7,%%xmm1             \n"
    "sub       $0x8,%1                   \n"  // sets the flags tested by jg
    "movdqa    %%xmm0,(%0)               \n"
    "movdqa    %%xmm1,0x10(%0)           \n"
    "lea       0x20(%0),%0               \n"
    "jg        1b                        \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  : "m"(kARGBToGray)  // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002582#endif // defined(__x86_64__) || defined(__i386__)
2583
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002584#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002585} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002586} // namespace libyuv
2587#endif