blob: d7d174bc23378c11843e3578fc2860d370dfae40 [file] [log] [blame]
/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com2b9c2102012-03-22 22:36:44 +000011#include "source/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
// This module is for GCC x86 and x64.
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
// CONST is the declaration prefix used by every lookup table in this file:
// it expands to nothing on Apple toolchains and to "static const" elsewhere.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000101// Shuffle table for converting ARGB to RGB24.
102CONST uvec8 kShuffleMaskARGBToRGB24 = {
103 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
104};
105
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000106// Shuffle table for converting ARGB to RAW.
107CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000108 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000109};
110
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000112 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000113 "pcmpeqb %%xmm5,%%xmm5 \n"
114 "pslld $0x18,%%xmm5 \n"
115 "1: \n"
116 "movq (%0),%%xmm0 \n"
117 "lea 0x8(%0),%0 \n"
118 "punpcklbw %%xmm0,%%xmm0 \n"
119 "movdqa %%xmm0,%%xmm1 \n"
120 "punpcklwd %%xmm0,%%xmm0 \n"
121 "punpckhwd %%xmm1,%%xmm1 \n"
122 "por %%xmm5,%%xmm0 \n"
123 "por %%xmm5,%%xmm1 \n"
124 "movdqa %%xmm0,(%1) \n"
125 "movdqa %%xmm1,0x10(%1) \n"
126 "lea 0x20(%1),%1 \n"
127 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000128 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000129 : "+r"(src_y), // %0
130 "+r"(dst_argb), // %1
131 "+r"(pix) // %2
132 :
133 : "memory", "cc"
134#if defined(__SSE2__)
135 , "xmm0", "xmm1", "xmm5"
136#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000137 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000138}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000139
140void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000141 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000142 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000143 "sub %0,%1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000144 "1: \n"
145 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000147 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000148 "movdqa %%xmm0,(%0,%1,1) \n"
149 "lea 0x10(%0),%0 \n"
150 "jg 1b \n"
151
fbarchard@google.comb6149762011-11-07 21:58:52 +0000152 : "+r"(src_abgr), // %0
153 "+r"(dst_argb), // %1
154 "+r"(pix) // %2
155 : "m"(kShuffleMaskABGRToARGB) // %3
156 : "memory", "cc"
157#if defined(__SSE2__)
158 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000159#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000160 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000161}
162
163void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000164 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000165 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000166 "sub %0,%1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000167 "1: \n"
168 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000169 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000170 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000171 "movdqa %%xmm0,(%0,%1,1) \n"
172 "lea 0x10(%0),%0 \n"
173 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000174 : "+r"(src_bgra), // %0
175 "+r"(dst_argb), // %1
176 "+r"(pix) // %2
177 : "m"(kShuffleMaskBGRAToARGB) // %3
178 : "memory", "cc"
179#if defined(__SSE2__)
180 , "xmm0", "xmm5"
181#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000182 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000183}
184
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000185void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000186 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000187 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
188 "pslld $0x18,%%xmm5 \n"
189 "movdqa %3,%%xmm4 \n"
190 "1: \n"
191 "movdqu (%0),%%xmm0 \n"
192 "movdqu 0x10(%0),%%xmm1 \n"
193 "movdqu 0x20(%0),%%xmm3 \n"
194 "lea 0x30(%0),%0 \n"
195 "movdqa %%xmm3,%%xmm2 \n"
196 "palignr $0x8,%%xmm1,%%xmm2 \n"
197 "pshufb %%xmm4,%%xmm2 \n"
198 "por %%xmm5,%%xmm2 \n"
199 "palignr $0xc,%%xmm0,%%xmm1 \n"
200 "pshufb %%xmm4,%%xmm0 \n"
201 "movdqa %%xmm2,0x20(%1) \n"
202 "por %%xmm5,%%xmm0 \n"
203 "pshufb %%xmm4,%%xmm1 \n"
204 "movdqa %%xmm0,(%1) \n"
205 "por %%xmm5,%%xmm1 \n"
206 "palignr $0x4,%%xmm3,%%xmm3 \n"
207 "pshufb %%xmm4,%%xmm3 \n"
208 "movdqa %%xmm1,0x10(%1) \n"
209 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000210 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000211 "movdqa %%xmm3,0x30(%1) \n"
212 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000213 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000214 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000215 "+r"(dst_argb), // %1
216 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000217 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000218 : "memory", "cc"
219#if defined(__SSE2__)
220 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
221#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000222 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000223}
224
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000225void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000226 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000227 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
228 "pslld $0x18,%%xmm5 \n"
229 "movdqa %3,%%xmm4 \n"
230 "1: \n"
231 "movdqu (%0),%%xmm0 \n"
232 "movdqu 0x10(%0),%%xmm1 \n"
233 "movdqu 0x20(%0),%%xmm3 \n"
234 "lea 0x30(%0),%0 \n"
235 "movdqa %%xmm3,%%xmm2 \n"
236 "palignr $0x8,%%xmm1,%%xmm2 \n"
237 "pshufb %%xmm4,%%xmm2 \n"
238 "por %%xmm5,%%xmm2 \n"
239 "palignr $0xc,%%xmm0,%%xmm1 \n"
240 "pshufb %%xmm4,%%xmm0 \n"
241 "movdqa %%xmm2,0x20(%1) \n"
242 "por %%xmm5,%%xmm0 \n"
243 "pshufb %%xmm4,%%xmm1 \n"
244 "movdqa %%xmm0,(%1) \n"
245 "por %%xmm5,%%xmm1 \n"
246 "palignr $0x4,%%xmm3,%%xmm3 \n"
247 "pshufb %%xmm4,%%xmm3 \n"
248 "movdqa %%xmm1,0x10(%1) \n"
249 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000250 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000251 "movdqa %%xmm3,0x30(%1) \n"
252 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000253 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000254 : "+r"(src_raw), // %0
255 "+r"(dst_argb), // %1
256 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000257 : "m"(kShuffleMaskRAWToARGB) // %3
258 : "memory", "cc"
259#if defined(__SSE2__)
260 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
261#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000262 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000263}
264
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000265void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000266 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000267 "mov $0x1080108,%%eax \n"
268 "movd %%eax,%%xmm5 \n"
269 "pshufd $0x0,%%xmm5,%%xmm5 \n"
270 "mov $0x20082008,%%eax \n"
271 "movd %%eax,%%xmm6 \n"
272 "pshufd $0x0,%%xmm6,%%xmm6 \n"
273 "pcmpeqb %%xmm3,%%xmm3 \n"
274 "psllw $0xb,%%xmm3 \n"
275 "pcmpeqb %%xmm4,%%xmm4 \n"
276 "psllw $0xa,%%xmm4 \n"
277 "psrlw $0x5,%%xmm4 \n"
278 "pcmpeqb %%xmm7,%%xmm7 \n"
279 "psllw $0x8,%%xmm7 \n"
280 "sub %0,%1 \n"
281 "sub %0,%1 \n"
282 "1: \n"
283 "movdqu (%0),%%xmm0 \n"
284 "movdqa %%xmm0,%%xmm1 \n"
285 "movdqa %%xmm0,%%xmm2 \n"
286 "pand %%xmm3,%%xmm1 \n"
287 "psllw $0xb,%%xmm2 \n"
288 "pmulhuw %%xmm5,%%xmm1 \n"
289 "pmulhuw %%xmm5,%%xmm2 \n"
290 "psllw $0x8,%%xmm1 \n"
291 "por %%xmm2,%%xmm1 \n"
292 "pand %%xmm4,%%xmm0 \n"
293 "pmulhuw %%xmm6,%%xmm0 \n"
294 "por %%xmm7,%%xmm0 \n"
295 "movdqa %%xmm1,%%xmm2 \n"
296 "punpcklbw %%xmm0,%%xmm1 \n"
297 "punpckhbw %%xmm0,%%xmm2 \n"
298 "movdqa %%xmm1,(%1,%0,2) \n"
299 "movdqa %%xmm2,0x10(%1,%0,2) \n"
300 "lea 0x10(%0),%0 \n"
301 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000302 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000303 : "+r"(src), // %0
304 "+r"(dst), // %1
305 "+r"(pix) // %2
306 :
307 : "memory", "cc", "eax"
308#if defined(__SSE2__)
309 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
310#endif
311 );
312}
313
314void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000315 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000316 "mov $0x1080108,%%eax \n"
317 "movd %%eax,%%xmm5 \n"
318 "pshufd $0x0,%%xmm5,%%xmm5 \n"
319 "mov $0x42004200,%%eax \n"
320 "movd %%eax,%%xmm6 \n"
321 "pshufd $0x0,%%xmm6,%%xmm6 \n"
322 "pcmpeqb %%xmm3,%%xmm3 \n"
323 "psllw $0xb,%%xmm3 \n"
324 "movdqa %%xmm3,%%xmm4 \n"
325 "psrlw $0x6,%%xmm4 \n"
326 "pcmpeqb %%xmm7,%%xmm7 \n"
327 "psllw $0x8,%%xmm7 \n"
328 "sub %0,%1 \n"
329 "sub %0,%1 \n"
330 "1: \n"
331 "movdqu (%0),%%xmm0 \n"
332 "movdqa %%xmm0,%%xmm1 \n"
333 "movdqa %%xmm0,%%xmm2 \n"
334 "psllw $0x1,%%xmm1 \n"
335 "psllw $0xb,%%xmm2 \n"
336 "pand %%xmm3,%%xmm1 \n"
337 "pmulhuw %%xmm5,%%xmm2 \n"
338 "pmulhuw %%xmm5,%%xmm1 \n"
339 "psllw $0x8,%%xmm1 \n"
340 "por %%xmm2,%%xmm1 \n"
341 "movdqa %%xmm0,%%xmm2 \n"
342 "pand %%xmm4,%%xmm0 \n"
343 "psraw $0x8,%%xmm2 \n"
344 "pmulhuw %%xmm6,%%xmm0 \n"
345 "pand %%xmm7,%%xmm2 \n"
346 "por %%xmm2,%%xmm0 \n"
347 "movdqa %%xmm1,%%xmm2 \n"
348 "punpcklbw %%xmm0,%%xmm1 \n"
349 "punpckhbw %%xmm0,%%xmm2 \n"
350 "movdqa %%xmm1,(%1,%0,2) \n"
351 "movdqa %%xmm2,0x10(%1,%0,2) \n"
352 "lea 0x10(%0),%0 \n"
353 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000354 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000355 : "+r"(src), // %0
356 "+r"(dst), // %1
357 "+r"(pix) // %2
358 :
359 : "memory", "cc", "eax"
360#if defined(__SSE2__)
361 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
362#endif
363 );
364}
365
366void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000367 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000368 "mov $0xf0f0f0f,%%eax \n"
369 "movd %%eax,%%xmm4 \n"
370 "pshufd $0x0,%%xmm4,%%xmm4 \n"
371 "movdqa %%xmm4,%%xmm5 \n"
372 "pslld $0x4,%%xmm5 \n"
373 "sub %0,%1 \n"
374 "sub %0,%1 \n"
375 "1: \n"
376 "movdqu (%0),%%xmm0 \n"
377 "movdqa %%xmm0,%%xmm2 \n"
378 "pand %%xmm4,%%xmm0 \n"
379 "pand %%xmm5,%%xmm2 \n"
380 "movdqa %%xmm0,%%xmm1 \n"
381 "movdqa %%xmm2,%%xmm3 \n"
382 "psllw $0x4,%%xmm1 \n"
383 "psrlw $0x4,%%xmm3 \n"
384 "por %%xmm1,%%xmm0 \n"
385 "por %%xmm3,%%xmm2 \n"
386 "movdqa %%xmm0,%%xmm1 \n"
387 "punpcklbw %%xmm2,%%xmm0 \n"
388 "punpckhbw %%xmm2,%%xmm1 \n"
389 "movdqa %%xmm0,(%1,%0,2) \n"
390 "movdqa %%xmm1,0x10(%1,%0,2) \n"
391 "lea 0x10(%0),%0 \n"
392 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000393 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000394 : "+r"(src), // %0
395 "+r"(dst), // %1
396 "+r"(pix) // %2
397 :
398 : "memory", "cc", "eax"
399#if defined(__SSE2__)
400 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
401#endif
402 );
403}
404
405void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000406 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000407 "movdqa %3,%%xmm6 \n"
408 "1: \n"
409 "movdqa (%0),%%xmm0 \n"
410 "movdqa 0x10(%0),%%xmm1 \n"
411 "movdqa 0x20(%0),%%xmm2 \n"
412 "movdqa 0x30(%0),%%xmm3 \n"
413 "lea 0x40(%0),%0 \n"
414 "pshufb %%xmm6,%%xmm0 \n"
415 "pshufb %%xmm6,%%xmm1 \n"
416 "pshufb %%xmm6,%%xmm2 \n"
417 "pshufb %%xmm6,%%xmm3 \n"
418 "movdqa %%xmm1,%%xmm4 \n"
419 "psrldq $0x4,%%xmm1 \n"
420 "pslldq $0xc,%%xmm4 \n"
421 "movdqa %%xmm2,%%xmm5 \n"
422 "por %%xmm4,%%xmm0 \n"
423 "pslldq $0x8,%%xmm5 \n"
424 "movdqa %%xmm0,(%1) \n"
425 "por %%xmm5,%%xmm1 \n"
426 "psrldq $0x8,%%xmm2 \n"
427 "pslldq $0x4,%%xmm3 \n"
428 "por %%xmm3,%%xmm2 \n"
429 "movdqa %%xmm1,0x10(%1) \n"
430 "movdqa %%xmm2,0x20(%1) \n"
431 "lea 0x30(%1),%1 \n"
432 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000433 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000434 : "+r"(src), // %0
435 "+r"(dst), // %1
436 "+r"(pix) // %2
437 : "m"(kShuffleMaskARGBToRGB24) // %3
438 : "memory", "cc"
439#if defined(__SSE2__)
440 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
441#endif
442 );
443}
444
445void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000446 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000447 "movdqa %3,%%xmm6 \n"
448 "1: \n"
449 "movdqa (%0),%%xmm0 \n"
450 "movdqa 0x10(%0),%%xmm1 \n"
451 "movdqa 0x20(%0),%%xmm2 \n"
452 "movdqa 0x30(%0),%%xmm3 \n"
453 "lea 0x40(%0),%0 \n"
454 "pshufb %%xmm6,%%xmm0 \n"
455 "pshufb %%xmm6,%%xmm1 \n"
456 "pshufb %%xmm6,%%xmm2 \n"
457 "pshufb %%xmm6,%%xmm3 \n"
458 "movdqa %%xmm1,%%xmm4 \n"
459 "psrldq $0x4,%%xmm1 \n"
460 "pslldq $0xc,%%xmm4 \n"
461 "movdqa %%xmm2,%%xmm5 \n"
462 "por %%xmm4,%%xmm0 \n"
463 "pslldq $0x8,%%xmm5 \n"
464 "movdqa %%xmm0,(%1) \n"
465 "por %%xmm5,%%xmm1 \n"
466 "psrldq $0x8,%%xmm2 \n"
467 "pslldq $0x4,%%xmm3 \n"
468 "por %%xmm3,%%xmm2 \n"
469 "movdqa %%xmm1,0x10(%1) \n"
470 "movdqa %%xmm2,0x20(%1) \n"
471 "lea 0x30(%1),%1 \n"
472 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000473 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000474 : "+r"(src), // %0
475 "+r"(dst), // %1
476 "+r"(pix) // %2
477 : "m"(kShuffleMaskARGBToRAW) // %3
478 : "memory", "cc"
479#if defined(__SSE2__)
480 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
481#endif
482 );
483}
484
485void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000486 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000487 "pcmpeqb %%xmm3,%%xmm3 \n"
488 "psrld $0x1b,%%xmm3 \n"
489 "pcmpeqb %%xmm4,%%xmm4 \n"
490 "psrld $0x1a,%%xmm4 \n"
491 "pslld $0x5,%%xmm4 \n"
492 "pcmpeqb %%xmm5,%%xmm5 \n"
493 "pslld $0xb,%%xmm5 \n"
494 "1: \n"
495 "movdqa (%0),%%xmm0 \n"
496 "movdqa %%xmm0,%%xmm1 \n"
497 "movdqa %%xmm0,%%xmm2 \n"
498 "pslld $0x8,%%xmm0 \n"
499 "psrld $0x3,%%xmm1 \n"
500 "psrld $0x5,%%xmm2 \n"
501 "psrad $0x10,%%xmm0 \n"
502 "pand %%xmm3,%%xmm1 \n"
503 "pand %%xmm4,%%xmm2 \n"
504 "pand %%xmm5,%%xmm0 \n"
505 "por %%xmm2,%%xmm1 \n"
506 "por %%xmm1,%%xmm0 \n"
507 "packssdw %%xmm0,%%xmm0 \n"
508 "lea 0x10(%0),%0 \n"
509 "movq %%xmm0,(%1) \n"
510 "lea 0x8(%1),%1 \n"
511 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000512 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000513 : "+r"(src), // %0
514 "+r"(dst), // %1
515 "+r"(pix) // %2
516 :
517 : "memory", "cc"
518#if defined(__SSE2__)
519 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
520#endif
521 );
522}
523
524void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000525 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000526 "pcmpeqb %%xmm4,%%xmm4 \n"
527 "psrld $0x1b,%%xmm4 \n"
528 "movdqa %%xmm4,%%xmm5 \n"
529 "pslld $0x5,%%xmm5 \n"
530 "movdqa %%xmm4,%%xmm6 \n"
531 "pslld $0xa,%%xmm6 \n"
532 "pcmpeqb %%xmm7,%%xmm7 \n"
533 "pslld $0xf,%%xmm7 \n"
534 "1: \n"
535 "movdqa (%0),%%xmm0 \n"
536 "movdqa %%xmm0,%%xmm1 \n"
537 "movdqa %%xmm0,%%xmm2 \n"
538 "movdqa %%xmm0,%%xmm3 \n"
539 "psrad $0x10,%%xmm0 \n"
540 "psrld $0x3,%%xmm1 \n"
541 "psrld $0x6,%%xmm2 \n"
542 "psrld $0x9,%%xmm3 \n"
543 "pand %%xmm7,%%xmm0 \n"
544 "pand %%xmm4,%%xmm1 \n"
545 "pand %%xmm5,%%xmm2 \n"
546 "pand %%xmm6,%%xmm3 \n"
547 "por %%xmm1,%%xmm0 \n"
548 "por %%xmm3,%%xmm2 \n"
549 "por %%xmm2,%%xmm0 \n"
550 "packssdw %%xmm0,%%xmm0 \n"
551 "lea 0x10(%0),%0 \n"
552 "movq %%xmm0,(%1) \n"
553 "lea 0x8(%1),%1 \n"
554 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000555 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000556 : "+r"(src), // %0
557 "+r"(dst), // %1
558 "+r"(pix) // %2
559 :
560 : "memory", "cc"
561#if defined(__SSE2__)
562 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
563#endif
564 );
565}
566
567void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000568 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000569 "pcmpeqb %%xmm4,%%xmm4 \n"
570 "psllw $0xc,%%xmm4 \n"
571 "movdqa %%xmm4,%%xmm3 \n"
572 "psrlw $0x8,%%xmm3 \n"
573 "1: \n"
574 "movdqa (%0),%%xmm0 \n"
575 "movdqa %%xmm0,%%xmm1 \n"
576 "pand %%xmm3,%%xmm0 \n"
577 "pand %%xmm4,%%xmm1 \n"
578 "psrlq $0x4,%%xmm0 \n"
579 "psrlq $0x8,%%xmm1 \n"
580 "por %%xmm1,%%xmm0 \n"
581 "packuswb %%xmm0,%%xmm0 \n"
582 "lea 0x10(%0),%0 \n"
583 "movq %%xmm0,(%1) \n"
584 "lea 0x8(%1),%1 \n"
585 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000586 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000587 : "+r"(src), // %0
588 "+r"(dst), // %1
589 "+r"(pix) // %2
590 :
591 : "memory", "cc"
592#if defined(__SSE2__)
593 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
594#endif
595 );
596}
597
fbarchard@google.comb6149762011-11-07 21:58:52 +0000598void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000599 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000600 "movdqa %4,%%xmm5 \n"
601 "movdqa %3,%%xmm4 \n"
602 "1: \n"
603 "movdqa (%0),%%xmm0 \n"
604 "movdqa 0x10(%0),%%xmm1 \n"
605 "movdqa 0x20(%0),%%xmm2 \n"
606 "movdqa 0x30(%0),%%xmm3 \n"
607 "pmaddubsw %%xmm4,%%xmm0 \n"
608 "pmaddubsw %%xmm4,%%xmm1 \n"
609 "pmaddubsw %%xmm4,%%xmm2 \n"
610 "pmaddubsw %%xmm4,%%xmm3 \n"
611 "lea 0x40(%0),%0 \n"
612 "phaddw %%xmm1,%%xmm0 \n"
613 "phaddw %%xmm3,%%xmm2 \n"
614 "psrlw $0x7,%%xmm0 \n"
615 "psrlw $0x7,%%xmm2 \n"
616 "packuswb %%xmm2,%%xmm0 \n"
617 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000618 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000619 "movdqa %%xmm0,(%1) \n"
620 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000621 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000622 : "+r"(src_argb), // %0
623 "+r"(dst_y), // %1
624 "+r"(pix) // %2
625 : "m"(kARGBToY), // %3
626 "m"(kAddY16) // %4
627 : "memory", "cc"
628#if defined(__SSE2__)
629 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
630#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000631 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000632}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000633
634void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000635 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000636 "movdqa %4,%%xmm5 \n"
637 "movdqa %3,%%xmm4 \n"
638 "1: \n"
639 "movdqu (%0),%%xmm0 \n"
640 "movdqu 0x10(%0),%%xmm1 \n"
641 "movdqu 0x20(%0),%%xmm2 \n"
642 "movdqu 0x30(%0),%%xmm3 \n"
643 "pmaddubsw %%xmm4,%%xmm0 \n"
644 "pmaddubsw %%xmm4,%%xmm1 \n"
645 "pmaddubsw %%xmm4,%%xmm2 \n"
646 "pmaddubsw %%xmm4,%%xmm3 \n"
647 "lea 0x40(%0),%0 \n"
648 "phaddw %%xmm1,%%xmm0 \n"
649 "phaddw %%xmm3,%%xmm2 \n"
650 "psrlw $0x7,%%xmm0 \n"
651 "psrlw $0x7,%%xmm2 \n"
652 "packuswb %%xmm2,%%xmm0 \n"
653 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000654 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000655 "movdqu %%xmm0,(%1) \n"
656 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000657 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000658 : "+r"(src_argb), // %0
659 "+r"(dst_y), // %1
660 "+r"(pix) // %2
661 : "m"(kARGBToY), // %3
662 "m"(kAddY16) // %4
663 : "memory", "cc"
664#if defined(__SSE2__)
665 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
666#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000667 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000668}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000669
fbarchard@google.com714050a2012-02-17 22:59:56 +0000670// TODO(fbarchard): pass xmm constants to single block of assembly.
671// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
672// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
673// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
674// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000675void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
676 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000677 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000678 "movdqa %0,%%xmm4 \n"
679 "movdqa %1,%%xmm3 \n"
680 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000681 :
682 : "m"(kARGBToU), // %0
683 "m"(kARGBToV), // %1
684 "m"(kAddUV128) // %2
685 :
686#if defined(__SSE2__)
687 "xmm3", "xmm4", "xmm5"
688#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000689 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000690 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000691 "sub %1,%2 \n"
692 "1: \n"
693 "movdqa (%0),%%xmm0 \n"
694 "movdqa 0x10(%0),%%xmm1 \n"
695 "movdqa 0x20(%0),%%xmm2 \n"
696 "movdqa 0x30(%0),%%xmm6 \n"
697 "pavgb (%0,%4,1),%%xmm0 \n"
698 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
699 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
700 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
701 "lea 0x40(%0),%0 \n"
702 "movdqa %%xmm0,%%xmm7 \n"
703 "shufps $0x88,%%xmm1,%%xmm0 \n"
704 "shufps $0xdd,%%xmm1,%%xmm7 \n"
705 "pavgb %%xmm7,%%xmm0 \n"
706 "movdqa %%xmm2,%%xmm7 \n"
707 "shufps $0x88,%%xmm6,%%xmm2 \n"
708 "shufps $0xdd,%%xmm6,%%xmm7 \n"
709 "pavgb %%xmm7,%%xmm2 \n"
710 "movdqa %%xmm0,%%xmm1 \n"
711 "movdqa %%xmm2,%%xmm6 \n"
712 "pmaddubsw %%xmm4,%%xmm0 \n"
713 "pmaddubsw %%xmm4,%%xmm2 \n"
714 "pmaddubsw %%xmm3,%%xmm1 \n"
715 "pmaddubsw %%xmm3,%%xmm6 \n"
716 "phaddw %%xmm2,%%xmm0 \n"
717 "phaddw %%xmm6,%%xmm1 \n"
718 "psraw $0x8,%%xmm0 \n"
719 "psraw $0x8,%%xmm1 \n"
720 "packsswb %%xmm1,%%xmm0 \n"
721 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000722 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000723 "movlps %%xmm0,(%1) \n"
724 "movhps %%xmm0,(%1,%2,1) \n"
725 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000726 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000727 : "+r"(src_argb0), // %0
728 "+r"(dst_u), // %1
729 "+r"(dst_v), // %2
730 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000731 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000732 : "memory", "cc"
733#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000734 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000735#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000736 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000737}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000738
739void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
740 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000741 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000742 "movdqa %0,%%xmm4 \n"
743 "movdqa %1,%%xmm3 \n"
744 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000745 :
746 : "m"(kARGBToU), // %0
747 "m"(kARGBToV), // %1
748 "m"(kAddUV128) // %2
749 :
750#if defined(__SSE2__)
751 "xmm3", "xmm4", "xmm5"
752#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000753 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000754 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000755 "sub %1,%2 \n"
756 "1: \n"
757 "movdqu (%0),%%xmm0 \n"
758 "movdqu 0x10(%0),%%xmm1 \n"
759 "movdqu 0x20(%0),%%xmm2 \n"
760 "movdqu 0x30(%0),%%xmm6 \n"
761 "movdqu (%0,%4,1),%%xmm7 \n"
762 "pavgb %%xmm7,%%xmm0 \n"
763 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
764 "pavgb %%xmm7,%%xmm1 \n"
765 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
766 "pavgb %%xmm7,%%xmm2 \n"
767 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
768 "pavgb %%xmm7,%%xmm6 \n"
769 "lea 0x40(%0),%0 \n"
770 "movdqa %%xmm0,%%xmm7 \n"
771 "shufps $0x88,%%xmm1,%%xmm0 \n"
772 "shufps $0xdd,%%xmm1,%%xmm7 \n"
773 "pavgb %%xmm7,%%xmm0 \n"
774 "movdqa %%xmm2,%%xmm7 \n"
775 "shufps $0x88,%%xmm6,%%xmm2 \n"
776 "shufps $0xdd,%%xmm6,%%xmm7 \n"
777 "pavgb %%xmm7,%%xmm2 \n"
778 "movdqa %%xmm0,%%xmm1 \n"
779 "movdqa %%xmm2,%%xmm6 \n"
780 "pmaddubsw %%xmm4,%%xmm0 \n"
781 "pmaddubsw %%xmm4,%%xmm2 \n"
782 "pmaddubsw %%xmm3,%%xmm1 \n"
783 "pmaddubsw %%xmm3,%%xmm6 \n"
784 "phaddw %%xmm2,%%xmm0 \n"
785 "phaddw %%xmm6,%%xmm1 \n"
786 "psraw $0x8,%%xmm0 \n"
787 "psraw $0x8,%%xmm1 \n"
788 "packsswb %%xmm1,%%xmm0 \n"
789 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000790 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000791 "movlps %%xmm0,(%1) \n"
792 "movhps %%xmm0,(%1,%2,1) \n"
793 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000794 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000795 : "+r"(src_argb0), // %0
796 "+r"(dst_u), // %1
797 "+r"(dst_v), // %2
798 "+rm"(width) // %3
799 : "r"(static_cast<intptr_t>(src_stride_argb))
800 : "memory", "cc"
801#if defined(__SSE2__)
802 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
803#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000804 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000805}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000806
fbarchard@google.com714050a2012-02-17 22:59:56 +0000807void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000808 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000809 "movdqa %4,%%xmm5 \n"
810 "movdqa %3,%%xmm4 \n"
811 "1: \n"
812 "movdqa (%0),%%xmm0 \n"
813 "movdqa 0x10(%0),%%xmm1 \n"
814 "movdqa 0x20(%0),%%xmm2 \n"
815 "movdqa 0x30(%0),%%xmm3 \n"
816 "pmaddubsw %%xmm4,%%xmm0 \n"
817 "pmaddubsw %%xmm4,%%xmm1 \n"
818 "pmaddubsw %%xmm4,%%xmm2 \n"
819 "pmaddubsw %%xmm4,%%xmm3 \n"
820 "lea 0x40(%0),%0 \n"
821 "phaddw %%xmm1,%%xmm0 \n"
822 "phaddw %%xmm3,%%xmm2 \n"
823 "psrlw $0x7,%%xmm0 \n"
824 "psrlw $0x7,%%xmm2 \n"
825 "packuswb %%xmm2,%%xmm0 \n"
826 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000827 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000828 "movdqa %%xmm0,(%1) \n"
829 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000830 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000831 : "+r"(src_bgra), // %0
832 "+r"(dst_y), // %1
833 "+r"(pix) // %2
834 : "m"(kBGRAToY), // %3
835 "m"(kAddY16) // %4
836 : "memory", "cc"
837#if defined(__SSE2__)
838 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000839#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000840 );
841}
842
843void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000844 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000845 "movdqa %4,%%xmm5 \n"
846 "movdqa %3,%%xmm4 \n"
847 "1: \n"
848 "movdqu (%0),%%xmm0 \n"
849 "movdqu 0x10(%0),%%xmm1 \n"
850 "movdqu 0x20(%0),%%xmm2 \n"
851 "movdqu 0x30(%0),%%xmm3 \n"
852 "pmaddubsw %%xmm4,%%xmm0 \n"
853 "pmaddubsw %%xmm4,%%xmm1 \n"
854 "pmaddubsw %%xmm4,%%xmm2 \n"
855 "pmaddubsw %%xmm4,%%xmm3 \n"
856 "lea 0x40(%0),%0 \n"
857 "phaddw %%xmm1,%%xmm0 \n"
858 "phaddw %%xmm3,%%xmm2 \n"
859 "psrlw $0x7,%%xmm0 \n"
860 "psrlw $0x7,%%xmm2 \n"
861 "packuswb %%xmm2,%%xmm0 \n"
862 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000863 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000864 "movdqu %%xmm0,(%1) \n"
865 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000866 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000867 : "+r"(src_bgra), // %0
868 "+r"(dst_y), // %1
869 "+r"(pix) // %2
870 : "m"(kBGRAToY), // %3
871 "m"(kAddY16) // %4
872 : "memory", "cc"
873#if defined(__SSE2__)
874 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
875#endif
876 );
877}
878
879void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
880 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000881 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000882 "movdqa %0,%%xmm4 \n"
883 "movdqa %1,%%xmm3 \n"
884 "movdqa %2,%%xmm5 \n"
885 :
886 : "m"(kBGRAToU), // %0
887 "m"(kBGRAToV), // %1
888 "m"(kAddUV128) // %2
889 :
890#if defined(__SSE2__)
891 "xmm3", "xmm4", "xmm5"
892#endif
893 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000894 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000895 "sub %1,%2 \n"
896 "1: \n"
897 "movdqa (%0),%%xmm0 \n"
898 "movdqa 0x10(%0),%%xmm1 \n"
899 "movdqa 0x20(%0),%%xmm2 \n"
900 "movdqa 0x30(%0),%%xmm6 \n"
901 "pavgb (%0,%4,1),%%xmm0 \n"
902 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
903 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
904 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
905 "lea 0x40(%0),%0 \n"
906 "movdqa %%xmm0,%%xmm7 \n"
907 "shufps $0x88,%%xmm1,%%xmm0 \n"
908 "shufps $0xdd,%%xmm1,%%xmm7 \n"
909 "pavgb %%xmm7,%%xmm0 \n"
910 "movdqa %%xmm2,%%xmm7 \n"
911 "shufps $0x88,%%xmm6,%%xmm2 \n"
912 "shufps $0xdd,%%xmm6,%%xmm7 \n"
913 "pavgb %%xmm7,%%xmm2 \n"
914 "movdqa %%xmm0,%%xmm1 \n"
915 "movdqa %%xmm2,%%xmm6 \n"
916 "pmaddubsw %%xmm4,%%xmm0 \n"
917 "pmaddubsw %%xmm4,%%xmm2 \n"
918 "pmaddubsw %%xmm3,%%xmm1 \n"
919 "pmaddubsw %%xmm3,%%xmm6 \n"
920 "phaddw %%xmm2,%%xmm0 \n"
921 "phaddw %%xmm6,%%xmm1 \n"
922 "psraw $0x8,%%xmm0 \n"
923 "psraw $0x8,%%xmm1 \n"
924 "packsswb %%xmm1,%%xmm0 \n"
925 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000926 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000927 "movlps %%xmm0,(%1) \n"
928 "movhps %%xmm0,(%1,%2,1) \n"
929 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000930 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000931 : "+r"(src_bgra0), // %0
932 "+r"(dst_u), // %1
933 "+r"(dst_v), // %2
934 "+rm"(width) // %3
935 : "r"(static_cast<intptr_t>(src_stride_bgra))
936 : "memory", "cc"
937#if defined(__SSE2__)
938 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
939#endif
940 );
941}
942
943void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
944 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000945 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000946 "movdqa %0,%%xmm4 \n"
947 "movdqa %1,%%xmm3 \n"
948 "movdqa %2,%%xmm5 \n"
949 :
950 : "m"(kBGRAToU), // %0
951 "m"(kBGRAToV), // %1
952 "m"(kAddUV128) // %2
953 :
954#if defined(__SSE2__)
955 "xmm3", "xmm4", "xmm5"
956#endif
957 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000958 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000959 "sub %1,%2 \n"
960 "1: \n"
961 "movdqu (%0),%%xmm0 \n"
962 "movdqu 0x10(%0),%%xmm1 \n"
963 "movdqu 0x20(%0),%%xmm2 \n"
964 "movdqu 0x30(%0),%%xmm6 \n"
965 "movdqu (%0,%4,1),%%xmm7 \n"
966 "pavgb %%xmm7,%%xmm0 \n"
967 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
968 "pavgb %%xmm7,%%xmm1 \n"
969 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
970 "pavgb %%xmm7,%%xmm2 \n"
971 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
972 "pavgb %%xmm7,%%xmm6 \n"
973 "lea 0x40(%0),%0 \n"
974 "movdqa %%xmm0,%%xmm7 \n"
975 "shufps $0x88,%%xmm1,%%xmm0 \n"
976 "shufps $0xdd,%%xmm1,%%xmm7 \n"
977 "pavgb %%xmm7,%%xmm0 \n"
978 "movdqa %%xmm2,%%xmm7 \n"
979 "shufps $0x88,%%xmm6,%%xmm2 \n"
980 "shufps $0xdd,%%xmm6,%%xmm7 \n"
981 "pavgb %%xmm7,%%xmm2 \n"
982 "movdqa %%xmm0,%%xmm1 \n"
983 "movdqa %%xmm2,%%xmm6 \n"
984 "pmaddubsw %%xmm4,%%xmm0 \n"
985 "pmaddubsw %%xmm4,%%xmm2 \n"
986 "pmaddubsw %%xmm3,%%xmm1 \n"
987 "pmaddubsw %%xmm3,%%xmm6 \n"
988 "phaddw %%xmm2,%%xmm0 \n"
989 "phaddw %%xmm6,%%xmm1 \n"
990 "psraw $0x8,%%xmm0 \n"
991 "psraw $0x8,%%xmm1 \n"
992 "packsswb %%xmm1,%%xmm0 \n"
993 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000994 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000995 "movlps %%xmm0,(%1) \n"
996 "movhps %%xmm0,(%1,%2,1) \n"
997 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000998 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000999 : "+r"(src_bgra0), // %0
1000 "+r"(dst_u), // %1
1001 "+r"(dst_v), // %2
1002 "+rm"(width) // %3
1003 : "r"(static_cast<intptr_t>(src_stride_bgra))
1004 : "memory", "cc"
1005#if defined(__SSE2__)
1006 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1007#endif
1008 );
1009}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001010
1011void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001012 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001013 "movdqa %4,%%xmm5 \n"
1014 "movdqa %3,%%xmm4 \n"
1015 "1: \n"
1016 "movdqa (%0),%%xmm0 \n"
1017 "movdqa 0x10(%0),%%xmm1 \n"
1018 "movdqa 0x20(%0),%%xmm2 \n"
1019 "movdqa 0x30(%0),%%xmm3 \n"
1020 "pmaddubsw %%xmm4,%%xmm0 \n"
1021 "pmaddubsw %%xmm4,%%xmm1 \n"
1022 "pmaddubsw %%xmm4,%%xmm2 \n"
1023 "pmaddubsw %%xmm4,%%xmm3 \n"
1024 "lea 0x40(%0),%0 \n"
1025 "phaddw %%xmm1,%%xmm0 \n"
1026 "phaddw %%xmm3,%%xmm2 \n"
1027 "psrlw $0x7,%%xmm0 \n"
1028 "psrlw $0x7,%%xmm2 \n"
1029 "packuswb %%xmm2,%%xmm0 \n"
1030 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001031 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001032 "movdqa %%xmm0,(%1) \n"
1033 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001034 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001035 : "+r"(src_abgr), // %0
1036 "+r"(dst_y), // %1
1037 "+r"(pix) // %2
1038 : "m"(kABGRToY), // %3
1039 "m"(kAddY16) // %4
1040 : "memory", "cc"
1041#if defined(__SSE2__)
1042 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1043#endif
1044 );
1045}
1046
1047void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001048 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001049 "movdqa %4,%%xmm5 \n"
1050 "movdqa %3,%%xmm4 \n"
1051 "1: \n"
1052 "movdqu (%0),%%xmm0 \n"
1053 "movdqu 0x10(%0),%%xmm1 \n"
1054 "movdqu 0x20(%0),%%xmm2 \n"
1055 "movdqu 0x30(%0),%%xmm3 \n"
1056 "pmaddubsw %%xmm4,%%xmm0 \n"
1057 "pmaddubsw %%xmm4,%%xmm1 \n"
1058 "pmaddubsw %%xmm4,%%xmm2 \n"
1059 "pmaddubsw %%xmm4,%%xmm3 \n"
1060 "lea 0x40(%0),%0 \n"
1061 "phaddw %%xmm1,%%xmm0 \n"
1062 "phaddw %%xmm3,%%xmm2 \n"
1063 "psrlw $0x7,%%xmm0 \n"
1064 "psrlw $0x7,%%xmm2 \n"
1065 "packuswb %%xmm2,%%xmm0 \n"
1066 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001067 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001068 "movdqu %%xmm0,(%1) \n"
1069 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001070 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001071 : "+r"(src_abgr), // %0
1072 "+r"(dst_y), // %1
1073 "+r"(pix) // %2
1074 : "m"(kABGRToY), // %3
1075 "m"(kAddY16) // %4
1076 : "memory", "cc"
1077#if defined(__SSE2__)
1078 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1079#endif
1080 );
1081}
1082
1083void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1084 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001085 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001086 "movdqa %0,%%xmm4 \n"
1087 "movdqa %1,%%xmm3 \n"
1088 "movdqa %2,%%xmm5 \n"
1089 :
1090 : "m"(kABGRToU), // %0
1091 "m"(kABGRToV), // %1
1092 "m"(kAddUV128) // %2
1093 :
1094#if defined(__SSE2__)
1095 "xmm3", "xmm4", "xmm5"
1096#endif
1097 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001098 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001099 "sub %1,%2 \n"
1100 "1: \n"
1101 "movdqa (%0),%%xmm0 \n"
1102 "movdqa 0x10(%0),%%xmm1 \n"
1103 "movdqa 0x20(%0),%%xmm2 \n"
1104 "movdqa 0x30(%0),%%xmm6 \n"
1105 "pavgb (%0,%4,1),%%xmm0 \n"
1106 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1107 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1108 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1109 "lea 0x40(%0),%0 \n"
1110 "movdqa %%xmm0,%%xmm7 \n"
1111 "shufps $0x88,%%xmm1,%%xmm0 \n"
1112 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1113 "pavgb %%xmm7,%%xmm0 \n"
1114 "movdqa %%xmm2,%%xmm7 \n"
1115 "shufps $0x88,%%xmm6,%%xmm2 \n"
1116 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1117 "pavgb %%xmm7,%%xmm2 \n"
1118 "movdqa %%xmm0,%%xmm1 \n"
1119 "movdqa %%xmm2,%%xmm6 \n"
1120 "pmaddubsw %%xmm4,%%xmm0 \n"
1121 "pmaddubsw %%xmm4,%%xmm2 \n"
1122 "pmaddubsw %%xmm3,%%xmm1 \n"
1123 "pmaddubsw %%xmm3,%%xmm6 \n"
1124 "phaddw %%xmm2,%%xmm0 \n"
1125 "phaddw %%xmm6,%%xmm1 \n"
1126 "psraw $0x8,%%xmm0 \n"
1127 "psraw $0x8,%%xmm1 \n"
1128 "packsswb %%xmm1,%%xmm0 \n"
1129 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001130 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001131 "movlps %%xmm0,(%1) \n"
1132 "movhps %%xmm0,(%1,%2,1) \n"
1133 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001134 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001135 : "+r"(src_abgr0), // %0
1136 "+r"(dst_u), // %1
1137 "+r"(dst_v), // %2
1138 "+rm"(width) // %3
1139 : "r"(static_cast<intptr_t>(src_stride_abgr))
1140 : "memory", "cc"
1141#if defined(__SSE2__)
1142 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1143#endif
1144 );
1145}
1146
1147void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1148 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001149 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001150 "movdqa %0,%%xmm4 \n"
1151 "movdqa %1,%%xmm3 \n"
1152 "movdqa %2,%%xmm5 \n"
1153 :
1154 : "m"(kABGRToU), // %0
1155 "m"(kABGRToV), // %1
1156 "m"(kAddUV128) // %2
1157 :
1158#if defined(__SSE2__)
1159 "xmm3", "xmm4", "xmm5"
1160#endif
1161 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001162 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001163 "sub %1,%2 \n"
1164 "1: \n"
1165 "movdqu (%0),%%xmm0 \n"
1166 "movdqu 0x10(%0),%%xmm1 \n"
1167 "movdqu 0x20(%0),%%xmm2 \n"
1168 "movdqu 0x30(%0),%%xmm6 \n"
1169 "movdqu (%0,%4,1),%%xmm7 \n"
1170 "pavgb %%xmm7,%%xmm0 \n"
1171 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1172 "pavgb %%xmm7,%%xmm1 \n"
1173 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1174 "pavgb %%xmm7,%%xmm2 \n"
1175 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1176 "pavgb %%xmm7,%%xmm6 \n"
1177 "lea 0x40(%0),%0 \n"
1178 "movdqa %%xmm0,%%xmm7 \n"
1179 "shufps $0x88,%%xmm1,%%xmm0 \n"
1180 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1181 "pavgb %%xmm7,%%xmm0 \n"
1182 "movdqa %%xmm2,%%xmm7 \n"
1183 "shufps $0x88,%%xmm6,%%xmm2 \n"
1184 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1185 "pavgb %%xmm7,%%xmm2 \n"
1186 "movdqa %%xmm0,%%xmm1 \n"
1187 "movdqa %%xmm2,%%xmm6 \n"
1188 "pmaddubsw %%xmm4,%%xmm0 \n"
1189 "pmaddubsw %%xmm4,%%xmm2 \n"
1190 "pmaddubsw %%xmm3,%%xmm1 \n"
1191 "pmaddubsw %%xmm3,%%xmm6 \n"
1192 "phaddw %%xmm2,%%xmm0 \n"
1193 "phaddw %%xmm6,%%xmm1 \n"
1194 "psraw $0x8,%%xmm0 \n"
1195 "psraw $0x8,%%xmm1 \n"
1196 "packsswb %%xmm1,%%xmm0 \n"
1197 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001198 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001199 "movlps %%xmm0,(%1) \n"
1200 "movhps %%xmm0,(%1,%2,1) \n"
1201 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001202 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001203 : "+r"(src_abgr0), // %0
1204 "+r"(dst_u), // %1
1205 "+r"(dst_v), // %2
1206 "+rm"(width) // %3
1207 : "r"(static_cast<intptr_t>(src_stride_abgr))
1208 : "memory", "cc"
1209#if defined(__SSE2__)
1210 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1211#endif
1212 );
1213}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001214
1215#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001216
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001217#ifdef HAS_I420TOARGBROW_SSSE3
// YUV -> RGB coefficients in 6-bit fixed point (value * 64). Negative
// literals and compound expressions are parenthesized so the macros expand
// safely inside larger expressions.
#define UB 127 /* 2.018 * 64 is about 129; clamped to the int8 maximum, 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V contributions at the value 128, per output channel.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001232
// OMITFP is placed on the YUV -> RGB row functions below. It expands to
// nothing on Apple and x86-64 targets; elsewhere it asks GCC to compile the
// function without a frame pointer -- presumably to free a general register
// for the six asm operands those functions use on 32-bit x86.
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001238
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001239struct {
1240 vec8 kUVToB;
1241 vec8 kUVToG;
1242 vec8 kUVToR;
1243 vec16 kUVBiasB;
1244 vec16 kUVBiasG;
1245 vec16 kUVBiasR;
1246 vec16 kYSub16;
1247 vec16 kYToRgb;
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001248} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001249 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1250 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1251 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1252 { BB, BB, BB, BB, BB, BB, BB, BB },
1253 { BG, BG, BG, BG, BG, BG, BG, BG },
1254 { BR, BR, BR, BR, BR, BR, BR, BR },
1255 { 16, 16, 16, 16, 16, 16, 16, 16 },
1256 { YG, YG, YG, YG, YG, YG, YG, YG }
1257};
1258
// Convert 8 pixels: asm fragment shared by the I420 -> RGB row functions.
//
// Expects, from the enclosing asm statement:
//   %0  pointer to Y; 8 bytes are read and the pointer advances by 8.
//   %1  pointer to U; 4 bytes are read and the pointer advances by 4.
//   %2  offset of V relative to U (v_buf - u_buf); 4 bytes are read.
//   %5  pointer to kYuvConstants.
//   xmm4 must be zero (used to widen Y bytes to 16 bits).
// Each U,V pair is duplicated so it covers two horizontally adjacent pixels.
// Leaves 8 unsigned bytes in the low half of xmm0 (blue), xmm1 (green) and
// xmm2 (red); also overwrites xmm3.
#define YUVTORGB \
    "movd (%1),%%xmm0 \n" \
    "movd (%1,%2,1),%%xmm1 \n" \
    "lea 0x4(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001288
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001289void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
1290 const uint8* u_buf,
1291 const uint8* v_buf,
1292 uint8* rgb_buf,
1293 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001294 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001295 "sub %1,%2 \n"
1296 "pcmpeqb %%xmm5,%%xmm5 \n"
1297 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001298 "1: \n"
1299 YUVTORGB
1300 "punpcklbw %%xmm1,%%xmm0 \n"
1301 "punpcklbw %%xmm5,%%xmm2 \n"
1302 "movdqa %%xmm0,%%xmm1 \n"
1303 "punpcklwd %%xmm2,%%xmm0 \n"
1304 "punpckhwd %%xmm2,%%xmm1 \n"
1305 "movdqa %%xmm0,(%3) \n"
1306 "movdqa %%xmm1,0x10(%3) \n"
1307 "lea 0x20(%3),%3 \n"
1308 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001309 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001310 : "+r"(y_buf), // %0
1311 "+r"(u_buf), // %1
1312 "+r"(v_buf), // %2
1313 "+r"(rgb_buf), // %3
1314 "+rm"(width) // %4
1315 : "r"(&kYuvConstants.kUVToB) // %5
1316 : "memory", "cc"
1317#if defined(__SSE2__)
1318 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1319#endif
1320 );
1321}
1322
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001323void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
1324 const uint8* u_buf,
1325 const uint8* v_buf,
1326 uint8* rgb_buf,
1327 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001328 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001329 "sub %1,%2 \n"
1330 "pcmpeqb %%xmm5,%%xmm5 \n"
1331 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001332 "1: \n"
1333 YUVTORGB
1334 "pcmpeqb %%xmm5,%%xmm5 \n"
1335 "punpcklbw %%xmm0,%%xmm1 \n"
1336 "punpcklbw %%xmm2,%%xmm5 \n"
1337 "movdqa %%xmm5,%%xmm0 \n"
1338 "punpcklwd %%xmm1,%%xmm5 \n"
1339 "punpckhwd %%xmm1,%%xmm0 \n"
1340 "movdqa %%xmm5,(%3) \n"
1341 "movdqa %%xmm0,0x10(%3) \n"
1342 "lea 0x20(%3),%3 \n"
1343 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001344 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001345 : "+r"(y_buf), // %0
1346 "+r"(u_buf), // %1
1347 "+r"(v_buf), // %2
1348 "+r"(rgb_buf), // %3
1349 "+rm"(width) // %4
1350 : "r"(&kYuvConstants.kUVToB) // %5
1351 : "memory", "cc"
1352#if defined(__SSE2__)
1353 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1354#endif
1355 );
1356}
1357
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001358void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
1359 const uint8* u_buf,
1360 const uint8* v_buf,
1361 uint8* rgb_buf,
1362 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001363 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001364 "sub %1,%2 \n"
1365 "pcmpeqb %%xmm5,%%xmm5 \n"
1366 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001367 "1: \n"
1368 YUVTORGB
1369 "punpcklbw %%xmm1,%%xmm2 \n"
1370 "punpcklbw %%xmm5,%%xmm0 \n"
1371 "movdqa %%xmm2,%%xmm1 \n"
1372 "punpcklwd %%xmm0,%%xmm2 \n"
1373 "punpckhwd %%xmm0,%%xmm1 \n"
1374 "movdqa %%xmm2,(%3) \n"
1375 "movdqa %%xmm1,0x10(%3) \n"
1376 "lea 0x20(%3),%3 \n"
1377 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001378 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001379 : "+r"(y_buf), // %0
1380 "+r"(u_buf), // %1
1381 "+r"(v_buf), // %2
1382 "+r"(rgb_buf), // %3
1383 "+rm"(width) // %4
1384 : "r"(&kYuvConstants.kUVToB) // %5
1385 : "memory", "cc"
1386#if defined(__SSE2__)
1387 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1388#endif
1389 );
1390}
1391
fbarchard@google.com952a5072012-03-30 18:10:50 +00001392void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1393 const uint8* u_buf,
1394 const uint8* v_buf,
1395 uint8* rgb_buf,
1396 int width) {
1397 asm volatile (
1398 "sub %1,%2 \n"
1399 "pcmpeqb %%xmm5,%%xmm5 \n"
1400 "pxor %%xmm4,%%xmm4 \n"
1401 "1: \n"
1402 YUVTORGB
1403 "punpcklbw %%xmm1,%%xmm0 \n"
1404 "punpcklbw %%xmm5,%%xmm2 \n"
1405 "movdqa %%xmm0,%%xmm1 \n"
1406 "punpcklwd %%xmm2,%%xmm0 \n"
1407 "punpckhwd %%xmm2,%%xmm1 \n"
1408 "movdqu %%xmm0,(%3) \n"
1409 "movdqu %%xmm1,0x10(%3) \n"
1410 "lea 0x20(%3),%3 \n"
1411 "sub $0x8,%4 \n"
1412 "jg 1b \n"
1413 : "+r"(y_buf), // %0
1414 "+r"(u_buf), // %1
1415 "+r"(v_buf), // %2
1416 "+r"(rgb_buf), // %3
1417 "+rm"(width) // %4
1418 : "r"(&kYuvConstants.kUVToB) // %5
1419 : "memory", "cc"
1420#if defined(__SSE2__)
1421 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1422#endif
1423 );
1424}
1425
1426void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
1427 const uint8* u_buf,
1428 const uint8* v_buf,
1429 uint8* rgb_buf,
1430 int width) {
1431 asm volatile (
1432 "sub %1,%2 \n"
1433 "pcmpeqb %%xmm5,%%xmm5 \n"
1434 "pxor %%xmm4,%%xmm4 \n"
1435 "1: \n"
1436 YUVTORGB
1437 "pcmpeqb %%xmm5,%%xmm5 \n"
1438 "punpcklbw %%xmm0,%%xmm1 \n"
1439 "punpcklbw %%xmm2,%%xmm5 \n"
1440 "movdqa %%xmm5,%%xmm0 \n"
1441 "punpcklwd %%xmm1,%%xmm5 \n"
1442 "punpckhwd %%xmm1,%%xmm0 \n"
1443 "movdqu %%xmm5,(%3) \n"
1444 "movdqu %%xmm0,0x10(%3) \n"
1445 "lea 0x20(%3),%3 \n"
1446 "sub $0x8,%4 \n"
1447 "jg 1b \n"
1448 : "+r"(y_buf), // %0
1449 "+r"(u_buf), // %1
1450 "+r"(v_buf), // %2
1451 "+r"(rgb_buf), // %3
1452 "+rm"(width) // %4
1453 : "r"(&kYuvConstants.kUVToB) // %5
1454 : "memory", "cc"
1455#if defined(__SSE2__)
1456 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1457#endif
1458 );
1459}
1460
1461void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
1462 const uint8* u_buf,
1463 const uint8* v_buf,
1464 uint8* rgb_buf,
1465 int width) {
1466 asm volatile (
1467 "sub %1,%2 \n"
1468 "pcmpeqb %%xmm5,%%xmm5 \n"
1469 "pxor %%xmm4,%%xmm4 \n"
1470 "1: \n"
1471 YUVTORGB
1472 "punpcklbw %%xmm1,%%xmm2 \n"
1473 "punpcklbw %%xmm5,%%xmm0 \n"
1474 "movdqa %%xmm2,%%xmm1 \n"
1475 "punpcklwd %%xmm0,%%xmm2 \n"
1476 "punpckhwd %%xmm0,%%xmm1 \n"
1477 "movdqu %%xmm2,(%3) \n"
1478 "movdqu %%xmm1,0x10(%3) \n"
1479 "lea 0x20(%3),%3 \n"
1480 "sub $0x8,%4 \n"
1481 "jg 1b \n"
1482 : "+r"(y_buf), // %0
1483 "+r"(u_buf), // %1
1484 "+r"(v_buf), // %2
1485 "+r"(rgb_buf), // %3
1486 "+rm"(width) // %4
1487 : "r"(&kYuvConstants.kUVToB) // %5
1488 : "memory", "cc"
1489#if defined(__SSE2__)
1490 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1491#endif
1492 );
1493}
1494
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001495void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1496 const uint8* u_buf,
1497 const uint8* v_buf,
1498 uint8* rgb_buf,
1499 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001500 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001501 "sub %1,%2 \n"
1502 "pcmpeqb %%xmm5,%%xmm5 \n"
1503 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001504 "1: \n"
1505 "movd (%1),%%xmm0 \n"
1506 "movd (%1,%2,1),%%xmm1 \n"
1507 "lea 0x4(%1),%1 \n"
1508 "punpcklbw %%xmm1,%%xmm0 \n"
1509 "movdqa %%xmm0,%%xmm1 \n"
1510 "movdqa %%xmm0,%%xmm2 \n"
1511 "pmaddubsw (%5),%%xmm0 \n"
1512 "pmaddubsw 16(%5),%%xmm1 \n"
1513 "pmaddubsw 32(%5),%%xmm2 \n"
1514 "psubw 48(%5),%%xmm0 \n"
1515 "psubw 64(%5),%%xmm1 \n"
1516 "psubw 80(%5),%%xmm2 \n"
1517 "movd (%0),%%xmm3 \n"
1518 "lea 0x4(%0),%0 \n"
1519 "punpcklbw %%xmm4,%%xmm3 \n"
1520 "psubsw 96(%5),%%xmm3 \n"
1521 "pmullw 112(%5),%%xmm3 \n"
1522 "paddsw %%xmm3,%%xmm0 \n"
1523 "paddsw %%xmm3,%%xmm1 \n"
1524 "paddsw %%xmm3,%%xmm2 \n"
1525 "psraw $0x6,%%xmm0 \n"
1526 "psraw $0x6,%%xmm1 \n"
1527 "psraw $0x6,%%xmm2 \n"
1528 "packuswb %%xmm0,%%xmm0 \n"
1529 "packuswb %%xmm1,%%xmm1 \n"
1530 "packuswb %%xmm2,%%xmm2 \n"
1531 "punpcklbw %%xmm1,%%xmm0 \n"
1532 "punpcklbw %%xmm5,%%xmm2 \n"
1533 "punpcklwd %%xmm2,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001534 "sub $0x4,%4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001535 "movdqa %%xmm0,(%3) \n"
1536 "lea 0x10(%3),%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001537 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001538 : "+r"(y_buf), // %0
1539 "+r"(u_buf), // %1
1540 "+r"(v_buf), // %2
1541 "+r"(rgb_buf), // %3
1542 "+rm"(width) // %4
1543 : "r"(&kYuvConstants.kUVToB) // %5
1544 : "memory", "cc"
1545#if defined(__SSE2__)
1546 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1547#endif
1548 );
1549}
1550#endif
1551
1552#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001553void YToARGBRow_SSE2(const uint8* y_buf,
1554 uint8* rgb_buf,
1555 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001556 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001557 "pcmpeqb %%xmm4,%%xmm4 \n"
1558 "pslld $0x18,%%xmm4 \n"
1559 "mov $0x10001000,%%eax \n"
1560 "movd %%eax,%%xmm3 \n"
1561 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1562 "mov $0x012a012a,%%eax \n"
1563 "movd %%eax,%%xmm2 \n"
1564 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001565 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001566 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001567 "movq (%0),%%xmm0 \n"
1568 "lea 0x8(%0),%0 \n"
1569 "punpcklbw %%xmm0,%%xmm0 \n"
1570 "psubusw %%xmm3,%%xmm0 \n"
1571 "pmulhuw %%xmm2,%%xmm0 \n"
1572 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001573
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001574 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001575 "punpcklbw %%xmm0,%%xmm0 \n"
1576 "movdqa %%xmm0,%%xmm1 \n"
1577 "punpcklwd %%xmm0,%%xmm0 \n"
1578 "punpckhwd %%xmm1,%%xmm1 \n"
1579 "por %%xmm4,%%xmm0 \n"
1580 "por %%xmm4,%%xmm1 \n"
1581 "movdqa %%xmm0,(%1) \n"
1582 "movdqa %%xmm1,16(%1) \n"
1583 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001584
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001585 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001586 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00001587 : "+r"(y_buf), // %0
1588 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00001589 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001590 :
1591 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001592#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001593 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001594#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001595 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001596}
fbarchard@google.comb6149762011-11-07 21:58:52 +00001597#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001598
fbarchard@google.com42831e02012-01-21 02:54:17 +00001599#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001600// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001601CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001602 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1603};
1604
fbarchard@google.com42831e02012-01-21 02:54:17 +00001605void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001606 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001607 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001608 "movdqa %3,%%xmm5 \n"
1609 "lea -0x10(%0),%0 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001610 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001611 "movdqa (%0,%2),%%xmm0 \n"
1612 "pshufb %%xmm5,%%xmm0 \n"
1613 "sub $0x10,%2 \n"
1614 "movdqa %%xmm0,(%1) \n"
1615 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001616 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00001617 : "+r"(src), // %0
1618 "+r"(dst), // %1
1619 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001620 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001621 : "memory", "cc"
1622#if defined(__SSE2__)
1623 , "xmm0", "xmm5"
1624#endif
1625 );
1626}
1627#endif
1628
fbarchard@google.com42831e02012-01-21 02:54:17 +00001629#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001630void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001631 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001632 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001633 "lea -0x10(%0),%0 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001634 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001635 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001636 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001637 "psllw $0x8,%%xmm0 \n"
1638 "psrlw $0x8,%%xmm1 \n"
1639 "por %%xmm1,%%xmm0 \n"
1640 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
1641 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
1642 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
1643 "sub $0x10,%2 \n"
1644 "movdqu %%xmm0,(%1) \n"
1645 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001646 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001647 : "+r"(src), // %0
1648 "+r"(dst), // %1
1649 "+r"(temp_width) // %2
1650 :
1651 : "memory", "cc"
1652#if defined(__SSE2__)
1653 , "xmm0", "xmm1"
1654#endif
1655 );
1656}
1657#endif
1658
fbarchard@google.com16a96642012-03-02 22:38:09 +00001659#ifdef HAS_MIRRORROW_UV_SSSE3
1660// Shuffle table for reversing the bytes of UV channels.
1661CONST uvec8 kShuffleMirrorUV = {
1662 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
1663};
1664void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
1665 int width) {
1666 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001667 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00001668 "movdqa %4,%%xmm1 \n"
1669 "lea -16(%0,%3,2),%0 \n"
1670 "sub %1,%2 \n"
1671 "1: \n"
1672 "movdqa (%0),%%xmm0 \n"
1673 "lea -16(%0),%0 \n"
1674 "pshufb %%xmm1,%%xmm0 \n"
1675 "sub $8,%3 \n"
1676 "movlpd %%xmm0,(%1) \n"
1677 "movhpd %%xmm0,(%1,%2) \n"
1678 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001679 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001680 : "+r"(src), // %0
1681 "+r"(dst_u), // %1
1682 "+r"(dst_v), // %2
1683 "+r"(temp_width) // %3
1684 : "m"(kShuffleMirrorUV) // %4
1685 : "memory", "cc"
1686#if defined(__SSE2__)
1687 , "xmm0", "xmm1"
1688#endif
1689 );
1690}
1691#endif
1692
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001693#ifdef HAS_SPLITUV_SSE2
1694void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001695 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001696 "pcmpeqb %%xmm5,%%xmm5 \n"
1697 "psrlw $0x8,%%xmm5 \n"
1698 "sub %1,%2 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001699 "1: \n"
1700 "movdqa (%0),%%xmm0 \n"
1701 "movdqa 0x10(%0),%%xmm1 \n"
1702 "lea 0x20(%0),%0 \n"
1703 "movdqa %%xmm0,%%xmm2 \n"
1704 "movdqa %%xmm1,%%xmm3 \n"
1705 "pand %%xmm5,%%xmm0 \n"
1706 "pand %%xmm5,%%xmm1 \n"
1707 "packuswb %%xmm1,%%xmm0 \n"
1708 "psrlw $0x8,%%xmm2 \n"
1709 "psrlw $0x8,%%xmm3 \n"
1710 "packuswb %%xmm3,%%xmm2 \n"
1711 "movdqa %%xmm0,(%1) \n"
1712 "movdqa %%xmm2,(%1,%2) \n"
1713 "lea 0x10(%1),%1 \n"
1714 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001715 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001716 : "+r"(src_uv), // %0
1717 "+r"(dst_u), // %1
1718 "+r"(dst_v), // %2
1719 "+r"(pix) // %3
1720 :
1721 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001722#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001723 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001724#endif
1725 );
1726}
1727#endif
1728
fbarchard@google.com19932f82012-02-16 22:19:14 +00001729#ifdef HAS_COPYROW_SSE2
1730void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001731 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001732 "sub %0,%1 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001733 "1: \n"
1734 "movdqa (%0),%%xmm0 \n"
1735 "movdqa 0x10(%0),%%xmm1 \n"
1736 "movdqa %%xmm0,(%0,%1) \n"
1737 "movdqa %%xmm1,0x10(%0,%1) \n"
1738 "lea 0x20(%0),%0 \n"
1739 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001740 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001741 : "+r"(src), // %0
1742 "+r"(dst), // %1
1743 "+r"(count) // %2
1744 :
1745 : "memory", "cc"
1746#if defined(__SSE2__)
1747 , "xmm0", "xmm1"
1748#endif
1749 );
1750}
1751#endif // HAS_COPYROW_SSE2
1752
1753#ifdef HAS_COPYROW_X86
1754void CopyRow_X86(const uint8* src, uint8* dst, int width) {
1755 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001756 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00001757 "shr $0x2,%2 \n"
1758 "rep movsl \n"
1759 : "+S"(src), // %0
1760 "+D"(dst), // %1
1761 "+c"(width_tmp) // %2
1762 :
1763 : "memory", "cc"
1764 );
1765}
1766#endif
1767
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001768#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001769void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001770 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001771 "pcmpeqb %%xmm5,%%xmm5 \n"
1772 "psrlw $0x8,%%xmm5 \n"
1773 "1: \n"
1774 "movdqa (%0),%%xmm0 \n"
1775 "movdqa 0x10(%0),%%xmm1 \n"
1776 "lea 0x20(%0),%0 \n"
1777 "pand %%xmm5,%%xmm0 \n"
1778 "pand %%xmm5,%%xmm1 \n"
1779 "packuswb %%xmm1,%%xmm0 \n"
1780 "movdqa %%xmm0,(%1) \n"
1781 "lea 0x10(%1),%1 \n"
1782 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001783 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001784 : "+r"(src_yuy2), // %0
1785 "+r"(dst_y), // %1
1786 "+r"(pix) // %2
1787 :
1788 : "memory", "cc"
1789#if defined(__SSE2__)
1790 , "xmm0", "xmm1", "xmm5"
1791#endif
1792 );
1793}
1794
1795void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
1796 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001797 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001798 "pcmpeqb %%xmm5,%%xmm5 \n"
1799 "psrlw $0x8,%%xmm5 \n"
1800 "sub %1,%2 \n"
1801 "1: \n"
1802 "movdqa (%0),%%xmm0 \n"
1803 "movdqa 0x10(%0),%%xmm1 \n"
1804 "movdqa (%0,%4,1),%%xmm2 \n"
1805 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1806 "lea 0x20(%0),%0 \n"
1807 "pavgb %%xmm2,%%xmm0 \n"
1808 "pavgb %%xmm3,%%xmm1 \n"
1809 "psrlw $0x8,%%xmm0 \n"
1810 "psrlw $0x8,%%xmm1 \n"
1811 "packuswb %%xmm1,%%xmm0 \n"
1812 "movdqa %%xmm0,%%xmm1 \n"
1813 "pand %%xmm5,%%xmm0 \n"
1814 "packuswb %%xmm0,%%xmm0 \n"
1815 "psrlw $0x8,%%xmm1 \n"
1816 "packuswb %%xmm1,%%xmm1 \n"
1817 "movq %%xmm0,(%1) \n"
1818 "movq %%xmm1,(%1,%2) \n"
1819 "lea 0x8(%1),%1 \n"
1820 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001821 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001822 : "+r"(src_yuy2), // %0
1823 "+r"(dst_u), // %1
1824 "+r"(dst_y), // %2
1825 "+r"(pix) // %3
1826 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1827 : "memory", "cc"
1828#if defined(__SSE2__)
1829 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1830#endif
1831 );
1832}
1833
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00001834
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001835void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
1836 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001837 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001838 "pcmpeqb %%xmm5,%%xmm5 \n"
1839 "psrlw $0x8,%%xmm5 \n"
1840 "1: \n"
1841 "movdqu (%0),%%xmm0 \n"
1842 "movdqu 0x10(%0),%%xmm1 \n"
1843 "lea 0x20(%0),%0 \n"
1844 "pand %%xmm5,%%xmm0 \n"
1845 "pand %%xmm5,%%xmm1 \n"
1846 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001847 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001848 "movdqu %%xmm0,(%1) \n"
1849 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001850 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001851 : "+r"(src_yuy2), // %0
1852 "+r"(dst_y), // %1
1853 "+r"(pix) // %2
1854 :
1855 : "memory", "cc"
1856#if defined(__SSE2__)
1857 , "xmm0", "xmm1", "xmm5"
1858#endif
1859 );
1860}
1861
1862void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
1863 int stride_yuy2,
1864 uint8* dst_u, uint8* dst_y,
1865 int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001866 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001867 "pcmpeqb %%xmm5,%%xmm5 \n"
1868 "psrlw $0x8,%%xmm5 \n"
1869 "sub %1,%2 \n"
1870 "1: \n"
1871 "movdqu (%0),%%xmm0 \n"
1872 "movdqu 0x10(%0),%%xmm1 \n"
1873 "movdqu (%0,%4,1),%%xmm2 \n"
1874 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1875 "lea 0x20(%0),%0 \n"
1876 "pavgb %%xmm2,%%xmm0 \n"
1877 "pavgb %%xmm3,%%xmm1 \n"
1878 "psrlw $0x8,%%xmm0 \n"
1879 "psrlw $0x8,%%xmm1 \n"
1880 "packuswb %%xmm1,%%xmm0 \n"
1881 "movdqa %%xmm0,%%xmm1 \n"
1882 "pand %%xmm5,%%xmm0 \n"
1883 "packuswb %%xmm0,%%xmm0 \n"
1884 "psrlw $0x8,%%xmm1 \n"
1885 "packuswb %%xmm1,%%xmm1 \n"
1886 "movq %%xmm0,(%1) \n"
1887 "movq %%xmm1,(%1,%2) \n"
1888 "lea 0x8(%1),%1 \n"
1889 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001890 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001891 : "+r"(src_yuy2), // %0
1892 "+r"(dst_u), // %1
1893 "+r"(dst_y), // %2
1894 "+r"(pix) // %3
1895 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1896 : "memory", "cc"
1897#if defined(__SSE2__)
1898 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1899#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001900 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001901}
1902
1903void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001904 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001905 "1: \n"
1906 "movdqa (%0),%%xmm0 \n"
1907 "movdqa 0x10(%0),%%xmm1 \n"
1908 "lea 0x20(%0),%0 \n"
1909 "psrlw $0x8,%%xmm0 \n"
1910 "psrlw $0x8,%%xmm1 \n"
1911 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001912 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001913 "movdqa %%xmm0,(%1) \n"
1914 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001915 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001916 : "+r"(src_uyvy), // %0
1917 "+r"(dst_y), // %1
1918 "+r"(pix) // %2
1919 :
1920 : "memory", "cc"
1921#if defined(__SSE2__)
1922 , "xmm0", "xmm1"
1923#endif
1924 );
1925}
1926
1927void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
1928 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001929 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001930 "pcmpeqb %%xmm5,%%xmm5 \n"
1931 "psrlw $0x8,%%xmm5 \n"
1932 "sub %1,%2 \n"
1933 "1: \n"
1934 "movdqa (%0),%%xmm0 \n"
1935 "movdqa 0x10(%0),%%xmm1 \n"
1936 "movdqa (%0,%4,1),%%xmm2 \n"
1937 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1938 "lea 0x20(%0),%0 \n"
1939 "pavgb %%xmm2,%%xmm0 \n"
1940 "pavgb %%xmm3,%%xmm1 \n"
1941 "pand %%xmm5,%%xmm0 \n"
1942 "pand %%xmm5,%%xmm1 \n"
1943 "packuswb %%xmm1,%%xmm0 \n"
1944 "movdqa %%xmm0,%%xmm1 \n"
1945 "pand %%xmm5,%%xmm0 \n"
1946 "packuswb %%xmm0,%%xmm0 \n"
1947 "psrlw $0x8,%%xmm1 \n"
1948 "packuswb %%xmm1,%%xmm1 \n"
1949 "movq %%xmm0,(%1) \n"
1950 "movq %%xmm1,(%1,%2) \n"
1951 "lea 0x8(%1),%1 \n"
1952 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001953 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001954 : "+r"(src_uyvy), // %0
1955 "+r"(dst_u), // %1
1956 "+r"(dst_y), // %2
1957 "+r"(pix) // %3
1958 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
1959 : "memory", "cc"
1960#if defined(__SSE2__)
1961 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1962#endif
1963 );
1964}
1965
1966void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
1967 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001968 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001969 "1: \n"
1970 "movdqu (%0),%%xmm0 \n"
1971 "movdqu 0x10(%0),%%xmm1 \n"
1972 "lea 0x20(%0),%0 \n"
1973 "psrlw $0x8,%%xmm0 \n"
1974 "psrlw $0x8,%%xmm1 \n"
1975 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001976 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001977 "movdqu %%xmm0,(%1) \n"
1978 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001979 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001980 : "+r"(src_uyvy), // %0
1981 "+r"(dst_y), // %1
1982 "+r"(pix) // %2
1983 :
1984 : "memory", "cc"
1985#if defined(__SSE2__)
1986 , "xmm0", "xmm1"
1987#endif
1988 );
1989}
1990
1991void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
1992 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001993 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001994 "pcmpeqb %%xmm5,%%xmm5 \n"
1995 "psrlw $0x8,%%xmm5 \n"
1996 "sub %1,%2 \n"
1997 "1: \n"
1998 "movdqu (%0),%%xmm0 \n"
1999 "movdqu 0x10(%0),%%xmm1 \n"
2000 "movdqu (%0,%4,1),%%xmm2 \n"
2001 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2002 "lea 0x20(%0),%0 \n"
2003 "pavgb %%xmm2,%%xmm0 \n"
2004 "pavgb %%xmm3,%%xmm1 \n"
2005 "pand %%xmm5,%%xmm0 \n"
2006 "pand %%xmm5,%%xmm1 \n"
2007 "packuswb %%xmm1,%%xmm0 \n"
2008 "movdqa %%xmm0,%%xmm1 \n"
2009 "pand %%xmm5,%%xmm0 \n"
2010 "packuswb %%xmm0,%%xmm0 \n"
2011 "psrlw $0x8,%%xmm1 \n"
2012 "packuswb %%xmm1,%%xmm1 \n"
2013 "movq %%xmm0,(%1) \n"
2014 "movq %%xmm1,(%1,%2) \n"
2015 "lea 0x8(%1),%1 \n"
2016 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002017 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002018 : "+r"(src_uyvy), // %0
2019 "+r"(dst_u), // %1
2020 "+r"(dst_y), // %2
2021 "+r"(pix) // %3
2022 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2023 : "memory", "cc"
2024#if defined(__SSE2__)
2025 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2026#endif
2027 );
2028}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002029#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002030
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002031#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002032// Blend 8 pixels at a time.
2033// src_argb0 unaligned.
2034// src_argb1 and dst_argb aligned to 16 bytes.
2035// width must be multiple of 4 pixels.
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002036void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.comc757f302012-04-03 00:49:16 +00002037 uint8* dst_argb, int width) {
2038 asm volatile (
2039 "pcmpeqb %%xmm7,%%xmm7 \n"
2040 "psrlw $0xf,%%xmm7 \n"
2041 "pcmpeqb %%xmm6,%%xmm6 \n"
2042 "psrlw $0x8,%%xmm6 \n"
2043 "pcmpeqb %%xmm5,%%xmm5 \n"
2044 "psllw $0x8,%%xmm5 \n"
2045 "pcmpeqb %%xmm4,%%xmm4 \n"
2046 "pslld $0x18,%%xmm4 \n"
2047
2048 // 8 pixel loop
2049 "1: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002050 "movdqu (%0),%%xmm3 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002051 "movdqa %%xmm3,%%xmm0 \n"
2052 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002053 "movdqu (%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002054 "psrlw $0x8,%%xmm3 \n"
2055 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2056 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2057 "pand %%xmm6,%%xmm2 \n"
2058 "paddw %%xmm7,%%xmm3 \n"
2059 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002060 "movdqu (%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002061 "psrlw $0x8,%%xmm1 \n"
2062 "por %%xmm4,%%xmm0 \n"
2063 "pmullw %%xmm3,%%xmm1 \n"
2064 "movdqu 0x10(%0),%%xmm3 \n"
2065 "lea 0x20(%0),%0 \n"
2066 "psrlw $0x8,%%xmm2 \n"
2067 "paddusb %%xmm2,%%xmm0 \n"
2068 "pand %%xmm5,%%xmm1 \n"
2069 "paddusb %%xmm1,%%xmm0 \n"
2070 "sub $0x4,%3 \n"
2071 "movdqa %%xmm0,(%2) \n"
2072 "jle 9f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002073 "movdqa %%xmm3,%%xmm0 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002074 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002075 "movdqu 0x10(%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002076 "psrlw $0x8,%%xmm3 \n"
2077 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2078 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2079 "pand %%xmm6,%%xmm2 \n"
2080 "paddw %%xmm7,%%xmm3 \n"
2081 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002082 "movdqu 0x10(%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002083 "lea 0x20(%1),%1 \n"
2084 "psrlw $0x8,%%xmm1 \n"
2085 "por %%xmm4,%%xmm0 \n"
2086 "pmullw %%xmm3,%%xmm1 \n"
2087 "psrlw $0x8,%%xmm2 \n"
2088 "paddusb %%xmm2,%%xmm0 \n"
2089 "pand %%xmm5,%%xmm1 \n"
2090 "paddusb %%xmm1,%%xmm0 \n"
2091 "sub $0x4,%3 \n"
2092 "movdqa %%xmm0,0x10(%2) \n"
2093 "lea 0x20(%2),%2 \n"
2094 "jg 1b \n"
2095 "9: \n"
2096 : "+r"(src_argb0), // %0
2097 "+r"(src_argb1), // %1
2098 "+r"(dst_argb), // %2
2099 "+r"(width) // %3
2100 :
2101 : "memory", "cc"
2102#if defined(__SSE2__)
2103 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2104#endif
2105 );
2106}
2107
2108// Blend 1 pixel at a time, unaligned
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002109void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com96af8702012-04-06 18:22:27 +00002110 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002111 asm volatile (
2112 "pcmpeqb %%xmm7,%%xmm7 \n"
2113 "psrlw $0xf,%%xmm7 \n"
2114 "pcmpeqb %%xmm6,%%xmm6 \n"
2115 "psrlw $0x8,%%xmm6 \n"
2116 "pcmpeqb %%xmm5,%%xmm5 \n"
2117 "psllw $0x8,%%xmm5 \n"
2118 "pcmpeqb %%xmm4,%%xmm4 \n"
2119 "pslld $0x18,%%xmm4 \n"
2120
2121 // 1 pixel loop
2122 "1: \n"
2123 "movd (%0),%%xmm3 \n"
2124 "lea 0x4(%0),%0 \n"
2125 "movdqa %%xmm3,%%xmm0 \n"
2126 "pxor %%xmm4,%%xmm3 \n"
2127 "movd (%1),%%xmm2 \n"
2128 "psrlw $0x8,%%xmm3 \n"
2129 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2130 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2131 "pand %%xmm6,%%xmm2 \n"
2132 "paddw %%xmm7,%%xmm3 \n"
2133 "pmullw %%xmm3,%%xmm2 \n"
2134 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002135 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002136 "psrlw $0x8,%%xmm1 \n"
2137 "por %%xmm4,%%xmm0 \n"
2138 "pmullw %%xmm3,%%xmm1 \n"
2139 "psrlw $0x8,%%xmm2 \n"
2140 "paddusb %%xmm2,%%xmm0 \n"
2141 "pand %%xmm5,%%xmm1 \n"
2142 "paddusb %%xmm1,%%xmm0 \n"
2143 "sub $0x1,%3 \n"
2144 "movd %%xmm0,(%2) \n"
2145 "lea 0x4(%2),%2 \n"
2146 "jg 1b \n"
2147 : "+r"(src_argb0), // %0
2148 "+r"(src_argb1), // %1
2149 "+r"(dst_argb), // %2
2150 "+r"(width) // %3
2151 :
2152 : "memory", "cc"
2153#if defined(__SSE2__)
2154 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2155#endif
2156 );
2157}
fbarchard@google.comc757f302012-04-03 00:49:16 +00002158#endif // HAS_ARGBBLENDROW_SSE2
2159
fbarchard@google.com96af8702012-04-06 18:22:27 +00002160#ifdef HAS_ARGBBLENDROW_SSSE3
2161// Shuffle table for reversing the bytes.
2162CONST uvec8 kShuffleAlpha = {
2163 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2164 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2165};
2166void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2167 uint8* dst_argb, int width) {
2168 asm volatile (
2169 "pcmpeqb %%xmm7,%%xmm7 \n"
2170 "psrlw $0xf,%%xmm7 \n"
2171 "pcmpeqb %%xmm6,%%xmm6 \n"
2172 "psrlw $0x8,%%xmm6 \n"
2173 "pcmpeqb %%xmm5,%%xmm5 \n"
2174 "psllw $0x8,%%xmm5 \n"
2175 "pcmpeqb %%xmm4,%%xmm4 \n"
2176 "pslld $0x18,%%xmm4 \n"
2177
2178 // 8 pixel loop
2179 "1: \n"
2180 "movdqu (%0),%%xmm3 \n"
2181 "movdqa %%xmm3,%%xmm0 \n"
2182 "pxor %%xmm4,%%xmm3 \n"
2183 "pshufb %4,%%xmm3 \n"
2184 "movdqu (%1),%%xmm2 \n"
2185 "pand %%xmm6,%%xmm2 \n"
2186 "paddw %%xmm7,%%xmm3 \n"
2187 "pmullw %%xmm3,%%xmm2 \n"
2188 "movdqu (%1),%%xmm1 \n"
2189 "psrlw $0x8,%%xmm1 \n"
2190 "por %%xmm4,%%xmm0 \n"
2191 "pmullw %%xmm3,%%xmm1 \n"
2192 "movdqu 0x10(%0),%%xmm3 \n"
2193 "lea 0x20(%0),%0 \n"
2194 "psrlw $0x8,%%xmm2 \n"
2195 "paddusb %%xmm2,%%xmm0 \n"
2196 "pand %%xmm5,%%xmm1 \n"
2197 "paddusb %%xmm1,%%xmm0 \n"
2198 "sub $0x4,%3 \n"
2199 "movdqa %%xmm0,(%2) \n"
2200 "jle 9f \n"
2201 "movdqa %%xmm3,%%xmm0 \n"
2202 "pxor %%xmm4,%%xmm3 \n"
2203 "movdqu 0x10(%1),%%xmm2 \n"
2204 "pshufb %4,%%xmm3 \n"
2205 "pand %%xmm6,%%xmm2 \n"
2206 "paddw %%xmm7,%%xmm3 \n"
2207 "pmullw %%xmm3,%%xmm2 \n"
2208 "movdqu 0x10(%1),%%xmm1 \n"
2209 "lea 0x20(%1),%1 \n"
2210 "psrlw $0x8,%%xmm1 \n"
2211 "por %%xmm4,%%xmm0 \n"
2212 "pmullw %%xmm3,%%xmm1 \n"
2213 "psrlw $0x8,%%xmm2 \n"
2214 "paddusb %%xmm2,%%xmm0 \n"
2215 "pand %%xmm5,%%xmm1 \n"
2216 "paddusb %%xmm1,%%xmm0 \n"
2217 "sub $0x4,%3 \n"
2218 "movdqa %%xmm0,0x10(%2) \n"
2219 "lea 0x20(%2),%2 \n"
2220 "jg 1b \n"
2221 "9: \n"
2222 : "+r"(src_argb0), // %0
2223 "+r"(src_argb1), // %1
2224 "+r"(dst_argb), // %2
2225 "+r"(width) // %3
2226 : "m"(kShuffleAlpha) // %4
2227 : "memory", "cc"
2228#if defined(__SSE2__)
2229 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2230#endif
2231 );
2232}
2233#endif // HAS_ARGBBLENDROW_SSSE3
2234
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002235#endif // defined(__x86_64__) || defined(__i386__)
2236
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002237#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002238} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002239} // namespace libyuv
2240#endif