/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

fbarchard@google.com2b9c2102012-03-22 22:36:44 +000011#include "source/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
fbarchard@google.com714050a2012-02-17 22:59:56 +000023// GCC 4.2 on OSX has link error when passing static or const to inline.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000025#ifdef __APPLE__
26#define CONST
27#else
28#define CONST static const
29#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000101// Shuffle table for converting ARGB to RGB24.
102CONST uvec8 kShuffleMaskARGBToRGB24 = {
103 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
104};
105
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000106// Shuffle table for converting ARGB to RAW.
107CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000108 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000109};
110
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000112 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000113 "pcmpeqb %%xmm5,%%xmm5 \n"
114 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000115 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000116 "1: \n"
117 "movq (%0),%%xmm0 \n"
118 "lea 0x8(%0),%0 \n"
119 "punpcklbw %%xmm0,%%xmm0 \n"
120 "movdqa %%xmm0,%%xmm1 \n"
121 "punpcklwd %%xmm0,%%xmm0 \n"
122 "punpckhwd %%xmm1,%%xmm1 \n"
123 "por %%xmm5,%%xmm0 \n"
124 "por %%xmm5,%%xmm1 \n"
125 "movdqa %%xmm0,(%1) \n"
126 "movdqa %%xmm1,0x10(%1) \n"
127 "lea 0x20(%1),%1 \n"
128 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000129 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000130 : "+r"(src_y), // %0
131 "+r"(dst_argb), // %1
132 "+r"(pix) // %2
133 :
134 : "memory", "cc"
135#if defined(__SSE2__)
136 , "xmm0", "xmm1", "xmm5"
137#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000138 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000139}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000140
141void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000142 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000143 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000144 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000145 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "1: \n"
147 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000148 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000150 "movdqa %%xmm0,(%0,%1,1) \n"
151 "lea 0x10(%0),%0 \n"
152 "jg 1b \n"
153
fbarchard@google.comb6149762011-11-07 21:58:52 +0000154 : "+r"(src_abgr), // %0
155 "+r"(dst_argb), // %1
156 "+r"(pix) // %2
157 : "m"(kShuffleMaskABGRToARGB) // %3
158 : "memory", "cc"
159#if defined(__SSE2__)
160 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000161#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000162 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163}
164
165void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000166 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000167 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000168 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000169 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000170 "1: \n"
171 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000172 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000173 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000174 "movdqa %%xmm0,(%0,%1,1) \n"
175 "lea 0x10(%0),%0 \n"
176 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000177 : "+r"(src_bgra), // %0
178 "+r"(dst_argb), // %1
179 "+r"(pix) // %2
180 : "m"(kShuffleMaskBGRAToARGB) // %3
181 : "memory", "cc"
182#if defined(__SSE2__)
183 , "xmm0", "xmm5"
184#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000185 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000186}
187
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000188void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000189 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000190 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
191 "pslld $0x18,%%xmm5 \n"
192 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000193 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000194 "1: \n"
195 "movdqu (%0),%%xmm0 \n"
196 "movdqu 0x10(%0),%%xmm1 \n"
197 "movdqu 0x20(%0),%%xmm3 \n"
198 "lea 0x30(%0),%0 \n"
199 "movdqa %%xmm3,%%xmm2 \n"
200 "palignr $0x8,%%xmm1,%%xmm2 \n"
201 "pshufb %%xmm4,%%xmm2 \n"
202 "por %%xmm5,%%xmm2 \n"
203 "palignr $0xc,%%xmm0,%%xmm1 \n"
204 "pshufb %%xmm4,%%xmm0 \n"
205 "movdqa %%xmm2,0x20(%1) \n"
206 "por %%xmm5,%%xmm0 \n"
207 "pshufb %%xmm4,%%xmm1 \n"
208 "movdqa %%xmm0,(%1) \n"
209 "por %%xmm5,%%xmm1 \n"
210 "palignr $0x4,%%xmm3,%%xmm3 \n"
211 "pshufb %%xmm4,%%xmm3 \n"
212 "movdqa %%xmm1,0x10(%1) \n"
213 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000214 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000215 "movdqa %%xmm3,0x30(%1) \n"
216 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000217 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000218 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000219 "+r"(dst_argb), // %1
220 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000221 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000222 : "memory", "cc"
223#if defined(__SSE2__)
224 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
225#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000226 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000227}
228
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000229void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000230 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000231 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
232 "pslld $0x18,%%xmm5 \n"
233 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000234 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000235 "1: \n"
236 "movdqu (%0),%%xmm0 \n"
237 "movdqu 0x10(%0),%%xmm1 \n"
238 "movdqu 0x20(%0),%%xmm3 \n"
239 "lea 0x30(%0),%0 \n"
240 "movdqa %%xmm3,%%xmm2 \n"
241 "palignr $0x8,%%xmm1,%%xmm2 \n"
242 "pshufb %%xmm4,%%xmm2 \n"
243 "por %%xmm5,%%xmm2 \n"
244 "palignr $0xc,%%xmm0,%%xmm1 \n"
245 "pshufb %%xmm4,%%xmm0 \n"
246 "movdqa %%xmm2,0x20(%1) \n"
247 "por %%xmm5,%%xmm0 \n"
248 "pshufb %%xmm4,%%xmm1 \n"
249 "movdqa %%xmm0,(%1) \n"
250 "por %%xmm5,%%xmm1 \n"
251 "palignr $0x4,%%xmm3,%%xmm3 \n"
252 "pshufb %%xmm4,%%xmm3 \n"
253 "movdqa %%xmm1,0x10(%1) \n"
254 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000255 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000256 "movdqa %%xmm3,0x30(%1) \n"
257 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000258 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000259 : "+r"(src_raw), // %0
260 "+r"(dst_argb), // %1
261 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000262 : "m"(kShuffleMaskRAWToARGB) // %3
263 : "memory", "cc"
264#if defined(__SSE2__)
265 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
266#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000267 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000268}
269
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000270void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000271 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000272 "mov $0x1080108,%%eax \n"
273 "movd %%eax,%%xmm5 \n"
274 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000275 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000276 "movd %%eax,%%xmm6 \n"
277 "pshufd $0x0,%%xmm6,%%xmm6 \n"
278 "pcmpeqb %%xmm3,%%xmm3 \n"
279 "psllw $0xb,%%xmm3 \n"
280 "pcmpeqb %%xmm4,%%xmm4 \n"
281 "psllw $0xa,%%xmm4 \n"
282 "psrlw $0x5,%%xmm4 \n"
283 "pcmpeqb %%xmm7,%%xmm7 \n"
284 "psllw $0x8,%%xmm7 \n"
285 "sub %0,%1 \n"
286 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000287 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000288 "1: \n"
289 "movdqu (%0),%%xmm0 \n"
290 "movdqa %%xmm0,%%xmm1 \n"
291 "movdqa %%xmm0,%%xmm2 \n"
292 "pand %%xmm3,%%xmm1 \n"
293 "psllw $0xb,%%xmm2 \n"
294 "pmulhuw %%xmm5,%%xmm1 \n"
295 "pmulhuw %%xmm5,%%xmm2 \n"
296 "psllw $0x8,%%xmm1 \n"
297 "por %%xmm2,%%xmm1 \n"
298 "pand %%xmm4,%%xmm0 \n"
299 "pmulhuw %%xmm6,%%xmm0 \n"
300 "por %%xmm7,%%xmm0 \n"
301 "movdqa %%xmm1,%%xmm2 \n"
302 "punpcklbw %%xmm0,%%xmm1 \n"
303 "punpckhbw %%xmm0,%%xmm2 \n"
304 "movdqa %%xmm1,(%1,%0,2) \n"
305 "movdqa %%xmm2,0x10(%1,%0,2) \n"
306 "lea 0x10(%0),%0 \n"
307 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000308 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000309 : "+r"(src), // %0
310 "+r"(dst), // %1
311 "+r"(pix) // %2
312 :
313 : "memory", "cc", "eax"
314#if defined(__SSE2__)
315 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
316#endif
317 );
318}
319
320void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000321 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000322 "mov $0x1080108,%%eax \n"
323 "movd %%eax,%%xmm5 \n"
324 "pshufd $0x0,%%xmm5,%%xmm5 \n"
325 "mov $0x42004200,%%eax \n"
326 "movd %%eax,%%xmm6 \n"
327 "pshufd $0x0,%%xmm6,%%xmm6 \n"
328 "pcmpeqb %%xmm3,%%xmm3 \n"
329 "psllw $0xb,%%xmm3 \n"
330 "movdqa %%xmm3,%%xmm4 \n"
331 "psrlw $0x6,%%xmm4 \n"
332 "pcmpeqb %%xmm7,%%xmm7 \n"
333 "psllw $0x8,%%xmm7 \n"
334 "sub %0,%1 \n"
335 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000336 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000337 "1: \n"
338 "movdqu (%0),%%xmm0 \n"
339 "movdqa %%xmm0,%%xmm1 \n"
340 "movdqa %%xmm0,%%xmm2 \n"
341 "psllw $0x1,%%xmm1 \n"
342 "psllw $0xb,%%xmm2 \n"
343 "pand %%xmm3,%%xmm1 \n"
344 "pmulhuw %%xmm5,%%xmm2 \n"
345 "pmulhuw %%xmm5,%%xmm1 \n"
346 "psllw $0x8,%%xmm1 \n"
347 "por %%xmm2,%%xmm1 \n"
348 "movdqa %%xmm0,%%xmm2 \n"
349 "pand %%xmm4,%%xmm0 \n"
350 "psraw $0x8,%%xmm2 \n"
351 "pmulhuw %%xmm6,%%xmm0 \n"
352 "pand %%xmm7,%%xmm2 \n"
353 "por %%xmm2,%%xmm0 \n"
354 "movdqa %%xmm1,%%xmm2 \n"
355 "punpcklbw %%xmm0,%%xmm1 \n"
356 "punpckhbw %%xmm0,%%xmm2 \n"
357 "movdqa %%xmm1,(%1,%0,2) \n"
358 "movdqa %%xmm2,0x10(%1,%0,2) \n"
359 "lea 0x10(%0),%0 \n"
360 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000361 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000362 : "+r"(src), // %0
363 "+r"(dst), // %1
364 "+r"(pix) // %2
365 :
366 : "memory", "cc", "eax"
367#if defined(__SSE2__)
368 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
369#endif
370 );
371}
372
373void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000374 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000375 "mov $0xf0f0f0f,%%eax \n"
376 "movd %%eax,%%xmm4 \n"
377 "pshufd $0x0,%%xmm4,%%xmm4 \n"
378 "movdqa %%xmm4,%%xmm5 \n"
379 "pslld $0x4,%%xmm5 \n"
380 "sub %0,%1 \n"
381 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000382 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000383 "1: \n"
384 "movdqu (%0),%%xmm0 \n"
385 "movdqa %%xmm0,%%xmm2 \n"
386 "pand %%xmm4,%%xmm0 \n"
387 "pand %%xmm5,%%xmm2 \n"
388 "movdqa %%xmm0,%%xmm1 \n"
389 "movdqa %%xmm2,%%xmm3 \n"
390 "psllw $0x4,%%xmm1 \n"
391 "psrlw $0x4,%%xmm3 \n"
392 "por %%xmm1,%%xmm0 \n"
393 "por %%xmm3,%%xmm2 \n"
394 "movdqa %%xmm0,%%xmm1 \n"
395 "punpcklbw %%xmm2,%%xmm0 \n"
396 "punpckhbw %%xmm2,%%xmm1 \n"
397 "movdqa %%xmm0,(%1,%0,2) \n"
398 "movdqa %%xmm1,0x10(%1,%0,2) \n"
399 "lea 0x10(%0),%0 \n"
400 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000401 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000402 : "+r"(src), // %0
403 "+r"(dst), // %1
404 "+r"(pix) // %2
405 :
406 : "memory", "cc", "eax"
407#if defined(__SSE2__)
408 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
409#endif
410 );
411}
412
413void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000414 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000415 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000416 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000417 "1: \n"
418 "movdqa (%0),%%xmm0 \n"
419 "movdqa 0x10(%0),%%xmm1 \n"
420 "movdqa 0x20(%0),%%xmm2 \n"
421 "movdqa 0x30(%0),%%xmm3 \n"
422 "lea 0x40(%0),%0 \n"
423 "pshufb %%xmm6,%%xmm0 \n"
424 "pshufb %%xmm6,%%xmm1 \n"
425 "pshufb %%xmm6,%%xmm2 \n"
426 "pshufb %%xmm6,%%xmm3 \n"
427 "movdqa %%xmm1,%%xmm4 \n"
428 "psrldq $0x4,%%xmm1 \n"
429 "pslldq $0xc,%%xmm4 \n"
430 "movdqa %%xmm2,%%xmm5 \n"
431 "por %%xmm4,%%xmm0 \n"
432 "pslldq $0x8,%%xmm5 \n"
433 "movdqa %%xmm0,(%1) \n"
434 "por %%xmm5,%%xmm1 \n"
435 "psrldq $0x8,%%xmm2 \n"
436 "pslldq $0x4,%%xmm3 \n"
437 "por %%xmm3,%%xmm2 \n"
438 "movdqa %%xmm1,0x10(%1) \n"
439 "movdqa %%xmm2,0x20(%1) \n"
440 "lea 0x30(%1),%1 \n"
441 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000442 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000443 : "+r"(src), // %0
444 "+r"(dst), // %1
445 "+r"(pix) // %2
446 : "m"(kShuffleMaskARGBToRGB24) // %3
447 : "memory", "cc"
448#if defined(__SSE2__)
449 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
450#endif
451 );
452}
453
454void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000455 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000456 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000457 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000458 "1: \n"
459 "movdqa (%0),%%xmm0 \n"
460 "movdqa 0x10(%0),%%xmm1 \n"
461 "movdqa 0x20(%0),%%xmm2 \n"
462 "movdqa 0x30(%0),%%xmm3 \n"
463 "lea 0x40(%0),%0 \n"
464 "pshufb %%xmm6,%%xmm0 \n"
465 "pshufb %%xmm6,%%xmm1 \n"
466 "pshufb %%xmm6,%%xmm2 \n"
467 "pshufb %%xmm6,%%xmm3 \n"
468 "movdqa %%xmm1,%%xmm4 \n"
469 "psrldq $0x4,%%xmm1 \n"
470 "pslldq $0xc,%%xmm4 \n"
471 "movdqa %%xmm2,%%xmm5 \n"
472 "por %%xmm4,%%xmm0 \n"
473 "pslldq $0x8,%%xmm5 \n"
474 "movdqa %%xmm0,(%1) \n"
475 "por %%xmm5,%%xmm1 \n"
476 "psrldq $0x8,%%xmm2 \n"
477 "pslldq $0x4,%%xmm3 \n"
478 "por %%xmm3,%%xmm2 \n"
479 "movdqa %%xmm1,0x10(%1) \n"
480 "movdqa %%xmm2,0x20(%1) \n"
481 "lea 0x30(%1),%1 \n"
482 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000483 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000484 : "+r"(src), // %0
485 "+r"(dst), // %1
486 "+r"(pix) // %2
487 : "m"(kShuffleMaskARGBToRAW) // %3
488 : "memory", "cc"
489#if defined(__SSE2__)
490 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
491#endif
492 );
493}
494
495void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000496 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000497 "pcmpeqb %%xmm3,%%xmm3 \n"
498 "psrld $0x1b,%%xmm3 \n"
499 "pcmpeqb %%xmm4,%%xmm4 \n"
500 "psrld $0x1a,%%xmm4 \n"
501 "pslld $0x5,%%xmm4 \n"
502 "pcmpeqb %%xmm5,%%xmm5 \n"
503 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000504 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000505 "1: \n"
506 "movdqa (%0),%%xmm0 \n"
507 "movdqa %%xmm0,%%xmm1 \n"
508 "movdqa %%xmm0,%%xmm2 \n"
509 "pslld $0x8,%%xmm0 \n"
510 "psrld $0x3,%%xmm1 \n"
511 "psrld $0x5,%%xmm2 \n"
512 "psrad $0x10,%%xmm0 \n"
513 "pand %%xmm3,%%xmm1 \n"
514 "pand %%xmm4,%%xmm2 \n"
515 "pand %%xmm5,%%xmm0 \n"
516 "por %%xmm2,%%xmm1 \n"
517 "por %%xmm1,%%xmm0 \n"
518 "packssdw %%xmm0,%%xmm0 \n"
519 "lea 0x10(%0),%0 \n"
520 "movq %%xmm0,(%1) \n"
521 "lea 0x8(%1),%1 \n"
522 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000523 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000524 : "+r"(src), // %0
525 "+r"(dst), // %1
526 "+r"(pix) // %2
527 :
528 : "memory", "cc"
529#if defined(__SSE2__)
530 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
531#endif
532 );
533}
534
535void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000536 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000537 "pcmpeqb %%xmm4,%%xmm4 \n"
538 "psrld $0x1b,%%xmm4 \n"
539 "movdqa %%xmm4,%%xmm5 \n"
540 "pslld $0x5,%%xmm5 \n"
541 "movdqa %%xmm4,%%xmm6 \n"
542 "pslld $0xa,%%xmm6 \n"
543 "pcmpeqb %%xmm7,%%xmm7 \n"
544 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000545 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000546 "1: \n"
547 "movdqa (%0),%%xmm0 \n"
548 "movdqa %%xmm0,%%xmm1 \n"
549 "movdqa %%xmm0,%%xmm2 \n"
550 "movdqa %%xmm0,%%xmm3 \n"
551 "psrad $0x10,%%xmm0 \n"
552 "psrld $0x3,%%xmm1 \n"
553 "psrld $0x6,%%xmm2 \n"
554 "psrld $0x9,%%xmm3 \n"
555 "pand %%xmm7,%%xmm0 \n"
556 "pand %%xmm4,%%xmm1 \n"
557 "pand %%xmm5,%%xmm2 \n"
558 "pand %%xmm6,%%xmm3 \n"
559 "por %%xmm1,%%xmm0 \n"
560 "por %%xmm3,%%xmm2 \n"
561 "por %%xmm2,%%xmm0 \n"
562 "packssdw %%xmm0,%%xmm0 \n"
563 "lea 0x10(%0),%0 \n"
564 "movq %%xmm0,(%1) \n"
565 "lea 0x8(%1),%1 \n"
566 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000567 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000568 : "+r"(src), // %0
569 "+r"(dst), // %1
570 "+r"(pix) // %2
571 :
572 : "memory", "cc"
573#if defined(__SSE2__)
574 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
575#endif
576 );
577}
578
579void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000580 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000581 "pcmpeqb %%xmm4,%%xmm4 \n"
582 "psllw $0xc,%%xmm4 \n"
583 "movdqa %%xmm4,%%xmm3 \n"
584 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000585 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000586 "1: \n"
587 "movdqa (%0),%%xmm0 \n"
588 "movdqa %%xmm0,%%xmm1 \n"
589 "pand %%xmm3,%%xmm0 \n"
590 "pand %%xmm4,%%xmm1 \n"
591 "psrlq $0x4,%%xmm0 \n"
592 "psrlq $0x8,%%xmm1 \n"
593 "por %%xmm1,%%xmm0 \n"
594 "packuswb %%xmm0,%%xmm0 \n"
595 "lea 0x10(%0),%0 \n"
596 "movq %%xmm0,(%1) \n"
597 "lea 0x8(%1),%1 \n"
598 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000599 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000600 : "+r"(src), // %0
601 "+r"(dst), // %1
602 "+r"(pix) // %2
603 :
604 : "memory", "cc"
605#if defined(__SSE2__)
606 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
607#endif
608 );
609}
610
fbarchard@google.comb6149762011-11-07 21:58:52 +0000611void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000612 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000613 "movdqa %4,%%xmm5 \n"
614 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000615 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000616 "1: \n"
617 "movdqa (%0),%%xmm0 \n"
618 "movdqa 0x10(%0),%%xmm1 \n"
619 "movdqa 0x20(%0),%%xmm2 \n"
620 "movdqa 0x30(%0),%%xmm3 \n"
621 "pmaddubsw %%xmm4,%%xmm0 \n"
622 "pmaddubsw %%xmm4,%%xmm1 \n"
623 "pmaddubsw %%xmm4,%%xmm2 \n"
624 "pmaddubsw %%xmm4,%%xmm3 \n"
625 "lea 0x40(%0),%0 \n"
626 "phaddw %%xmm1,%%xmm0 \n"
627 "phaddw %%xmm3,%%xmm2 \n"
628 "psrlw $0x7,%%xmm0 \n"
629 "psrlw $0x7,%%xmm2 \n"
630 "packuswb %%xmm2,%%xmm0 \n"
631 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000632 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000633 "movdqa %%xmm0,(%1) \n"
634 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000635 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000636 : "+r"(src_argb), // %0
637 "+r"(dst_y), // %1
638 "+r"(pix) // %2
639 : "m"(kARGBToY), // %3
640 "m"(kAddY16) // %4
641 : "memory", "cc"
642#if defined(__SSE2__)
643 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
644#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000645 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000646}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000647
648void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000649 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000650 "movdqa %4,%%xmm5 \n"
651 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000652 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000653 "1: \n"
654 "movdqu (%0),%%xmm0 \n"
655 "movdqu 0x10(%0),%%xmm1 \n"
656 "movdqu 0x20(%0),%%xmm2 \n"
657 "movdqu 0x30(%0),%%xmm3 \n"
658 "pmaddubsw %%xmm4,%%xmm0 \n"
659 "pmaddubsw %%xmm4,%%xmm1 \n"
660 "pmaddubsw %%xmm4,%%xmm2 \n"
661 "pmaddubsw %%xmm4,%%xmm3 \n"
662 "lea 0x40(%0),%0 \n"
663 "phaddw %%xmm1,%%xmm0 \n"
664 "phaddw %%xmm3,%%xmm2 \n"
665 "psrlw $0x7,%%xmm0 \n"
666 "psrlw $0x7,%%xmm2 \n"
667 "packuswb %%xmm2,%%xmm0 \n"
668 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000669 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000670 "movdqu %%xmm0,(%1) \n"
671 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000672 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000673 : "+r"(src_argb), // %0
674 "+r"(dst_y), // %1
675 "+r"(pix) // %2
676 : "m"(kARGBToY), // %3
677 "m"(kAddY16) // %4
678 : "memory", "cc"
679#if defined(__SSE2__)
680 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
681#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000682 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000683}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000684
fbarchard@google.com714050a2012-02-17 22:59:56 +0000685// TODO(fbarchard): pass xmm constants to single block of assembly.
686// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
687// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
688// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
689// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000690void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
691 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000692 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000693 "movdqa %0,%%xmm4 \n"
694 "movdqa %1,%%xmm3 \n"
695 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000696 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000697 : "m"(kARGBToU), // %0
698 "m"(kARGBToV), // %1
699 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000700 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000701 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000702 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000703 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000704 "1: \n"
705 "movdqa (%0),%%xmm0 \n"
706 "movdqa 0x10(%0),%%xmm1 \n"
707 "movdqa 0x20(%0),%%xmm2 \n"
708 "movdqa 0x30(%0),%%xmm6 \n"
709 "pavgb (%0,%4,1),%%xmm0 \n"
710 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
711 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
712 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
713 "lea 0x40(%0),%0 \n"
714 "movdqa %%xmm0,%%xmm7 \n"
715 "shufps $0x88,%%xmm1,%%xmm0 \n"
716 "shufps $0xdd,%%xmm1,%%xmm7 \n"
717 "pavgb %%xmm7,%%xmm0 \n"
718 "movdqa %%xmm2,%%xmm7 \n"
719 "shufps $0x88,%%xmm6,%%xmm2 \n"
720 "shufps $0xdd,%%xmm6,%%xmm7 \n"
721 "pavgb %%xmm7,%%xmm2 \n"
722 "movdqa %%xmm0,%%xmm1 \n"
723 "movdqa %%xmm2,%%xmm6 \n"
724 "pmaddubsw %%xmm4,%%xmm0 \n"
725 "pmaddubsw %%xmm4,%%xmm2 \n"
726 "pmaddubsw %%xmm3,%%xmm1 \n"
727 "pmaddubsw %%xmm3,%%xmm6 \n"
728 "phaddw %%xmm2,%%xmm0 \n"
729 "phaddw %%xmm6,%%xmm1 \n"
730 "psraw $0x8,%%xmm0 \n"
731 "psraw $0x8,%%xmm1 \n"
732 "packsswb %%xmm1,%%xmm0 \n"
733 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000734 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000735 "movlps %%xmm0,(%1) \n"
736 "movhps %%xmm0,(%1,%2,1) \n"
737 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000738 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000739 : "+r"(src_argb0), // %0
740 "+r"(dst_u), // %1
741 "+r"(dst_v), // %2
742 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000743 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000744 : "memory", "cc"
745#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000746 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000747#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000748 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000749}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000750
751void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
752 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000753 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000754 "movdqa %0,%%xmm4 \n"
755 "movdqa %1,%%xmm3 \n"
756 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000757 :
758 : "m"(kARGBToU), // %0
759 "m"(kARGBToV), // %1
760 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000761 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000762 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000763 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000764 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000765 "1: \n"
766 "movdqu (%0),%%xmm0 \n"
767 "movdqu 0x10(%0),%%xmm1 \n"
768 "movdqu 0x20(%0),%%xmm2 \n"
769 "movdqu 0x30(%0),%%xmm6 \n"
770 "movdqu (%0,%4,1),%%xmm7 \n"
771 "pavgb %%xmm7,%%xmm0 \n"
772 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
773 "pavgb %%xmm7,%%xmm1 \n"
774 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
775 "pavgb %%xmm7,%%xmm2 \n"
776 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
777 "pavgb %%xmm7,%%xmm6 \n"
778 "lea 0x40(%0),%0 \n"
779 "movdqa %%xmm0,%%xmm7 \n"
780 "shufps $0x88,%%xmm1,%%xmm0 \n"
781 "shufps $0xdd,%%xmm1,%%xmm7 \n"
782 "pavgb %%xmm7,%%xmm0 \n"
783 "movdqa %%xmm2,%%xmm7 \n"
784 "shufps $0x88,%%xmm6,%%xmm2 \n"
785 "shufps $0xdd,%%xmm6,%%xmm7 \n"
786 "pavgb %%xmm7,%%xmm2 \n"
787 "movdqa %%xmm0,%%xmm1 \n"
788 "movdqa %%xmm2,%%xmm6 \n"
789 "pmaddubsw %%xmm4,%%xmm0 \n"
790 "pmaddubsw %%xmm4,%%xmm2 \n"
791 "pmaddubsw %%xmm3,%%xmm1 \n"
792 "pmaddubsw %%xmm3,%%xmm6 \n"
793 "phaddw %%xmm2,%%xmm0 \n"
794 "phaddw %%xmm6,%%xmm1 \n"
795 "psraw $0x8,%%xmm0 \n"
796 "psraw $0x8,%%xmm1 \n"
797 "packsswb %%xmm1,%%xmm0 \n"
798 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000799 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000800 "movlps %%xmm0,(%1) \n"
801 "movhps %%xmm0,(%1,%2,1) \n"
802 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000803 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000804 : "+r"(src_argb0), // %0
805 "+r"(dst_u), // %1
806 "+r"(dst_v), // %2
807 "+rm"(width) // %3
808 : "r"(static_cast<intptr_t>(src_stride_argb))
809 : "memory", "cc"
810#if defined(__SSE2__)
811 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
812#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000813 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000814}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000815
fbarchard@google.com714050a2012-02-17 22:59:56 +0000816void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000817 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000818 "movdqa %4,%%xmm5 \n"
819 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000820 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000821 "1: \n"
822 "movdqa (%0),%%xmm0 \n"
823 "movdqa 0x10(%0),%%xmm1 \n"
824 "movdqa 0x20(%0),%%xmm2 \n"
825 "movdqa 0x30(%0),%%xmm3 \n"
826 "pmaddubsw %%xmm4,%%xmm0 \n"
827 "pmaddubsw %%xmm4,%%xmm1 \n"
828 "pmaddubsw %%xmm4,%%xmm2 \n"
829 "pmaddubsw %%xmm4,%%xmm3 \n"
830 "lea 0x40(%0),%0 \n"
831 "phaddw %%xmm1,%%xmm0 \n"
832 "phaddw %%xmm3,%%xmm2 \n"
833 "psrlw $0x7,%%xmm0 \n"
834 "psrlw $0x7,%%xmm2 \n"
835 "packuswb %%xmm2,%%xmm0 \n"
836 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000837 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000838 "movdqa %%xmm0,(%1) \n"
839 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000840 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000841 : "+r"(src_bgra), // %0
842 "+r"(dst_y), // %1
843 "+r"(pix) // %2
844 : "m"(kBGRAToY), // %3
845 "m"(kAddY16) // %4
846 : "memory", "cc"
847#if defined(__SSE2__)
848 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000849#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000850 );
851}
852
853void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000854 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000855 "movdqa %4,%%xmm5 \n"
856 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000857 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000858 "1: \n"
859 "movdqu (%0),%%xmm0 \n"
860 "movdqu 0x10(%0),%%xmm1 \n"
861 "movdqu 0x20(%0),%%xmm2 \n"
862 "movdqu 0x30(%0),%%xmm3 \n"
863 "pmaddubsw %%xmm4,%%xmm0 \n"
864 "pmaddubsw %%xmm4,%%xmm1 \n"
865 "pmaddubsw %%xmm4,%%xmm2 \n"
866 "pmaddubsw %%xmm4,%%xmm3 \n"
867 "lea 0x40(%0),%0 \n"
868 "phaddw %%xmm1,%%xmm0 \n"
869 "phaddw %%xmm3,%%xmm2 \n"
870 "psrlw $0x7,%%xmm0 \n"
871 "psrlw $0x7,%%xmm2 \n"
872 "packuswb %%xmm2,%%xmm0 \n"
873 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000874 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000875 "movdqu %%xmm0,(%1) \n"
876 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000877 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000878 : "+r"(src_bgra), // %0
879 "+r"(dst_y), // %1
880 "+r"(pix) // %2
881 : "m"(kBGRAToY), // %3
882 "m"(kAddY16) // %4
883 : "memory", "cc"
884#if defined(__SSE2__)
885 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
886#endif
887 );
888}
889
890void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
891 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000892 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000893 "movdqa %0,%%xmm4 \n"
894 "movdqa %1,%%xmm3 \n"
895 "movdqa %2,%%xmm5 \n"
896 :
897 : "m"(kBGRAToU), // %0
898 "m"(kBGRAToV), // %1
899 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000900 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000901 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000902 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000903 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000904 "1: \n"
905 "movdqa (%0),%%xmm0 \n"
906 "movdqa 0x10(%0),%%xmm1 \n"
907 "movdqa 0x20(%0),%%xmm2 \n"
908 "movdqa 0x30(%0),%%xmm6 \n"
909 "pavgb (%0,%4,1),%%xmm0 \n"
910 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
911 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
912 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
913 "lea 0x40(%0),%0 \n"
914 "movdqa %%xmm0,%%xmm7 \n"
915 "shufps $0x88,%%xmm1,%%xmm0 \n"
916 "shufps $0xdd,%%xmm1,%%xmm7 \n"
917 "pavgb %%xmm7,%%xmm0 \n"
918 "movdqa %%xmm2,%%xmm7 \n"
919 "shufps $0x88,%%xmm6,%%xmm2 \n"
920 "shufps $0xdd,%%xmm6,%%xmm7 \n"
921 "pavgb %%xmm7,%%xmm2 \n"
922 "movdqa %%xmm0,%%xmm1 \n"
923 "movdqa %%xmm2,%%xmm6 \n"
924 "pmaddubsw %%xmm4,%%xmm0 \n"
925 "pmaddubsw %%xmm4,%%xmm2 \n"
926 "pmaddubsw %%xmm3,%%xmm1 \n"
927 "pmaddubsw %%xmm3,%%xmm6 \n"
928 "phaddw %%xmm2,%%xmm0 \n"
929 "phaddw %%xmm6,%%xmm1 \n"
930 "psraw $0x8,%%xmm0 \n"
931 "psraw $0x8,%%xmm1 \n"
932 "packsswb %%xmm1,%%xmm0 \n"
933 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000934 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000935 "movlps %%xmm0,(%1) \n"
936 "movhps %%xmm0,(%1,%2,1) \n"
937 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000938 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000939 : "+r"(src_bgra0), // %0
940 "+r"(dst_u), // %1
941 "+r"(dst_v), // %2
942 "+rm"(width) // %3
943 : "r"(static_cast<intptr_t>(src_stride_bgra))
944 : "memory", "cc"
945#if defined(__SSE2__)
946 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
947#endif
948 );
949}
950
951void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
952 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000953 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000954 "movdqa %0,%%xmm4 \n"
955 "movdqa %1,%%xmm3 \n"
956 "movdqa %2,%%xmm5 \n"
957 :
958 : "m"(kBGRAToU), // %0
959 "m"(kBGRAToV), // %1
960 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000961 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000962 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000963 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000964 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000965 "1: \n"
966 "movdqu (%0),%%xmm0 \n"
967 "movdqu 0x10(%0),%%xmm1 \n"
968 "movdqu 0x20(%0),%%xmm2 \n"
969 "movdqu 0x30(%0),%%xmm6 \n"
970 "movdqu (%0,%4,1),%%xmm7 \n"
971 "pavgb %%xmm7,%%xmm0 \n"
972 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
973 "pavgb %%xmm7,%%xmm1 \n"
974 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
975 "pavgb %%xmm7,%%xmm2 \n"
976 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
977 "pavgb %%xmm7,%%xmm6 \n"
978 "lea 0x40(%0),%0 \n"
979 "movdqa %%xmm0,%%xmm7 \n"
980 "shufps $0x88,%%xmm1,%%xmm0 \n"
981 "shufps $0xdd,%%xmm1,%%xmm7 \n"
982 "pavgb %%xmm7,%%xmm0 \n"
983 "movdqa %%xmm2,%%xmm7 \n"
984 "shufps $0x88,%%xmm6,%%xmm2 \n"
985 "shufps $0xdd,%%xmm6,%%xmm7 \n"
986 "pavgb %%xmm7,%%xmm2 \n"
987 "movdqa %%xmm0,%%xmm1 \n"
988 "movdqa %%xmm2,%%xmm6 \n"
989 "pmaddubsw %%xmm4,%%xmm0 \n"
990 "pmaddubsw %%xmm4,%%xmm2 \n"
991 "pmaddubsw %%xmm3,%%xmm1 \n"
992 "pmaddubsw %%xmm3,%%xmm6 \n"
993 "phaddw %%xmm2,%%xmm0 \n"
994 "phaddw %%xmm6,%%xmm1 \n"
995 "psraw $0x8,%%xmm0 \n"
996 "psraw $0x8,%%xmm1 \n"
997 "packsswb %%xmm1,%%xmm0 \n"
998 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000999 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001000 "movlps %%xmm0,(%1) \n"
1001 "movhps %%xmm0,(%1,%2,1) \n"
1002 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001003 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001004 : "+r"(src_bgra0), // %0
1005 "+r"(dst_u), // %1
1006 "+r"(dst_v), // %2
1007 "+rm"(width) // %3
1008 : "r"(static_cast<intptr_t>(src_stride_bgra))
1009 : "memory", "cc"
1010#if defined(__SSE2__)
1011 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1012#endif
1013 );
1014}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001015
1016void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001017 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001018 "movdqa %4,%%xmm5 \n"
1019 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001020 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001021 "1: \n"
1022 "movdqa (%0),%%xmm0 \n"
1023 "movdqa 0x10(%0),%%xmm1 \n"
1024 "movdqa 0x20(%0),%%xmm2 \n"
1025 "movdqa 0x30(%0),%%xmm3 \n"
1026 "pmaddubsw %%xmm4,%%xmm0 \n"
1027 "pmaddubsw %%xmm4,%%xmm1 \n"
1028 "pmaddubsw %%xmm4,%%xmm2 \n"
1029 "pmaddubsw %%xmm4,%%xmm3 \n"
1030 "lea 0x40(%0),%0 \n"
1031 "phaddw %%xmm1,%%xmm0 \n"
1032 "phaddw %%xmm3,%%xmm2 \n"
1033 "psrlw $0x7,%%xmm0 \n"
1034 "psrlw $0x7,%%xmm2 \n"
1035 "packuswb %%xmm2,%%xmm0 \n"
1036 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001037 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001038 "movdqa %%xmm0,(%1) \n"
1039 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001040 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001041 : "+r"(src_abgr), // %0
1042 "+r"(dst_y), // %1
1043 "+r"(pix) // %2
1044 : "m"(kABGRToY), // %3
1045 "m"(kAddY16) // %4
1046 : "memory", "cc"
1047#if defined(__SSE2__)
1048 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1049#endif
1050 );
1051}
1052
1053void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001054 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001055 "movdqa %4,%%xmm5 \n"
1056 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001057 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001058 "1: \n"
1059 "movdqu (%0),%%xmm0 \n"
1060 "movdqu 0x10(%0),%%xmm1 \n"
1061 "movdqu 0x20(%0),%%xmm2 \n"
1062 "movdqu 0x30(%0),%%xmm3 \n"
1063 "pmaddubsw %%xmm4,%%xmm0 \n"
1064 "pmaddubsw %%xmm4,%%xmm1 \n"
1065 "pmaddubsw %%xmm4,%%xmm2 \n"
1066 "pmaddubsw %%xmm4,%%xmm3 \n"
1067 "lea 0x40(%0),%0 \n"
1068 "phaddw %%xmm1,%%xmm0 \n"
1069 "phaddw %%xmm3,%%xmm2 \n"
1070 "psrlw $0x7,%%xmm0 \n"
1071 "psrlw $0x7,%%xmm2 \n"
1072 "packuswb %%xmm2,%%xmm0 \n"
1073 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001074 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001075 "movdqu %%xmm0,(%1) \n"
1076 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001077 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001078 : "+r"(src_abgr), // %0
1079 "+r"(dst_y), // %1
1080 "+r"(pix) // %2
1081 : "m"(kABGRToY), // %3
1082 "m"(kAddY16) // %4
1083 : "memory", "cc"
1084#if defined(__SSE2__)
1085 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1086#endif
1087 );
1088}
1089
1090void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1091 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001092 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001093 "movdqa %0,%%xmm4 \n"
1094 "movdqa %1,%%xmm3 \n"
1095 "movdqa %2,%%xmm5 \n"
1096 :
1097 : "m"(kABGRToU), // %0
1098 "m"(kABGRToV), // %1
1099 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001100 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001101 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001102 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001103 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001104 "1: \n"
1105 "movdqa (%0),%%xmm0 \n"
1106 "movdqa 0x10(%0),%%xmm1 \n"
1107 "movdqa 0x20(%0),%%xmm2 \n"
1108 "movdqa 0x30(%0),%%xmm6 \n"
1109 "pavgb (%0,%4,1),%%xmm0 \n"
1110 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1111 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1112 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1113 "lea 0x40(%0),%0 \n"
1114 "movdqa %%xmm0,%%xmm7 \n"
1115 "shufps $0x88,%%xmm1,%%xmm0 \n"
1116 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1117 "pavgb %%xmm7,%%xmm0 \n"
1118 "movdqa %%xmm2,%%xmm7 \n"
1119 "shufps $0x88,%%xmm6,%%xmm2 \n"
1120 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1121 "pavgb %%xmm7,%%xmm2 \n"
1122 "movdqa %%xmm0,%%xmm1 \n"
1123 "movdqa %%xmm2,%%xmm6 \n"
1124 "pmaddubsw %%xmm4,%%xmm0 \n"
1125 "pmaddubsw %%xmm4,%%xmm2 \n"
1126 "pmaddubsw %%xmm3,%%xmm1 \n"
1127 "pmaddubsw %%xmm3,%%xmm6 \n"
1128 "phaddw %%xmm2,%%xmm0 \n"
1129 "phaddw %%xmm6,%%xmm1 \n"
1130 "psraw $0x8,%%xmm0 \n"
1131 "psraw $0x8,%%xmm1 \n"
1132 "packsswb %%xmm1,%%xmm0 \n"
1133 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001134 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001135 "movlps %%xmm0,(%1) \n"
1136 "movhps %%xmm0,(%1,%2,1) \n"
1137 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001138 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001139 : "+r"(src_abgr0), // %0
1140 "+r"(dst_u), // %1
1141 "+r"(dst_v), // %2
1142 "+rm"(width) // %3
1143 : "r"(static_cast<intptr_t>(src_stride_abgr))
1144 : "memory", "cc"
1145#if defined(__SSE2__)
1146 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1147#endif
1148 );
1149}
1150
1151void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1152 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001153 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001154 "movdqa %0,%%xmm4 \n"
1155 "movdqa %1,%%xmm3 \n"
1156 "movdqa %2,%%xmm5 \n"
1157 :
1158 : "m"(kABGRToU), // %0
1159 "m"(kABGRToV), // %1
1160 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001161 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001162 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001163 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001164 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001165 "1: \n"
1166 "movdqu (%0),%%xmm0 \n"
1167 "movdqu 0x10(%0),%%xmm1 \n"
1168 "movdqu 0x20(%0),%%xmm2 \n"
1169 "movdqu 0x30(%0),%%xmm6 \n"
1170 "movdqu (%0,%4,1),%%xmm7 \n"
1171 "pavgb %%xmm7,%%xmm0 \n"
1172 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1173 "pavgb %%xmm7,%%xmm1 \n"
1174 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1175 "pavgb %%xmm7,%%xmm2 \n"
1176 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1177 "pavgb %%xmm7,%%xmm6 \n"
1178 "lea 0x40(%0),%0 \n"
1179 "movdqa %%xmm0,%%xmm7 \n"
1180 "shufps $0x88,%%xmm1,%%xmm0 \n"
1181 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1182 "pavgb %%xmm7,%%xmm0 \n"
1183 "movdqa %%xmm2,%%xmm7 \n"
1184 "shufps $0x88,%%xmm6,%%xmm2 \n"
1185 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1186 "pavgb %%xmm7,%%xmm2 \n"
1187 "movdqa %%xmm0,%%xmm1 \n"
1188 "movdqa %%xmm2,%%xmm6 \n"
1189 "pmaddubsw %%xmm4,%%xmm0 \n"
1190 "pmaddubsw %%xmm4,%%xmm2 \n"
1191 "pmaddubsw %%xmm3,%%xmm1 \n"
1192 "pmaddubsw %%xmm3,%%xmm6 \n"
1193 "phaddw %%xmm2,%%xmm0 \n"
1194 "phaddw %%xmm6,%%xmm1 \n"
1195 "psraw $0x8,%%xmm0 \n"
1196 "psraw $0x8,%%xmm1 \n"
1197 "packsswb %%xmm1,%%xmm0 \n"
1198 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001199 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001200 "movlps %%xmm0,(%1) \n"
1201 "movhps %%xmm0,(%1,%2,1) \n"
1202 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001203 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001204 : "+r"(src_abgr0), // %0
1205 "+r"(dst_u), // %1
1206 "+r"(dst_v), // %2
1207 "+rm"(width) // %3
1208 : "r"(static_cast<intptr_t>(src_stride_abgr))
1209 : "memory", "cc"
1210#if defined(__SSE2__)
1211 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1212#endif
1213 );
1214}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001215#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001216
fbarchard@google.come214fe32012-06-04 23:47:11 +00001217#ifdef HAS_I422TOARGBROW_SSSE3
// Fixed-point (x64) chroma-to-RGB coefficients, stored as signed bytes.
#define UB 127 /* 2.018 * 64 = 129, clamped to the int8 maximum of 127 */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V coefficients each multiplied by 128, summed per channel.
// Parenthesized so each macro expands to a single value inside any larger
// expression.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001232
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001233struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001234 vec8 kUVToB; // 0
1235 vec8 kUVToG; // 16
1236 vec8 kUVToR; // 32
1237 vec16 kUVBiasB; // 48
1238 vec16 kUVBiasG; // 64
1239 vec16 kUVBiasR; // 80
1240 vec16 kYSub16; // 96
1241 vec16 kYToRgb; // 112
1242 vec8 kVUToB; // 128
1243 vec8 kVUToG; // 144
1244 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001245} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001246 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1247 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1248 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1249 { BB, BB, BB, BB, BB, BB, BB, BB },
1250 { BG, BG, BG, BG, BG, BG, BG, BG },
1251 { BR, BR, BR, BR, BR, BR, BR, BR },
1252 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001253 { YG, YG, YG, YG, YG, YG, YG, YG },
1254 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1255 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1256 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001257};
1258
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001259
// Read 8 UV from 444: loads 8 U bytes and 8 V bytes, interleaves them into
// 8 UV byte pairs in xmm0 and advances u_buf by 8. V is addressed as
// u_buf + v_buf, so the caller must first turn v_buf into an offset from
// u_buf. Clobbers xmm1.
#define READYUV444                                                             \
    "movq      (%[u_buf]),%%xmm0               \n"                             \
    "movq      (%[u_buf],%[v_buf],1),%%xmm1    \n"                             \
    "lea       0x8(%[u_buf]),%[u_buf]          \n"                             \
    "punpcklbw %%xmm1,%%xmm0                   \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001266
// Read 4 UV from 422, upsample to 8 UV: loads 4 U bytes and 4 V bytes,
// interleaves them into 4 UV pairs, then duplicates every pair so xmm0 holds
// 8 UV pairs. Advances u_buf by 4. V is addressed as u_buf + v_buf, so the
// caller must first turn v_buf into an offset from u_buf. Clobbers xmm1.
#define READYUV422                                                             \
    "movd      (%[u_buf]),%%xmm0               \n"                             \
    "movd      (%[u_buf],%[v_buf],1),%%xmm1    \n"                             \
    "lea       0x4(%[u_buf]),%[u_buf]          \n"                             \
    "punpcklbw %%xmm1,%%xmm0                   \n"                             \
    "punpcklwd %%xmm0,%%xmm0                   \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001274
// Read 2 UV from 411, upsample to 8 UV: interleaves U and V bytes, then
// duplicates the first two UV pairs four times each so xmm0 holds 8 UV pairs.
// Advances u_buf by 2. Note that each movd loads 4 bytes from its plane even
// though only the first 2 end up in the result. V is addressed as
// u_buf + v_buf, so the caller must first turn v_buf into an offset from
// u_buf. Clobbers xmm1.
#define READYUV411                                                             \
    "movd      (%[u_buf]),%%xmm0               \n"                             \
    "movd      (%[u_buf],%[v_buf],1),%%xmm1    \n"                             \
    "lea       0x2(%[u_buf]),%[u_buf]          \n"                             \
    "punpcklbw %%xmm1,%%xmm0                   \n"                             \
    "punpcklwd %%xmm0,%%xmm0                   \n"                             \
    "punpckldq %%xmm0,%%xmm0                   \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001283
// Read 4 UV from NV12, upsample to 8 UV: loads 8 bytes that already hold 4
// interleaved chroma pairs, advances uv_buf by 8 and duplicates every pair so
// xmm0 holds 8 pairs. Because the source is already interleaved, only the
// word-wise duplication (punpcklwd) is needed; a byte interleave with xmm1
// would mix in whatever the previous loop iteration left in that register.
#define READNV12                                                               \
    "movq      (%[uv_buf]),%%xmm0              \n"                             \
    "lea       0x8(%[uv_buf]),%[uv_buf]        \n"                             \
    "punpcklwd %%xmm0,%%xmm0                   \n"
1289
// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 interleaved U,V byte pairs; xmm4 = zero (callers clear it
//      with pxor); y_buf points at 8 Y bytes and is advanced by 8.
// Out: low 8 bytes of xmm0, xmm1 and xmm2 hold the results computed with the
//      kUVToB, kUVToG and kUVToR coefficients respectively.
// Clobbers xmm3. Displacements index the kYuvConstants table.
#define YUVTORGB                                                               \
    "movdqa    %%xmm0,%%xmm1                   \n"                             \
    "movdqa    %%xmm0,%%xmm2                   \n"                             \
    "pmaddubsw (%[kYuvConstants]),%%xmm0       \n"                             \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1     \n"                             \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2     \n"                             \
    "psubw     48(%[kYuvConstants]),%%xmm0     \n"                             \
    "psubw     64(%[kYuvConstants]),%%xmm1     \n"                             \
    "psubw     80(%[kYuvConstants]),%%xmm2     \n"                             \
    "movq      (%[y_buf]),%%xmm3               \n"                             \
    "lea       0x8(%[y_buf]),%[y_buf]          \n"                             \
    "punpcklbw %%xmm4,%%xmm3                   \n"                             \
    "psubsw    96(%[kYuvConstants]),%%xmm3     \n"                             \
    "pmullw    112(%[kYuvConstants]),%%xmm3    \n"                             \
    "paddsw    %%xmm3,%%xmm0                   \n"                             \
    "paddsw    %%xmm3,%%xmm1                   \n"                             \
    "paddsw    %%xmm3,%%xmm2                   \n"                             \
    "psraw     $0x6,%%xmm0                     \n"                             \
    "psraw     $0x6,%%xmm1                     \n"                             \
    "psraw     $0x6,%%xmm2                     \n"                             \
    "packuswb  %%xmm0,%%xmm0                   \n"                             \
    "packuswb  %%xmm1,%%xmm1                   \n"                             \
    "packuswb  %%xmm2,%%xmm2                   \n"
1314
// Convert 8 pixels: 8 VU and 8 Y.
// In:  xmm0 = 8 interleaved V,U byte pairs; xmm4 = zero (callers clear it
//      with pxor); y_buf points at 8 Y bytes and is advanced by 8.
// Out: low 8 bytes of xmm0, xmm1 and xmm2 hold the results computed with the
//      kVUToB, kVUToG and kVUToR coefficients respectively.
// Clobbers xmm3. Displacements index the kYuvConstants table.
#define YVUTORGB                                                               \
    "movdqa    %%xmm0,%%xmm1                   \n"                             \
    "movdqa    %%xmm0,%%xmm2                   \n"                             \
    "pmaddubsw 128(%[kYuvConstants]),%%xmm0    \n"                             \
    "pmaddubsw 144(%[kYuvConstants]),%%xmm1    \n"                             \
    "pmaddubsw 160(%[kYuvConstants]),%%xmm2    \n"                             \
    "psubw     48(%[kYuvConstants]),%%xmm0     \n"                             \
    "psubw     64(%[kYuvConstants]),%%xmm1     \n"                             \
    "psubw     80(%[kYuvConstants]),%%xmm2     \n"                             \
    "movq      (%[y_buf]),%%xmm3               \n"                             \
    "lea       0x8(%[y_buf]),%[y_buf]          \n"                             \
    "punpcklbw %%xmm4,%%xmm3                   \n"                             \
    "psubsw    96(%[kYuvConstants]),%%xmm3     \n"                             \
    "pmullw    112(%[kYuvConstants]),%%xmm3    \n"                             \
    "paddsw    %%xmm3,%%xmm0                   \n"                             \
    "paddsw    %%xmm3,%%xmm1                   \n"                             \
    "paddsw    %%xmm3,%%xmm2                   \n"                             \
    "psraw     $0x6,%%xmm0                     \n"                             \
    "psraw     $0x6,%%xmm1                     \n"                             \
    "psraw     $0x6,%%xmm2                     \n"                             \
    "packuswb  %%xmm0,%%xmm0                   \n"                             \
    "packuswb  %%xmm1,%%xmm1                   \n"                             \
    "packuswb  %%xmm2,%%xmm2                   \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001339
1340void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001341 const uint8* u_buf,
1342 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001343 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001344 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001345 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001346 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001347 "pcmpeqb %%xmm5,%%xmm5 \n"
1348 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001349 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001350 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001351 READYUV444
1352 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001353 "punpcklbw %%xmm1,%%xmm0 \n"
1354 "punpcklbw %%xmm5,%%xmm2 \n"
1355 "movdqa %%xmm0,%%xmm1 \n"
1356 "punpcklwd %%xmm2,%%xmm0 \n"
1357 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001358 "movdqa %%xmm0,(%[argb_buf]) \n"
1359 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1360 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1361 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001362 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001363 : [y_buf]"+r"(y_buf), // %[y_buf]
1364 [u_buf]"+r"(u_buf), // %[u_buf]
1365 [v_buf]"+r"(v_buf), // %[v_buf]
1366 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1367 [width]"+rm"(width) // %[width]
1368 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001369 : "memory", "cc"
1370#if defined(__SSE2__)
1371 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1372#endif
1373 );
1374}
1375
fbarchard@google.come214fe32012-06-04 23:47:11 +00001376void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001377 const uint8* u_buf,
1378 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001379 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001380 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001381 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001382 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001383 "pcmpeqb %%xmm5,%%xmm5 \n"
1384 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001385 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001386 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001387 READYUV422
1388 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001389 "punpcklbw %%xmm1,%%xmm0 \n"
1390 "punpcklbw %%xmm5,%%xmm2 \n"
1391 "movdqa %%xmm0,%%xmm1 \n"
1392 "punpcklwd %%xmm2,%%xmm0 \n"
1393 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001394 "movdqa %%xmm0,(%[argb_buf]) \n"
1395 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1396 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1397 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001398 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001399 : [y_buf]"+r"(y_buf), // %[y_buf]
1400 [u_buf]"+r"(u_buf), // %[u_buf]
1401 [v_buf]"+r"(v_buf), // %[v_buf]
1402 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1403 [width]"+rm"(width) // %[width]
1404 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001405 : "memory", "cc"
1406#if defined(__SSE2__)
1407 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1408#endif
1409 );
1410}
1411
1412void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1413 const uint8* u_buf,
1414 const uint8* v_buf,
1415 uint8* argb_buf,
1416 int width) {
1417 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001418 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001419 "pcmpeqb %%xmm5,%%xmm5 \n"
1420 "pxor %%xmm4,%%xmm4 \n"
1421 ".p2align 4 \n"
1422 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001423 READYUV411
1424 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001425 "punpcklbw %%xmm1,%%xmm0 \n"
1426 "punpcklbw %%xmm5,%%xmm2 \n"
1427 "movdqa %%xmm0,%%xmm1 \n"
1428 "punpcklwd %%xmm2,%%xmm0 \n"
1429 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001430 "movdqa %%xmm0,(%[argb_buf]) \n"
1431 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1432 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1433 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001434 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001435 : [y_buf]"+r"(y_buf), // %[y_buf]
1436 [u_buf]"+r"(u_buf), // %[u_buf]
1437 [v_buf]"+r"(v_buf), // %[v_buf]
1438 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1439 [width]"+rm"(width) // %[width]
1440 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1441 : "memory", "cc"
1442#if defined(__SSE2__)
1443 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1444#endif
1445 );
1446}
1447
1448void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1449 const uint8* uv_buf,
1450 uint8* argb_buf,
1451 int width) {
1452 asm volatile (
1453 "pcmpeqb %%xmm5,%%xmm5 \n"
1454 "pxor %%xmm4,%%xmm4 \n"
1455 ".p2align 4 \n"
1456 "1: \n"
1457 READNV12
1458 YUVTORGB
1459 "punpcklbw %%xmm1,%%xmm0 \n"
1460 "punpcklbw %%xmm5,%%xmm2 \n"
1461 "movdqa %%xmm0,%%xmm1 \n"
1462 "punpcklwd %%xmm2,%%xmm0 \n"
1463 "punpckhwd %%xmm2,%%xmm1 \n"
1464 "movdqa %%xmm0,(%[argb_buf]) \n"
1465 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1466 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1467 "sub $0x8,%[width] \n"
1468 "jg 1b \n"
1469 : [y_buf]"+r"(y_buf), // %[y_buf]
1470 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1471 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1472 [width]"+rm"(width) // %[width]
1473 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1474 : "memory", "cc"
1475#if defined(__SSE2__)
1476 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1477#endif
1478 );
1479}
1480
1481void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1482 const uint8* vu_buf,
1483 uint8* argb_buf,
1484 int width) {
1485 asm volatile (
1486 "pcmpeqb %%xmm5,%%xmm5 \n"
1487 "pxor %%xmm4,%%xmm4 \n"
1488 ".p2align 4 \n"
1489 "1: \n"
1490 READNV12
1491 YVUTORGB
1492 "punpcklbw %%xmm1,%%xmm0 \n"
1493 "punpcklbw %%xmm5,%%xmm2 \n"
1494 "movdqa %%xmm0,%%xmm1 \n"
1495 "punpcklwd %%xmm2,%%xmm0 \n"
1496 "punpckhwd %%xmm2,%%xmm1 \n"
1497 "movdqa %%xmm0,(%[argb_buf]) \n"
1498 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1499 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1500 "sub $0x8,%[width] \n"
1501 "jg 1b \n"
1502 : [y_buf]"+r"(y_buf), // %[y_buf]
1503 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1504 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1505 [width]"+rm"(width) // %[width]
1506 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001507 : "memory", "cc"
1508#if defined(__SSE2__)
1509 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1510#endif
1511 );
1512}
1513
1514void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1515 const uint8* u_buf,
1516 const uint8* v_buf,
1517 uint8* argb_buf,
1518 int width) {
1519 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001520 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001521 "pcmpeqb %%xmm5,%%xmm5 \n"
1522 "pxor %%xmm4,%%xmm4 \n"
1523 ".p2align 4 \n"
1524 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001525 READYUV444
1526 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001527 "punpcklbw %%xmm1,%%xmm0 \n"
1528 "punpcklbw %%xmm5,%%xmm2 \n"
1529 "movdqa %%xmm0,%%xmm1 \n"
1530 "punpcklwd %%xmm2,%%xmm0 \n"
1531 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001532 "movdqu %%xmm0,(%[argb_buf]) \n"
1533 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1534 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1535 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001536 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001537 : [y_buf]"+r"(y_buf), // %[y_buf]
1538 [u_buf]"+r"(u_buf), // %[u_buf]
1539 [v_buf]"+r"(v_buf), // %[v_buf]
1540 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1541 [width]"+rm"(width) // %[width]
1542 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001543 : "memory", "cc"
1544#if defined(__SSE2__)
1545 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1546#endif
1547 );
1548}
1549
1550void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1551 const uint8* u_buf,
1552 const uint8* v_buf,
1553 uint8* argb_buf,
1554 int width) {
1555 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001556 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001557 "pcmpeqb %%xmm5,%%xmm5 \n"
1558 "pxor %%xmm4,%%xmm4 \n"
1559 ".p2align 4 \n"
1560 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001561 READYUV422
1562 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001563 "punpcklbw %%xmm1,%%xmm0 \n"
1564 "punpcklbw %%xmm5,%%xmm2 \n"
1565 "movdqa %%xmm0,%%xmm1 \n"
1566 "punpcklwd %%xmm2,%%xmm0 \n"
1567 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001568 "movdqu %%xmm0,(%[argb_buf]) \n"
1569 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1570 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1571 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001572 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001573 : [y_buf]"+r"(y_buf), // %[y_buf]
1574 [u_buf]"+r"(u_buf), // %[u_buf]
1575 [v_buf]"+r"(v_buf), // %[v_buf]
1576 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1577 [width]"+rm"(width) // %[width]
1578 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001579 : "memory", "cc"
1580#if defined(__SSE2__)
1581 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1582#endif
1583 );
1584}
1585
1586void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1587 const uint8* u_buf,
1588 const uint8* v_buf,
1589 uint8* argb_buf,
1590 int width) {
1591 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001592 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001593 "pcmpeqb %%xmm5,%%xmm5 \n"
1594 "pxor %%xmm4,%%xmm4 \n"
1595 ".p2align 4 \n"
1596 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001597 READYUV411
1598 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001599 "punpcklbw %%xmm1,%%xmm0 \n"
1600 "punpcklbw %%xmm5,%%xmm2 \n"
1601 "movdqa %%xmm0,%%xmm1 \n"
1602 "punpcklwd %%xmm2,%%xmm0 \n"
1603 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001604 "movdqu %%xmm0,(%[argb_buf]) \n"
1605 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1606 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1607 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001608 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001609 : [y_buf]"+r"(y_buf), // %[y_buf]
1610 [u_buf]"+r"(u_buf), // %[u_buf]
1611 [v_buf]"+r"(v_buf), // %[v_buf]
1612 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1613 [width]"+rm"(width) // %[width]
1614 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1615 : "memory", "cc"
1616#if defined(__SSE2__)
1617 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1618#endif
1619 );
1620}
1621
1622void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1623 const uint8* uv_buf,
1624 uint8* argb_buf,
1625 int width) {
1626 asm volatile (
1627 "pcmpeqb %%xmm5,%%xmm5 \n"
1628 "pxor %%xmm4,%%xmm4 \n"
1629 ".p2align 4 \n"
1630 "1: \n"
1631 READNV12
1632 YUVTORGB
1633 "punpcklbw %%xmm1,%%xmm0 \n"
1634 "punpcklbw %%xmm5,%%xmm2 \n"
1635 "movdqa %%xmm0,%%xmm1 \n"
1636 "punpcklwd %%xmm2,%%xmm0 \n"
1637 "punpckhwd %%xmm2,%%xmm1 \n"
1638 "movdqu %%xmm0,(%[argb_buf]) \n"
1639 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1640 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1641 "sub $0x8,%[width] \n"
1642 "jg 1b \n"
1643 : [y_buf]"+r"(y_buf), // %[y_buf]
1644 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1645 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1646 [width]"+rm"(width) // %[width]
1647 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1648 : "memory", "cc"
1649#if defined(__SSE2__)
1650 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1651#endif
1652 );
1653}
1654
1655void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1656 const uint8* vu_buf,
1657 uint8* argb_buf,
1658 int width) {
1659 asm volatile (
1660 "pcmpeqb %%xmm5,%%xmm5 \n"
1661 "pxor %%xmm4,%%xmm4 \n"
1662 ".p2align 4 \n"
1663 "1: \n"
1664 READNV12
1665 YVUTORGB
1666 "punpcklbw %%xmm1,%%xmm0 \n"
1667 "punpcklbw %%xmm5,%%xmm2 \n"
1668 "movdqa %%xmm0,%%xmm1 \n"
1669 "punpcklwd %%xmm2,%%xmm0 \n"
1670 "punpckhwd %%xmm2,%%xmm1 \n"
1671 "movdqu %%xmm0,(%[argb_buf]) \n"
1672 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1673 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1674 "sub $0x8,%[width] \n"
1675 "jg 1b \n"
1676 : [y_buf]"+r"(y_buf), // %[y_buf]
1677 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1678 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1679 [width]"+rm"(width) // %[width]
1680 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001681 : "memory", "cc"
1682#if defined(__SSE2__)
1683 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1684#endif
1685 );
1686}
1687
1688void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1689 const uint8* u_buf,
1690 const uint8* v_buf,
1691 uint8* bgra_buf,
1692 int width) {
1693 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001694 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001695 "pcmpeqb %%xmm5,%%xmm5 \n"
1696 "pxor %%xmm4,%%xmm4 \n"
1697 ".p2align 4 \n"
1698 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001699 READYUV422
1700 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001701 "pcmpeqb %%xmm5,%%xmm5 \n"
1702 "punpcklbw %%xmm0,%%xmm1 \n"
1703 "punpcklbw %%xmm2,%%xmm5 \n"
1704 "movdqa %%xmm5,%%xmm0 \n"
1705 "punpcklwd %%xmm1,%%xmm5 \n"
1706 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001707 "movdqa %%xmm5,(%[argb_buf]) \n"
1708 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1709 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1710 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001711 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001712 : [y_buf]"+r"(y_buf), // %[y_buf]
1713 [u_buf]"+r"(u_buf), // %[u_buf]
1714 [v_buf]"+r"(v_buf), // %[v_buf]
1715 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1716 [width]"+rm"(width) // %[width]
1717 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001718 : "memory", "cc"
1719#if defined(__SSE2__)
1720 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1721#endif
1722 );
1723}
1724
fbarchard@google.come214fe32012-06-04 23:47:11 +00001725void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001726 const uint8* u_buf,
1727 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001728 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001729 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001730 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001731 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001732 "pcmpeqb %%xmm5,%%xmm5 \n"
1733 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001734 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001735 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001736 READYUV422
1737 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001738 "punpcklbw %%xmm1,%%xmm2 \n"
1739 "punpcklbw %%xmm5,%%xmm0 \n"
1740 "movdqa %%xmm2,%%xmm1 \n"
1741 "punpcklwd %%xmm0,%%xmm2 \n"
1742 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001743 "movdqa %%xmm2,(%[argb_buf]) \n"
1744 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1745 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1746 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001747 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001748 : [y_buf]"+r"(y_buf), // %[y_buf]
1749 [u_buf]"+r"(u_buf), // %[u_buf]
1750 [v_buf]"+r"(v_buf), // %[v_buf]
1751 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1752 [width]"+rm"(width) // %[width]
1753 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001754 : "memory", "cc"
1755#if defined(__SSE2__)
1756 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1757#endif
1758 );
1759}
1760
fbarchard@google.come214fe32012-06-04 23:47:11 +00001761void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001762 const uint8* u_buf,
1763 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001764 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001765 int width) {
1766 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001767 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001768 "pcmpeqb %%xmm5,%%xmm5 \n"
1769 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001770 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001771 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001772 READYUV422
1773 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001774 "pcmpeqb %%xmm5,%%xmm5 \n"
1775 "punpcklbw %%xmm0,%%xmm1 \n"
1776 "punpcklbw %%xmm2,%%xmm5 \n"
1777 "movdqa %%xmm5,%%xmm0 \n"
1778 "punpcklwd %%xmm1,%%xmm5 \n"
1779 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001780 "movdqu %%xmm5,(%[argb_buf]) \n"
1781 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
1782 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1783 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001784 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001785 : [y_buf]"+r"(y_buf), // %[y_buf]
1786 [u_buf]"+r"(u_buf), // %[u_buf]
1787 [v_buf]"+r"(v_buf), // %[v_buf]
1788 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1789 [width]"+rm"(width) // %[width]
1790 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001791 : "memory", "cc"
1792#if defined(__SSE2__)
1793 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1794#endif
1795 );
1796}
1797
fbarchard@google.come214fe32012-06-04 23:47:11 +00001798void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001799 const uint8* u_buf,
1800 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001801 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001802 int width) {
1803 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001804 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001805 "pcmpeqb %%xmm5,%%xmm5 \n"
1806 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001807 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001808 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001809 READYUV422
1810 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001811 "punpcklbw %%xmm1,%%xmm2 \n"
1812 "punpcklbw %%xmm5,%%xmm0 \n"
1813 "movdqa %%xmm2,%%xmm1 \n"
1814 "punpcklwd %%xmm0,%%xmm2 \n"
1815 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001816 "movdqu %%xmm2,(%[argb_buf]) \n"
1817 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1818 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1819 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001820 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001821 : [y_buf]"+r"(y_buf), // %[y_buf]
1822 [u_buf]"+r"(u_buf), // %[u_buf]
1823 [v_buf]"+r"(v_buf), // %[v_buf]
1824 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1825 [width]"+rm"(width) // %[width]
1826 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001827 : "memory", "cc"
1828#if defined(__SSE2__)
1829 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1830#endif
1831 );
1832}
fbarchard@google.come214fe32012-06-04 23:47:11 +00001833#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001834
#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of 8-bit luma into grey 32-bit pixels, 8 pixels per pass.
// Each output pixel is the scaled luma replicated into the three low bytes
// with the top byte forced to 0xff.
//   y_buf: source luma; 8 bytes are read per pass.
//   rgb_buf: destination; written with movdqa, so it must be 16-byte aligned.
//            32 bytes are written per pass.
//   width: pixel count. 8 is subtracted per pass and the loop repeats while
//          the result is > 0, so the body always runs at least once.
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  asm volatile (
    // xmm4 = 0xff000000 in every dword: the alpha mask.
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    // xmm3 = 0x1000 in every word: the luma offset 16, scaled by 256.
    "mov $0x10001000,%%eax \n"
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    // xmm2 = 0x012a (298) in every word: the gain 1.164 scaled by 256.
    "mov $0x012a012a,%%eax \n"
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    ".p2align 4 \n"
  "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    // Duplicating each byte into a word gives y * 257; the unsigned
    // saturating subtract clamps values below the offset to zero.
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "psubusw %%xmm3,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    // Replicate each grey byte four times, then OR in the alpha mask.
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "por %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,16(%1) \n"
    "lea 32(%1),%1 \n"

    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001882
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Writes the bytes of src to dst in reverse order, 16 bytes per pass.
// Reading starts at the last 16 bytes of src and walks backwards while dst
// advances forwards.
//   src: source row; loaded with movdqa, so src + width - 16 must be 16-byte
//        aligned.
//   dst: destination row; stored with movdqa, so it must be 16-byte aligned.
//   width: byte count. 16 is subtracted per pass and the loop repeats while
//          the result is > 0, so the body always runs at least once.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size because it is used as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    // Bias src by -16 so (src, temp_width) addresses the last unread chunk.
    "lea -0x10(%0),%0 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0,%2),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    // The flags set here are still intact at jg: movdqa and lea do not
    // modify them.
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif  // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001913
#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of src to dst in reverse order, 16 bytes per pass, using
// only SSE2 instructions and unaligned (movdqu) loads and stores.
// The 16-byte reversal is built in three steps: swap the two bytes inside
// each word, reverse the four words inside each 64-bit half, then swap the
// two halves.
//   src: source row; read backwards from its last 16 bytes.
//   dst: destination row; written forwards.
//   width: byte count. 16 is subtracted per pass and the loop repeats while
//          the result is > 0, so the body always runs at least once.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size because it is used as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    // Bias src by -16 so (src, temp_width) addresses the last unread chunk.
    "lea -0x10(%0),%0 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0,%2),%%xmm0 \n"
    // Swap bytes within each 16-bit word.
    "movdqa %%xmm0,%%xmm1 \n"
    "psllw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    // Reverse word order in the low and high halves, then swap the halves.
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufd $0x4e,%%xmm0,%%xmm0 \n"
    // The flags set here are still intact at jg: movdqu and lea do not
    // modify them.
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001944
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// The low 8 lanes pick the even source bytes in reverse order and the high 8
// lanes pick the odd source bytes in reverse order.
CONST uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
// De-interleaves and mirrors a row of byte pairs: the even-indexed source
// bytes go, reversed, to dst_u and the odd-indexed ones go, reversed, to
// dst_v. 8 pairs (16 source bytes) are handled per pass.
//   src: interleaved source row of 2 * width bytes; loaded with movdqa from
//        its end backwards, so src + 2 * width - 16 must be 16-byte aligned.
//   dst_u, dst_v: destinations; 8 bytes are written to each per pass.
//   width: number of byte pairs. 8 is subtracted per pass and the loop
//          repeats while the result is > 0, so the body always runs at least
//          once.
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  // Widened to pointer size because it is used as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %4,%%xmm1 \n"
    // Point src at the last 16 bytes of the row.
    "lea -16(%0,%3,2),%0 \n"
    // dst_v becomes a byte offset relative to dst_u.
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "lea -16(%0),%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    // The flags set here are still intact at jg: the stores and lea do not
    // modify them.
    "sub $8,%3 \n"
    "movlpd %%xmm0,(%1) \n"
    "movhpd %%xmm0,(%1,%2) \n"
    "lea 8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00001979
#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of byte pairs: even-indexed source bytes are written
// to dst_u and odd-indexed source bytes to dst_v. 16 pairs (32 source bytes)
// are handled per pass.
//   src_uv: interleaved source; loaded with movdqa, so it must be 16-byte
//           aligned.
//   dst_u, dst_v: destinations; stored with movdqa, so both must be 16-byte
//                 aligned. 16 bytes are written to each per pass.
//   pix: number of byte pairs. 16 is subtracted per pass and the loop repeats
//        while the result is > 0, so the body always runs at least once.
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff in every word: keeps the even-indexed byte of each pair.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v becomes a byte offset relative to dst_u.
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    // Even bytes: mask and pack.
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // Odd bytes: shift down and pack.
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm2,(%1,%2) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002016
#ifdef HAS_COPYROW_SSE2
// Copies a row from src to dst, 32 bytes per pass, with aligned SSE2 moves.
//   src: source; loaded with movdqa, so it must be 16-byte aligned.
//   dst: destination; stored with movdqa, so it must be 16-byte aligned.
//   count: byte count. 32 is subtracted per pass and the loop repeats while
//          the result is > 0, so the body always runs at least once and a
//          count that is not a multiple of 32 is effectively rounded up.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    // dst becomes a byte offset relative to src, so only src is advanced.
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa %%xmm0,(%0,%1) \n"
    "movdqa %%xmm1,0x10(%0,%1) \n"
    "lea 0x20(%0),%0 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_COPYROW_SSE2
2041
#ifdef HAS_COPYROW_X86
// Copies a row from src to dst with `rep movsl`, four bytes at a time.
//   src, dst: source and destination; the "S" and "D" constraints pin them
//             to the SI and DI registers that movs uses implicitly.
//   width: byte count. It is divided by four (shift right by 2) to form the
//          dword count in CX, so any remainder of width % 4 bytes is not
//          copied.
// NOTE(review): relies on the direction flag being clear on entry, as the
// calling convention is expected to guarantee - confirm for this target.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  // Widened to register size because rep uses the full count register.
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    "shr $0x2,%2 \n"
    "rep movsl \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
#endif  // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002056
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002057#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002058void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002059 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002060 "pcmpeqb %%xmm5,%%xmm5 \n"
2061 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002062 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002063 "1: \n"
2064 "movdqa (%0),%%xmm0 \n"
2065 "movdqa 0x10(%0),%%xmm1 \n"
2066 "lea 0x20(%0),%0 \n"
2067 "pand %%xmm5,%%xmm0 \n"
2068 "pand %%xmm5,%%xmm1 \n"
2069 "packuswb %%xmm1,%%xmm0 \n"
2070 "movdqa %%xmm0,(%1) \n"
2071 "lea 0x10(%1),%1 \n"
2072 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002073 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002074 : "+r"(src_yuy2), // %0
2075 "+r"(dst_y), // %1
2076 "+r"(pix) // %2
2077 :
2078 : "memory", "cc"
2079#if defined(__SSE2__)
2080 , "xmm0", "xmm1", "xmm5"
2081#endif
2082 );
2083}
2084
2085void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2086 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002087 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002088 "pcmpeqb %%xmm5,%%xmm5 \n"
2089 "psrlw $0x8,%%xmm5 \n"
2090 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002091 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002092 "1: \n"
2093 "movdqa (%0),%%xmm0 \n"
2094 "movdqa 0x10(%0),%%xmm1 \n"
2095 "movdqa (%0,%4,1),%%xmm2 \n"
2096 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2097 "lea 0x20(%0),%0 \n"
2098 "pavgb %%xmm2,%%xmm0 \n"
2099 "pavgb %%xmm3,%%xmm1 \n"
2100 "psrlw $0x8,%%xmm0 \n"
2101 "psrlw $0x8,%%xmm1 \n"
2102 "packuswb %%xmm1,%%xmm0 \n"
2103 "movdqa %%xmm0,%%xmm1 \n"
2104 "pand %%xmm5,%%xmm0 \n"
2105 "packuswb %%xmm0,%%xmm0 \n"
2106 "psrlw $0x8,%%xmm1 \n"
2107 "packuswb %%xmm1,%%xmm1 \n"
2108 "movq %%xmm0,(%1) \n"
2109 "movq %%xmm1,(%1,%2) \n"
2110 "lea 0x8(%1),%1 \n"
2111 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002112 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002113 : "+r"(src_yuy2), // %0
2114 "+r"(dst_u), // %1
2115 "+r"(dst_y), // %2
2116 "+r"(pix) // %3
2117 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2118 : "memory", "cc"
2119#if defined(__SSE2__)
2120 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2121#endif
2122 );
2123}
2124
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002125
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002126void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2127 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002128 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002129 "pcmpeqb %%xmm5,%%xmm5 \n"
2130 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002131 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002132 "1: \n"
2133 "movdqu (%0),%%xmm0 \n"
2134 "movdqu 0x10(%0),%%xmm1 \n"
2135 "lea 0x20(%0),%0 \n"
2136 "pand %%xmm5,%%xmm0 \n"
2137 "pand %%xmm5,%%xmm1 \n"
2138 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002139 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002140 "movdqu %%xmm0,(%1) \n"
2141 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002142 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002143 : "+r"(src_yuy2), // %0
2144 "+r"(dst_y), // %1
2145 "+r"(pix) // %2
2146 :
2147 : "memory", "cc"
2148#if defined(__SSE2__)
2149 , "xmm0", "xmm1", "xmm5"
2150#endif
2151 );
2152}
2153
2154void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2155 int stride_yuy2,
2156 uint8* dst_u, uint8* dst_y,
2157 int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002158 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002159 "pcmpeqb %%xmm5,%%xmm5 \n"
2160 "psrlw $0x8,%%xmm5 \n"
2161 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002162 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002163 "1: \n"
2164 "movdqu (%0),%%xmm0 \n"
2165 "movdqu 0x10(%0),%%xmm1 \n"
2166 "movdqu (%0,%4,1),%%xmm2 \n"
2167 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2168 "lea 0x20(%0),%0 \n"
2169 "pavgb %%xmm2,%%xmm0 \n"
2170 "pavgb %%xmm3,%%xmm1 \n"
2171 "psrlw $0x8,%%xmm0 \n"
2172 "psrlw $0x8,%%xmm1 \n"
2173 "packuswb %%xmm1,%%xmm0 \n"
2174 "movdqa %%xmm0,%%xmm1 \n"
2175 "pand %%xmm5,%%xmm0 \n"
2176 "packuswb %%xmm0,%%xmm0 \n"
2177 "psrlw $0x8,%%xmm1 \n"
2178 "packuswb %%xmm1,%%xmm1 \n"
2179 "movq %%xmm0,(%1) \n"
2180 "movq %%xmm1,(%1,%2) \n"
2181 "lea 0x8(%1),%1 \n"
2182 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002183 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002184 : "+r"(src_yuy2), // %0
2185 "+r"(dst_u), // %1
2186 "+r"(dst_y), // %2
2187 "+r"(pix) // %3
2188 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2189 : "memory", "cc"
2190#if defined(__SSE2__)
2191 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2192#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002193 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002194}
2195
2196void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002197 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002198 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002199 "1: \n"
2200 "movdqa (%0),%%xmm0 \n"
2201 "movdqa 0x10(%0),%%xmm1 \n"
2202 "lea 0x20(%0),%0 \n"
2203 "psrlw $0x8,%%xmm0 \n"
2204 "psrlw $0x8,%%xmm1 \n"
2205 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002206 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002207 "movdqa %%xmm0,(%1) \n"
2208 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002209 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002210 : "+r"(src_uyvy), // %0
2211 "+r"(dst_y), // %1
2212 "+r"(pix) // %2
2213 :
2214 : "memory", "cc"
2215#if defined(__SSE2__)
2216 , "xmm0", "xmm1"
2217#endif
2218 );
2219}
2220
2221void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2222 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002223 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002224 "pcmpeqb %%xmm5,%%xmm5 \n"
2225 "psrlw $0x8,%%xmm5 \n"
2226 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002227 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002228 "1: \n"
2229 "movdqa (%0),%%xmm0 \n"
2230 "movdqa 0x10(%0),%%xmm1 \n"
2231 "movdqa (%0,%4,1),%%xmm2 \n"
2232 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2233 "lea 0x20(%0),%0 \n"
2234 "pavgb %%xmm2,%%xmm0 \n"
2235 "pavgb %%xmm3,%%xmm1 \n"
2236 "pand %%xmm5,%%xmm0 \n"
2237 "pand %%xmm5,%%xmm1 \n"
2238 "packuswb %%xmm1,%%xmm0 \n"
2239 "movdqa %%xmm0,%%xmm1 \n"
2240 "pand %%xmm5,%%xmm0 \n"
2241 "packuswb %%xmm0,%%xmm0 \n"
2242 "psrlw $0x8,%%xmm1 \n"
2243 "packuswb %%xmm1,%%xmm1 \n"
2244 "movq %%xmm0,(%1) \n"
2245 "movq %%xmm1,(%1,%2) \n"
2246 "lea 0x8(%1),%1 \n"
2247 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002248 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002249 : "+r"(src_uyvy), // %0
2250 "+r"(dst_u), // %1
2251 "+r"(dst_y), // %2
2252 "+r"(pix) // %3
2253 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2254 : "memory", "cc"
2255#if defined(__SSE2__)
2256 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2257#endif
2258 );
2259}
2260
2261void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2262 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002263 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002264 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002265 "1: \n"
2266 "movdqu (%0),%%xmm0 \n"
2267 "movdqu 0x10(%0),%%xmm1 \n"
2268 "lea 0x20(%0),%0 \n"
2269 "psrlw $0x8,%%xmm0 \n"
2270 "psrlw $0x8,%%xmm1 \n"
2271 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002272 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002273 "movdqu %%xmm0,(%1) \n"
2274 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002275 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002276 : "+r"(src_uyvy), // %0
2277 "+r"(dst_y), // %1
2278 "+r"(pix) // %2
2279 :
2280 : "memory", "cc"
2281#if defined(__SSE2__)
2282 , "xmm0", "xmm1"
2283#endif
2284 );
2285}
2286
2287void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2288 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002289 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002290 "pcmpeqb %%xmm5,%%xmm5 \n"
2291 "psrlw $0x8,%%xmm5 \n"
2292 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002293 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002294 "1: \n"
2295 "movdqu (%0),%%xmm0 \n"
2296 "movdqu 0x10(%0),%%xmm1 \n"
2297 "movdqu (%0,%4,1),%%xmm2 \n"
2298 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2299 "lea 0x20(%0),%0 \n"
2300 "pavgb %%xmm2,%%xmm0 \n"
2301 "pavgb %%xmm3,%%xmm1 \n"
2302 "pand %%xmm5,%%xmm0 \n"
2303 "pand %%xmm5,%%xmm1 \n"
2304 "packuswb %%xmm1,%%xmm0 \n"
2305 "movdqa %%xmm0,%%xmm1 \n"
2306 "pand %%xmm5,%%xmm0 \n"
2307 "packuswb %%xmm0,%%xmm0 \n"
2308 "psrlw $0x8,%%xmm1 \n"
2309 "packuswb %%xmm1,%%xmm1 \n"
2310 "movq %%xmm0,(%1) \n"
2311 "movq %%xmm1,(%1,%2) \n"
2312 "lea 0x8(%1),%1 \n"
2313 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002314 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002315 : "+r"(src_uyvy), // %0
2316 "+r"(dst_u), // %1
2317 "+r"(dst_y), // %2
2318 "+r"(pix) // %3
2319 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2320 : "memory", "cc"
2321#if defined(__SSE2__)
2322 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2323#endif
2324 );
2325}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002326#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002327
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002328#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002329// Blend 4 pixels at a time, with 1 pixel loops for the unaligned head and tail.
// Blends src_argb0 over src_argb1 and stores the result in dst_argb.
// Per color channel the code computes, with saturating byte adds:
//   dst = src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8)
// where alpha0 is the alpha byte of the src_argb0 pixel.  The color channels
// of src_argb0 are added unscaled.
// NOTE(review): that implies src_argb0 is expected to be attenuated
// (premultiplied) by its alpha already - confirm against callers.
// The alpha byte written to dst_argb is always 0xff.
// width is the pixel count; width <= 0 writes nothing.
// Sources are loaded with movd/movdqu, so they need no alignment.  dst_argb is
// advanced one pixel at a time until it is 16 byte aligned (or pixels run
// out) so that the 4 pixel loop can store with movdqa.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002330void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
2331 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002332 asm volatile (
    // xmm7 = 0x0001 in every word (turns 255 - alpha into 256 - alpha).
2333 "pcmpeqb %%xmm7,%%xmm7 \n"
2334 "psrlw $0xf,%%xmm7 \n"
    // xmm6 = 0x00ff in every word (keeps the low byte of each word: B and R).
2335 "pcmpeqb %%xmm6,%%xmm6 \n"
2336 "psrlw $0x8,%%xmm6 \n"
    // xmm5 = 0xff00 in every word (keeps the high byte of each word).
2337 "pcmpeqb %%xmm5,%%xmm5 \n"
2338 "psllw $0x8,%%xmm5 \n"
    // xmm4 = 0xff000000 in every dword (alpha mask).
2339 "pcmpeqb %%xmm4,%%xmm4 \n"
2340 "pslld $0x18,%%xmm4 \n"
    // From here %3 holds (pixels remaining - 1).  Exactly one pixel goes
    // straight to the final 1 pixel loop; zero or fewer pixels exits.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002341 "sub $0x1,%3 \n"
2342 "je 91f \n"
2343 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002344
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002345 // 1 pixel loop until destination pointer is aligned.
2346 "10: \n"
2347 "test $0xf,%2 \n"
2348 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002349 "movd (%0),%%xmm3 \n"
2350 "lea 0x4(%0),%0 \n"
2351 "movdqa %%xmm3,%%xmm0 \n"
    // Invert the alpha byte (255 - alpha0), then move it into the low byte of
    // the word and replicate it into both words of the pixel.
2352 "pxor %%xmm4,%%xmm3 \n"
2353 "movd (%1),%%xmm2 \n"
2354 "psrlw $0x8,%%xmm3 \n"
2355 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2356 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    // xmm2 = B and R of src_argb1, scaled by (256 - alpha0).
2357 "pand %%xmm6,%%xmm2 \n"
2358 "paddw %%xmm7,%%xmm3 \n"
2359 "pmullw %%xmm3,%%xmm2 \n"
    // xmm1 = G and A of src_argb1, scaled by (256 - alpha0).
2360 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002361 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002362 "psrlw $0x8,%%xmm1 \n"
    // Force the result alpha to 0xff before the saturating adds.
2363 "por %%xmm4,%%xmm0 \n"
2364 "pmullw %%xmm3,%%xmm1 \n"
2365 "psrlw $0x8,%%xmm2 \n"
2366 "paddusb %%xmm2,%%xmm0 \n"
2367 "pand %%xmm5,%%xmm1 \n"
2368 "paddusb %%xmm1,%%xmm0 \n"
2369 "sub $0x1,%3 \n"
2370 "movd %%xmm0,(%2) \n"
2371 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002372 "jge 10b \n"
2373
2374 "19: \n"
    // Rebias the counter to (pixels remaining - 4); skip the 4 pixel loop when
    // fewer than 4 pixels are left.
fbarchard@google.comee220882012-06-14 00:07:56 +00002375 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002376 "jl 49f \n"
2377
fbarchard@google.com794fe122012-06-15 01:05:01 +00002378 // 4 pixel loop.
    // Same arithmetic as the 1 pixel loop, on a full 16 byte vector.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002379 ".p2align 2 \n"
2380 "41: \n"
2381 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002382 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002383 "movdqa %%xmm3,%%xmm0 \n"
2384 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002385 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002386 "psrlw $0x8,%%xmm3 \n"
2387 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2388 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002389 "pand %%xmm6,%%xmm2 \n"
2390 "paddw %%xmm7,%%xmm3 \n"
2391 "pmullw %%xmm3,%%xmm2 \n"
2392 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002393 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002394 "psrlw $0x8,%%xmm1 \n"
2395 "por %%xmm4,%%xmm0 \n"
2396 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002397 "psrlw $0x8,%%xmm2 \n"
2398 "paddusb %%xmm2,%%xmm0 \n"
2399 "pand %%xmm5,%%xmm1 \n"
2400 "paddusb %%xmm1,%%xmm0 \n"
2401 "sub $0x4,%3 \n"
2402 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002403 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002404 "jge 41b \n"
2405
2406 "49: \n"
    // Rebias the counter back to (pixels remaining - 1); exit if none remain.
2407 "add $0x3,%3 \n"
2408 "jl 99f \n"
2409
fbarchard@google.com794fe122012-06-15 01:05:01 +00002410 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002411 "91: \n"
2412 "movd (%0),%%xmm3 \n"
2413 "lea 0x4(%0),%0 \n"
2414 "movdqa %%xmm3,%%xmm0 \n"
2415 "pxor %%xmm4,%%xmm3 \n"
2416 "movd (%1),%%xmm2 \n"
2417 "psrlw $0x8,%%xmm3 \n"
2418 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2419 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2420 "pand %%xmm6,%%xmm2 \n"
2421 "paddw %%xmm7,%%xmm3 \n"
2422 "pmullw %%xmm3,%%xmm2 \n"
2423 "movd (%1),%%xmm1 \n"
2424 "lea 0x4(%1),%1 \n"
2425 "psrlw $0x8,%%xmm1 \n"
2426 "por %%xmm4,%%xmm0 \n"
2427 "pmullw %%xmm3,%%xmm1 \n"
2428 "psrlw $0x8,%%xmm2 \n"
2429 "paddusb %%xmm2,%%xmm0 \n"
2430 "pand %%xmm5,%%xmm1 \n"
2431 "paddusb %%xmm1,%%xmm0 \n"
2432 "sub $0x1,%3 \n"
2433 "movd %%xmm0,(%2) \n"
2434 "lea 0x4(%2),%2 \n"
2435 "jge 91b \n"
2436 "99: \n"
2437 : "+r"(src_argb0), // %0
2438 "+r"(src_argb1), // %1
2439 "+r"(dst_argb), // %2
2440 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00002441 :
2442 : "memory", "cc"
2443#if defined(__SSE2__)
2444 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2445#endif
2446 );
2447}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002448#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002449
fbarchard@google.com96af8702012-04-06 18:22:27 +00002450#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002451// Shuffle table for isolating alpha.
// Used as a pshufb control: indices 3, 7, 11 and 15 select the alpha byte of
// each of the 4 pixels, and 0x80 entries produce a zero byte.  The result
// holds each pixel's alpha as a 16 bit word, twice per pixel.
fbarchard@google.com96af8702012-04-06 18:22:27 +00002452CONST uvec8 kShuffleAlpha = {
2453 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2454 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2455};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002456
2457// Blend 4 pixels at a time, with 1 pixel loops for the unaligned head and tail.
2458// Uses the kShuffleAlpha table to replicate each pixel's alpha into words.
2459
2460// Same as SSE2, but replaces
2461// psrlw xmm3, 8 // alpha
2462// pshufhw xmm3, xmm3,0F5h // 8 alpha words
2463// pshuflw xmm3, xmm3,0F5h
2464// with..
2465// pshufb xmm3, kShuffleAlpha // alpha
2466
// Per color channel the code computes, with saturating byte adds:
//   dst = src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8)
// where alpha0 is the alpha byte of the src_argb0 pixel; the alpha byte
// written to dst_argb is always 0xff.
// width is the pixel count; width <= 0 writes nothing.
// Sources are loaded with movd/movdqu; dst_argb is advanced one pixel at a
// time until it is 16 byte aligned so the 4 pixel loop can store with movdqa.
// kShuffleAlpha is used directly as a pshufb memory operand (%4).
2467void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2468 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002469 asm volatile (
    // xmm7 = 0x0001 per word, xmm6 = 0x00ff per word, xmm5 = 0xff00 per word,
    // xmm4 = 0xff000000 per dword (alpha mask).
2470 "pcmpeqb %%xmm7,%%xmm7 \n"
2471 "psrlw $0xf,%%xmm7 \n"
2472 "pcmpeqb %%xmm6,%%xmm6 \n"
2473 "psrlw $0x8,%%xmm6 \n"
2474 "pcmpeqb %%xmm5,%%xmm5 \n"
2475 "psllw $0x8,%%xmm5 \n"
2476 "pcmpeqb %%xmm4,%%xmm4 \n"
2477 "pslld $0x18,%%xmm4 \n"
    // From here %3 holds (pixels remaining - 1).  Exactly one pixel goes
    // straight to the final 1 pixel loop; zero or fewer pixels exits.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002478 "sub $0x1,%3 \n"
2479 "je 91f \n"
2480 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002481
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002482 // 1 pixel loop until destination pointer is aligned.
2483 "10: \n"
2484 "test $0xf,%2 \n"
2485 "je 19f \n"
2486 "movd (%0),%%xmm3 \n"
2487 "lea 0x4(%0),%0 \n"
2488 "movdqa %%xmm3,%%xmm0 \n"
    // Invert the alpha byte (255 - alpha0) and spread it into words.
2489 "pxor %%xmm4,%%xmm3 \n"
2490 "movd (%1),%%xmm2 \n"
2491 "pshufb %4,%%xmm3 \n"
    // xmm2 = B and R of src_argb1, scaled by (256 - alpha0).
2492 "pand %%xmm6,%%xmm2 \n"
2493 "paddw %%xmm7,%%xmm3 \n"
2494 "pmullw %%xmm3,%%xmm2 \n"
    // xmm1 = G and A of src_argb1, scaled by (256 - alpha0).
2495 "movd (%1),%%xmm1 \n"
2496 "lea 0x4(%1),%1 \n"
2497 "psrlw $0x8,%%xmm1 \n"
    // Force the result alpha to 0xff before the saturating adds.
2498 "por %%xmm4,%%xmm0 \n"
2499 "pmullw %%xmm3,%%xmm1 \n"
2500 "psrlw $0x8,%%xmm2 \n"
2501 "paddusb %%xmm2,%%xmm0 \n"
2502 "pand %%xmm5,%%xmm1 \n"
2503 "paddusb %%xmm1,%%xmm0 \n"
2504 "sub $0x1,%3 \n"
2505 "movd %%xmm0,(%2) \n"
2506 "lea 0x4(%2),%2 \n"
2507 "jge 10b \n"
2508
2509 "19: \n"
    // Rebias the counter to (pixels remaining - 4); skip the 4 pixel loop when
    // fewer than 4 pixels are left.
fbarchard@google.comee220882012-06-14 00:07:56 +00002510 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002511 "jl 49f \n"
2512
fbarchard@google.com794fe122012-06-15 01:05:01 +00002513 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002514 ".p2align 2 \n"
2515 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002516 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002517 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002518 "movdqa %%xmm3,%%xmm0 \n"
2519 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002520 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002521 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002522 "pand %%xmm6,%%xmm2 \n"
2523 "paddw %%xmm7,%%xmm3 \n"
2524 "pmullw %%xmm3,%%xmm2 \n"
2525 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002526 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002527 "psrlw $0x8,%%xmm1 \n"
2528 "por %%xmm4,%%xmm0 \n"
2529 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002530 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002531 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002532 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002533 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002534 "sub $0x4,%3 \n"
2535 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002536 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002537 "jge 41b \n"
2538
2539 "49: \n"
    // Rebias the counter back to (pixels remaining - 1); exit if none remain.
2540 "add $0x3,%3 \n"
2541 "jl 99f \n"
2542
fbarchard@google.com794fe122012-06-15 01:05:01 +00002543 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002544 "91: \n"
2545 "movd (%0),%%xmm3 \n"
2546 "lea 0x4(%0),%0 \n"
2547 "movdqa %%xmm3,%%xmm0 \n"
2548 "pxor %%xmm4,%%xmm3 \n"
2549 "movd (%1),%%xmm2 \n"
2550 "pshufb %4,%%xmm3 \n"
2551 "pand %%xmm6,%%xmm2 \n"
2552 "paddw %%xmm7,%%xmm3 \n"
2553 "pmullw %%xmm3,%%xmm2 \n"
2554 "movd (%1),%%xmm1 \n"
2555 "lea 0x4(%1),%1 \n"
2556 "psrlw $0x8,%%xmm1 \n"
2557 "por %%xmm4,%%xmm0 \n"
2558 "pmullw %%xmm3,%%xmm1 \n"
2559 "psrlw $0x8,%%xmm2 \n"
2560 "paddusb %%xmm2,%%xmm0 \n"
2561 "pand %%xmm5,%%xmm1 \n"
2562 "paddusb %%xmm1,%%xmm0 \n"
2563 "sub $0x1,%3 \n"
2564 "movd %%xmm0,(%2) \n"
2565 "lea 0x4(%2),%2 \n"
2566 "jge 91b \n"
2567 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002568 : "+r"(src_argb0), // %0
2569 "+r"(src_argb1), // %1
2570 "+r"(dst_argb), // %2
2571 "+r"(width) // %3
2572 : "m"(kShuffleAlpha) // %4
2573 : "memory", "cc"
2574#if defined(__SSE2__)
2575 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2576#endif
2577 );
2578}
2579#endif // HAS_ARGBBLENDROW_SSSE3
2580
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002581#ifdef HAS_ARGBATTENUATE_SSE2
2582// Attenuate 4 pixels at a time.
2583// aligned to 16 bytes
// Multiplies the B, G and R bytes of every pixel by that pixel's alpha and
// copies the alpha byte through unchanged.  Each color byte c and the alpha a
// are widened to c * 0x101 and a * 0x101, multiplied keeping the high 16 bits
// (pmulhuw), then shifted right by 8.
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte
// aligned.  The count is tested after the loop body, so at least 4 pixels are
// always processed and width is effectively rounded up to a multiple of 4.
2584void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2585 asm volatile (
    // Turn dst into an offset from src so only %0 needs to be advanced.
2586 "sub %0,%1 \n"
    // xmm4 = 0xff000000 per dword (alpha mask).
2587 "pcmpeqb %%xmm4,%%xmm4 \n"
2588 "pslld $0x18,%%xmm4 \n"
    // xmm5 = 0x00ffffff per dword (color mask).
2589 "pcmpeqb %%xmm5,%%xmm5 \n"
2590 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002591
fbarchard@google.com794fe122012-06-15 01:05:01 +00002592 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002593 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002594 "1: \n"
    // Low 2 pixels: duplicate each byte into a word, broadcast the alpha word
    // of each pixel, and multiply.
2595 "movdqa (%0),%%xmm0 \n"
2596 "punpcklbw %%xmm0,%%xmm0 \n"
2597 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2598 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2599 "pmulhuw %%xmm2,%%xmm0 \n"
    // High 2 pixels, same steps.
2600 "movdqa (%0),%%xmm1 \n"
2601 "punpckhbw %%xmm1,%%xmm1 \n"
2602 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2603 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2604 "pmulhuw %%xmm2,%%xmm1 \n"
    // xmm2 = original alpha bytes; merged back after the colors are packed.
fbarchard@google.com810cd912012-04-20 20:15:27 +00002605 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002606 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002607 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002608 "psrlw $0x8,%%xmm1 \n"
2609 "packuswb %%xmm1,%%xmm0 \n"
2610 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002611 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002612 "sub $0x4,%2 \n"
2613 "movdqa %%xmm0,(%0,%1,1) \n"
2614 "lea 0x10(%0),%0 \n"
2615 "jg 1b \n"
2616 : "+r"(src_argb), // %0
2617 "+r"(dst_argb), // %1
2618 "+r"(width) // %2
2619 :
2620 : "memory", "cc"
2621#if defined(__SSE2__)
2622 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2623#endif
2624 );
2625}
2626#endif // HAS_ARGBATTENUATE_SSE2
2627
fbarchard@google.com810cd912012-04-20 20:15:27 +00002628#ifdef HAS_ARGBATTENUATE_SSSE3
2629// Shuffle table duplicating alpha
// pshufb controls: kShuffleAlpha0 covers the first 2 pixels (alpha bytes 3
// and 7), kShuffleAlpha1 the last 2 (alpha bytes 11 and 15).  For each pixel
// the alpha byte fills both bytes of the B, G and R words; the 128 entries
// zero the word in the alpha position.
2630CONST uvec8 kShuffleAlpha0 = {
2631 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2632};
2633CONST uvec8 kShuffleAlpha1 = {
2634 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2635 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2636};
2637// Attenuate 4 pixels at a time.
2638// aligned to 16 bytes
// Multiplies the B, G and R bytes of every pixel by that pixel's alpha and
// copies the alpha byte through unchanged.  The alpha words come from pshufb
// with kShuffleAlpha0/kShuffleAlpha1; the color words from punpck[lh]bw.
// Their product keeps the high 16 bits (pmulhuw) and is shifted right by 8.
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte
// aligned.  The count is tested after the loop body, so at least 4 pixels are
// always processed and width is effectively rounded up to a multiple of 4.
2639void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2640 asm volatile (
    // Turn dst into an offset from src so only %0 needs to be advanced.
2641 "sub %0,%1 \n"
    // xmm3 = 0xff000000 per dword (alpha mask).
2642 "pcmpeqb %%xmm3,%%xmm3 \n"
2643 "pslld $0x18,%%xmm3 \n"
    // xmm4/xmm5 = shuffle controls for the low/high pixel pairs.
2644 "movdqa %3,%%xmm4 \n"
2645 "movdqa %4,%%xmm5 \n"
2646
2647 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002648 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002649 "1: \n"
    // Low 2 pixels: alpha words (xmm0) times duplicated color bytes (xmm1).
2650 "movdqa (%0),%%xmm0 \n"
2651 "pshufb %%xmm4,%%xmm0 \n"
2652 "movdqa (%0),%%xmm1 \n"
2653 "punpcklbw %%xmm1,%%xmm1 \n"
2654 "pmulhuw %%xmm1,%%xmm0 \n"
    // High 2 pixels: alpha words (xmm1) times duplicated color bytes (xmm2).
2655 "movdqa (%0),%%xmm1 \n"
2656 "pshufb %%xmm5,%%xmm1 \n"
2657 "movdqa (%0),%%xmm2 \n"
2658 "punpckhbw %%xmm2,%%xmm2 \n"
2659 "pmulhuw %%xmm2,%%xmm1 \n"
    // xmm2 = original alpha bytes.  The alpha position of the products is 0
    // (zeroed by the shuffle tables), so a plain OR restores alpha.
2660 "movdqa (%0),%%xmm2 \n"
2661 "pand %%xmm3,%%xmm2 \n"
2662 "psrlw $0x8,%%xmm0 \n"
2663 "psrlw $0x8,%%xmm1 \n"
2664 "packuswb %%xmm1,%%xmm0 \n"
2665 "por %%xmm2,%%xmm0 \n"
2666 "sub $0x4,%2 \n"
2667 "movdqa %%xmm0,(%0,%1,1) \n"
2668 "lea 0x10(%0),%0 \n"
2669 "jg 1b \n"
2670 : "+r"(src_argb), // %0
2671 "+r"(dst_argb), // %1
2672 "+r"(width) // %2
2673 : "m"(kShuffleAlpha0), // %3
2674 "m"(kShuffleAlpha1) // %4
2675 : "memory", "cc"
2676#if defined(__SSE2__)
2677 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2678#endif
2679 );
2680}
2681#endif // HAS_ARGBATTENUATE_SSSE3
2682
2683#ifdef HAS_ARGBUNATTENUATE_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00002684// Unattenuate 4 pixels at a time.
2685// aligned to 16 bytes
// For each pixel, the alpha byte indexes fixed_invtbl8 (4 byte entries); the
// low 16 bits of the entry are used as the multiplier for B, G and R.  Each
// color byte c is widened to c * 0x101 and multiplied keeping the high
// 16 bits (pmulhuw); packuswb then saturates the result to 0..255.  The
// original alpha byte is copied through unchanged.
// NOTE(review): fixed_invtbl8 is defined outside this block; presumably it
// holds fixed point reciprocals of alpha - confirm against its definition.
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte
// aligned.  The count is tested after the loop body, so at least 4 pixels are
// always processed and width is effectively rounded up to a multiple of 4.
2686void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2687 int width) {
    // Scratch register that receives each pixel's alpha byte (table index).
2688 uintptr_t alpha = 0;
2689 asm volatile (
    // Turn dst into an offset from src so only %0 needs to be advanced.
2690 "sub %0,%1 \n"
    // xmm4 = 0xff000000 per dword (alpha mask).
2691 "pcmpeqb %%xmm4,%%xmm4 \n"
2692 "pslld $0x18,%%xmm4 \n"
2693
2694 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002695 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002696 "1: \n"
    // Low 2 pixels: look up a table entry for the alpha at bytes 3 and 7.
2697 "movdqa (%0),%%xmm0 \n"
2698 "movzb 0x3(%0),%3 \n"
2699 "punpcklbw %%xmm0,%%xmm0 \n"
2700 "movd 0x0(%4,%3,4),%%xmm2 \n"
2701 "movzb 0x7(%0),%3 \n"
2702 "movd 0x0(%4,%3,4),%%xmm3 \n"
    // Replicate the entry's low word into the B, G and R word positions; the
    // alpha position receives word 3, which movd left as zero.
2703 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2704 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2705 "movlhps %%xmm3,%%xmm2 \n"
2706 "pmulhuw %%xmm2,%%xmm0 \n"
    // High 2 pixels: same steps for the alpha at bytes 11 and 15.
2707 "movdqa (%0),%%xmm1 \n"
2708 "movzb 0xb(%0),%3 \n"
2709 "punpckhbw %%xmm1,%%xmm1 \n"
2710 "movd 0x0(%4,%3,4),%%xmm2 \n"
2711 "movzb 0xf(%0),%3 \n"
2712 "movd 0x0(%4,%3,4),%%xmm3 \n"
2713 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2714 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2715 "movlhps %%xmm3,%%xmm2 \n"
2716 "pmulhuw %%xmm2,%%xmm1 \n"
    // Merge the original alpha bytes back into the packed colors.
2717 "movdqa (%0),%%xmm2 \n"
2718 "pand %%xmm4,%%xmm2 \n"
2719 "packuswb %%xmm1,%%xmm0 \n"
2720 "por %%xmm2,%%xmm0 \n"
2721 "sub $0x4,%2 \n"
2722 "movdqa %%xmm0,(%0,%1,1) \n"
2723 "lea 0x10(%0),%0 \n"
2724 "jg 1b \n"
2725 : "+r"(src_argb), // %0
2726 "+r"(dst_argb), // %1
2727 "+r"(width), // %2
2728 "+r"(alpha) // %3
2729 : "r"(fixed_invtbl8) // %4
2730 : "memory", "cc"
2731#if defined(__SSE2__)
2732 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2733#endif
2734 );
2735}
2736#endif // HAS_ARGBUNATTENUATE_SSE2
2737
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002738#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00002739// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// Weights are in B, G, R, A byte order with 7 fractional bits (14 + 76 + 38 =
// 128); the alpha weight is 0.
2740CONST vec8 kARGBToGray = {
2741 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
2742};
2743
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002744// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Works in place on dst_argb: each pixel's B, G and R are replaced with
// (14 * B + 76 * G + 38 * R) >> 7 and the alpha byte is kept.
// dst_argb is accessed with movdqa, so it must be 16 byte aligned.  The count
// is tested after the loop body, so at least 8 pixels are always processed
// and width is effectively rounded up to a multiple of 8.
2745void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) {
2746 asm volatile (
    // xmm4 = gray weights.
2747 "movdqa %2,%%xmm4 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002748 // 8 pixel loop \n"
2749 ".p2align 4 \n"
2750 "1: \n"
    // Weighted sum per pixel: pmaddubsw adds byte pairs, phaddw adds the two
    // partial sums of each pixel; after >> 7 and packing, xmm0 holds 8 gray
    // bytes.
2751 "movdqa (%0),%%xmm0 \n"
2752 "movdqa 0x10(%0),%%xmm1 \n"
2753 "pmaddubsw %%xmm4,%%xmm0 \n"
2754 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002755 "phaddw %%xmm1,%%xmm0 \n"
2756 "psrlw $0x7,%%xmm0 \n"
2757 "packuswb %%xmm0,%%xmm0 \n"
    // xmm2 = the 8 original alpha bytes.
fbarchard@google.com221e6022012-05-21 22:24:41 +00002758 "movdqa (%0),%%xmm2 \n"
2759 "movdqa 0x10(%0),%%xmm3 \n"
2760 "psrld $0x18,%%xmm2 \n"
2761 "psrld $0x18,%%xmm3 \n"
2762 "packuswb %%xmm3,%%xmm2 \n"
2763 "packuswb %%xmm2,%%xmm2 \n"
    // Interleave into (gray, gray) and (gray, alpha) byte pairs, then into
    // whole pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
2764 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002765 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002766 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002767 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002768 "punpcklwd %%xmm3,%%xmm0 \n"
2769 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002770 "sub $0x8,%1 \n"
2771 "movdqa %%xmm0,(%0) \n"
2772 "movdqa %%xmm1,0x10(%0) \n"
2773 "lea 0x20(%0),%0 \n"
2774 "jg 1b \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002775 : "+r"(dst_argb), // %0
2776 "+r"(width) // %1
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002777 : "m"(kARGBToGray) // %2
2778 : "memory", "cc"
2779#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00002780 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002781#endif
2782 );
2783}
2784#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00002785
2786#ifdef HAS_ARGBSEPIAROW_SSSE3
2787// b = (r * 35 + g * 68 + b * 17) >> 7
2788// g = (r * 45 + g * 88 + b * 22) >> 7
2789// r = (r * 50 + g * 98 + b * 24) >> 7
2790// Constant for ARGB color to sepia tone
// Each table lists its weights in B, G, R, A byte order (so the formulas
// above appear reversed) with the alpha weight 0, repeated for 4 pixels.
2791CONST vec8 kARGBToSepiaB = {
2792 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
2793};
2794
2795CONST vec8 kARGBToSepiaG = {
2796 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
2797};
2798
2799CONST vec8 kARGBToSepiaR = {
2800 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
2801};
2802
fbarchard@google.come442dc42012-06-18 17:37:09 +00002803// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place on dst_argb.  New B, G and R are each a weighted sum of the
// old B, G and R (tables kARGBToSepiaB/G/R), shifted right by 7 and saturated
// to 255 by packuswb; the alpha byte is kept.
// dst_argb is accessed with movdqa, so it must be 16 byte aligned.  The count
// is tested after the loop body, so at least 8 pixels are always processed
// and width is effectively rounded up to a multiple of 8.
fbarchard@google.com221e6022012-05-21 22:24:41 +00002804void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
2805 asm volatile (
    // xmm2/xmm3/xmm4 = weights producing the new B/G/R.
2806 "movdqa %2,%%xmm2 \n"
2807 "movdqa %3,%%xmm3 \n"
2808 "movdqa %4,%%xmm4 \n"
2809 // 8 pixel loop \n"
2810 ".p2align 4 \n"
2811 "1: \n"
    // xmm0 = 8 new B bytes.
2812 "movdqa (%0),%%xmm0 \n"
2813 "movdqa 0x10(%0),%%xmm6 \n"
2814 "pmaddubsw %%xmm2,%%xmm0 \n"
2815 "pmaddubsw %%xmm2,%%xmm6 \n"
2816 "phaddw %%xmm6,%%xmm0 \n"
2817 "psrlw $0x7,%%xmm0 \n"
2818 "packuswb %%xmm0,%%xmm0 \n"
    // xmm5 = 8 new G bytes; interleave to (B, G) pairs in xmm0.
2819 "movdqa (%0),%%xmm5 \n"
2820 "movdqa 0x10(%0),%%xmm1 \n"
2821 "pmaddubsw %%xmm3,%%xmm5 \n"
2822 "pmaddubsw %%xmm3,%%xmm1 \n"
2823 "phaddw %%xmm1,%%xmm5 \n"
2824 "psrlw $0x7,%%xmm5 \n"
2825 "packuswb %%xmm5,%%xmm5 \n"
2826 "punpcklbw %%xmm5,%%xmm0 \n"
    // xmm5 = 8 new R bytes.
2827 "movdqa (%0),%%xmm5 \n"
2828 "movdqa 0x10(%0),%%xmm1 \n"
2829 "pmaddubsw %%xmm4,%%xmm5 \n"
2830 "pmaddubsw %%xmm4,%%xmm1 \n"
2831 "phaddw %%xmm1,%%xmm5 \n"
2832 "psrlw $0x7,%%xmm5 \n"
2833 "packuswb %%xmm5,%%xmm5 \n"
    // xmm6 = 8 original alpha bytes; interleave to (R, A) pairs in xmm5.
2834 "movdqa (%0),%%xmm6 \n"
2835 "movdqa 0x10(%0),%%xmm1 \n"
2836 "psrld $0x18,%%xmm6 \n"
2837 "psrld $0x18,%%xmm1 \n"
2838 "packuswb %%xmm1,%%xmm6 \n"
2839 "packuswb %%xmm6,%%xmm6 \n"
2840 "punpcklbw %%xmm6,%%xmm5 \n"
    // Combine (B, G) and (R, A) pairs: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
2841 "movdqa %%xmm0,%%xmm1 \n"
2842 "punpcklwd %%xmm5,%%xmm0 \n"
2843 "punpckhwd %%xmm5,%%xmm1 \n"
2844 "sub $0x8,%1 \n"
2845 "movdqa %%xmm0,(%0) \n"
2846 "movdqa %%xmm1,0x10(%0) \n"
2847 "lea 0x20(%0),%0 \n"
2848 "jg 1b \n"
2849 : "+r"(dst_argb), // %0
2850 "+r"(width) // %1
2851 : "m"(kARGBToSepiaB), // %2
2852 "m"(kARGBToSepiaG), // %3
2853 "m"(kARGBToSepiaR) // %4
2854 : "memory", "cc"
2855#if defined(__SSE2__)
2856 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2857#endif
2858 );
2859}
2860#endif // HAS_ARGBSEPIAROW_SSSE3
2861
fbarchard@google.come442dc42012-06-18 17:37:09 +00002862#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
2863// Transform 8 ARGB pixels (32 bytes) with color matrix.
2864// Same as Sepia except matrix is provided.
// Works in place on dst_argb.  matrix_argb supplies three rows of four signed
// 8 bit weights (bytes 0-3, 4-7 and 8-11), each applied to the old
// (B, G, R, A) bytes to produce the new B, G and R respectively.  Only those
// 12 bytes are read; the alpha byte of each pixel is kept.
// Each weighted sum is shifted right by 7 arithmetically (psraw) so that a
// negative sum stays negative and is clamped to 0 by packuswb; sums above
// 255 are clamped to 255.  (A logical shift would turn negative sums into
// large positive values that saturate to 255.)
// dst_argb is accessed with movdqa, so it must be 16 byte aligned.  The count
// is tested after the loop body, so at least 8 pixels are always processed
// and width is effectively rounded up to a multiple of 8.
2865void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
2866 int width) {
2867 asm volatile (
    // Load each 4 byte matrix row and broadcast it to all 4 dword lanes:
    // xmm2/xmm3/xmm4 = weights producing the new B/G/R.
2868 "movd (%2),%%xmm2 \n"
2869 "movd 0x4(%2),%%xmm3 \n"
2870 "movd 0x8(%2),%%xmm4 \n"
2871 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2872 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2873 "pshufd $0x0,%%xmm4,%%xmm4 \n"
2874 // 8 pixel loop \n"
2875 ".p2align 4 \n"
2876 "1: \n"
    // xmm0 = 8 new B bytes.
2877 "movdqa (%0),%%xmm0 \n"
2878 "movdqa 0x10(%0),%%xmm6 \n"
2879 "pmaddubsw %%xmm2,%%xmm0 \n"
2880 "pmaddubsw %%xmm2,%%xmm6 \n"
2881 "phaddw %%xmm6,%%xmm0 \n"
2882 "psraw $0x7,%%xmm0 \n"
2883 "packuswb %%xmm0,%%xmm0 \n"
    // xmm5 = 8 new G bytes; interleave to (B, G) pairs in xmm0.
2884 "movdqa (%0),%%xmm5 \n"
2885 "movdqa 0x10(%0),%%xmm1 \n"
2886 "pmaddubsw %%xmm3,%%xmm5 \n"
2887 "pmaddubsw %%xmm3,%%xmm1 \n"
2888 "phaddw %%xmm1,%%xmm5 \n"
2889 "psraw $0x7,%%xmm5 \n"
2890 "packuswb %%xmm5,%%xmm5 \n"
2891 "punpcklbw %%xmm5,%%xmm0 \n"
    // xmm5 = 8 new R bytes.
2892 "movdqa (%0),%%xmm5 \n"
2893 "movdqa 0x10(%0),%%xmm1 \n"
2894 "pmaddubsw %%xmm4,%%xmm5 \n"
2895 "pmaddubsw %%xmm4,%%xmm1 \n"
2896 "phaddw %%xmm1,%%xmm5 \n"
2897 "psraw $0x7,%%xmm5 \n"
2898 "packuswb %%xmm5,%%xmm5 \n"
    // xmm6 = 8 original alpha bytes (unsigned, so a logical shift is correct
    // here); interleave to (R, A) pairs in xmm5.
2899 "movdqa (%0),%%xmm6 \n"
2900 "movdqa 0x10(%0),%%xmm1 \n"
2901 "psrld $0x18,%%xmm6 \n"
2902 "psrld $0x18,%%xmm1 \n"
2903 "packuswb %%xmm1,%%xmm6 \n"
2904 "packuswb %%xmm6,%%xmm6 \n"
2905 "punpcklbw %%xmm6,%%xmm5 \n"
    // Combine (B, G) and (R, A) pairs: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
2906 "movdqa %%xmm0,%%xmm1 \n"
2907 "punpcklwd %%xmm5,%%xmm0 \n"
2908 "punpckhwd %%xmm5,%%xmm1 \n"
2909 "sub $0x8,%1 \n"
2910 "movdqa %%xmm0,(%0) \n"
2911 "movdqa %%xmm1,0x10(%0) \n"
2912 "lea 0x20(%0),%0 \n"
2913 "jg 1b \n"
2914 : "+r"(dst_argb), // %0
2915 "+r"(width) // %1
2916 : "r"(matrix_argb) // %2
2917 : "memory", "cc"
2918#if defined(__SSE2__)
2919 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2920#endif
2921 );
2922}
2923#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
2924
fbarchard@google.comf51e8792012-06-10 02:40:04 +00002925#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
2926// Creates a table of cumulative sums where each value is a sum of all values
2927// above and to the left of the value, inclusive of the value.
// row supplies 4 bytes per pixel; every byte is widened to a 32 bit value and
// added to a running per-channel sum (xmm0).  For each pixel the running sum
// plus the matching 4 int32 entries of previous_cumsum is stored as 4 int32
// entries in cumsum.  width is the pixel count; width <= 0 writes nothing.
// The 4 pixel loop is used only when cumsum is 16 byte aligned; otherwise
// every pixel goes through the 1 pixel loop, which uses unaligned accesses.
// NOTE(review): the 4 pixel loop also reads previous_cumsum with movdqa, but
// only cumsum's alignment is tested - confirm callers keep previous_cumsum
// 16 byte aligned whenever cumsum is.
2928void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
fbarchard@google.com133adc42012-06-12 05:46:49 +00002929 const int32* previous_cumsum, int width) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00002930 asm volatile (
    // Turn previous_cumsum into an offset from cumsum so both advance together.
2931 "sub %1,%2 \n"
    // xmm0 = running sum, xmm1 = zero (used to widen bytes and words).
2932 "pxor %%xmm0,%%xmm0 \n"
2933 "pxor %%xmm1,%%xmm1 \n"
    // %3 = pixels remaining - 4 while in the 4 pixel path.
2934 "sub $0x4,%3 \n"
2935 "jl 49f \n"
2936 "test $0xf,%1 \n"
2937 "jne 49f \n"
2938
2939 // 4 pixel loop \n"
2940 ".p2align 2 \n"
2941 "40: \n"
    // Widen 16 bytes to 4 vectors of 4 int32: xmm2, xmm3, xmm4, xmm5 hold
    // pixels 0, 1, 2, 3.
2942 "movdqu (%0),%%xmm2 \n"
2943 "lea 0x10(%0),%0 \n"
2944 "movdqa %%xmm2,%%xmm4 \n"
2945 "punpcklbw %%xmm1,%%xmm2 \n"
2946 "movdqa %%xmm2,%%xmm3 \n"
2947 "punpcklwd %%xmm1,%%xmm2 \n"
2948 "punpckhwd %%xmm1,%%xmm3 \n"
2949 "punpckhbw %%xmm1,%%xmm4 \n"
2950 "movdqa %%xmm4,%%xmm5 \n"
2951 "punpcklwd %%xmm1,%%xmm4 \n"
2952 "punpckhwd %%xmm1,%%xmm5 \n"
    // For each pixel in turn: add it to the running sum, then output the
    // running sum plus the previous row's entry.
2953 "paddd %%xmm2,%%xmm0 \n"
2954 "movdqa (%1,%2,1),%%xmm2 \n"
2955 "paddd %%xmm0,%%xmm2 \n"
2956 "paddd %%xmm3,%%xmm0 \n"
2957 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
2958 "paddd %%xmm0,%%xmm3 \n"
2959 "paddd %%xmm4,%%xmm0 \n"
2960 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
2961 "paddd %%xmm0,%%xmm4 \n"
2962 "paddd %%xmm5,%%xmm0 \n"
2963 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
2964 "paddd %%xmm0,%%xmm5 \n"
2965 "movdqa %%xmm2,(%1) \n"
2966 "movdqa %%xmm3,0x10(%1) \n"
2967 "movdqa %%xmm4,0x20(%1) \n"
2968 "movdqa %%xmm5,0x30(%1) \n"
2969 "lea 0x40(%1),%1 \n"
2970 "sub $0x4,%3 \n"
2971 "jge 40b \n"
2972
2973 "49: \n"
    // Rebias the counter to (pixels remaining - 1); exit if none remain.
2974 "add $0x3,%3 \n"
2975 "jl 19f \n"
2976
2977 // 1 pixel loop \n"
2978 ".p2align 2 \n"
2979 "10: \n"
2980 "movd (%0),%%xmm2 \n"
2981 "lea 0x4(%0),%0 \n"
fbarchard@google.comf38aefe2012-06-12 00:29:29 +00002982 "punpcklbw %%xmm1,%%xmm2 \n"
2983 "punpcklwd %%xmm1,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00002984 "paddd %%xmm2,%%xmm0 \n"
2985 "movdqu (%1,%2,1),%%xmm2 \n"
2986 "paddd %%xmm0,%%xmm2 \n"
2987 "movdqu %%xmm2,(%1) \n"
2988 "lea 0x10(%1),%1 \n"
2989 "sub $0x1,%3 \n"
2990 "jge 10b \n"
2991
2992 "19: \n"
2993 : "+r"(row), // %0
2994 "+r"(cumsum), // %1
2995 "+r"(previous_cumsum), // %2
2996 "+r"(width) // %3
2997 :
2998 : "memory", "cc"
2999#if defined(__SSE2__)
3000 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3001#endif
3002 );
3003}
3004#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
3005
3006#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts box sums taken from a cumulative sum table into averaged pixels.
// For each output pixel, with 4 int32 entries per pixel:
//   sum = topleft[0..3] - topleft[width + 0..3]
//       - botleft[0..3] + botleft[width + 0..3]
// The sum is multiplied by an approximate reciprocal of area (rcpss),
// converted back to integers and packed with saturation into 4 bytes of dst.
// width is an offset counted in int32 entries (it is scaled by 4 in the
// addressing); count is the number of output pixels, and count <= 0 writes
// nothing.
// topleft is loaded with movdqa and the psubd/paddd memory operands also
// need 16 byte aligned addresses; dst is written with movdqu/movd and has no
// alignment requirement.
3007void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3008 int width, int area, uint8* dst, int count) {
3009 asm volatile (
    // xmm4 = approximate 1.0f / area in all 4 lanes.
3010 "movd %5,%%xmm4 \n"
3011 "cvtdq2ps %%xmm4,%%xmm4 \n"
3012 "rcpss %%xmm4,%%xmm4 \n"
3013 "pshufd $0x0,%%xmm4,%%xmm4 \n"
    // %3 = pixels remaining - 4 while in the 4 pixel path.
3014 "sub $0x4,%3 \n"
3015 "jl 49f \n"
3016
3017 // 4 pixel loop \n"
3018 ".p2align 2 \n"
3019 "40: \n"
    // xmm0..xmm3 = box sums for 4 consecutive pixels.
3020 "movdqa (%0),%%xmm0 \n"
3021 "movdqa 0x10(%0),%%xmm1 \n"
3022 "movdqa 0x20(%0),%%xmm2 \n"
3023 "movdqa 0x30(%0),%%xmm3 \n"
3024 "psubd (%0,%4,4),%%xmm0 \n"
3025 "psubd 0x10(%0,%4,4),%%xmm1 \n"
3026 "psubd 0x20(%0,%4,4),%%xmm2 \n"
3027 "psubd 0x30(%0,%4,4),%%xmm3 \n"
3028 "lea 0x40(%0),%0 \n"
3029 "psubd (%1),%%xmm0 \n"
3030 "psubd 0x10(%1),%%xmm1 \n"
3031 "psubd 0x20(%1),%%xmm2 \n"
3032 "psubd 0x30(%1),%%xmm3 \n"
3033 "paddd (%1,%4,4),%%xmm0 \n"
3034 "paddd 0x10(%1,%4,4),%%xmm1 \n"
3035 "paddd 0x20(%1,%4,4),%%xmm2 \n"
3036 "paddd 0x30(%1,%4,4),%%xmm3 \n"
3037 "lea 0x40(%1),%1 \n"
    // Scale by 1 / area in floating point, then convert back to int32.
3038 "cvtdq2ps %%xmm0,%%xmm0 \n"
3039 "cvtdq2ps %%xmm1,%%xmm1 \n"
3040 "mulps %%xmm4,%%xmm0 \n"
3041 "mulps %%xmm4,%%xmm1 \n"
3042 "cvtdq2ps %%xmm2,%%xmm2 \n"
3043 "cvtdq2ps %%xmm3,%%xmm3 \n"
3044 "mulps %%xmm4,%%xmm2 \n"
3045 "mulps %%xmm4,%%xmm3 \n"
3046 "cvtps2dq %%xmm0,%%xmm0 \n"
3047 "cvtps2dq %%xmm1,%%xmm1 \n"
3048 "cvtps2dq %%xmm2,%%xmm2 \n"
3049 "cvtps2dq %%xmm3,%%xmm3 \n"
    // Narrow int32 -> int16 -> uint8 with saturation: 16 bytes = 4 pixels.
3050 "packssdw %%xmm1,%%xmm0 \n"
3051 "packssdw %%xmm3,%%xmm2 \n"
3052 "packuswb %%xmm2,%%xmm0 \n"
3053 "movdqu %%xmm0,(%2) \n"
3054 "lea 0x10(%2),%2 \n"
3055 "sub $0x4,%3 \n"
3056 "jge 40b \n"
3057
3058 "49: \n"
    // Rebias the counter to (pixels remaining - 1); exit if none remain.
3059 "add $0x3,%3 \n"
3060 "jl 19f \n"
3061
3062 // 1 pixel loop \n"
3063 ".p2align 2 \n"
3064 "10: \n"
3065 "movdqa (%0),%%xmm0 \n"
3066 "psubd (%0,%4,4),%%xmm0 \n"
3067 "lea 0x10(%0),%0 \n"
3068 "psubd (%1),%%xmm0 \n"
3069 "paddd (%1,%4,4),%%xmm0 \n"
3070 "lea 0x10(%1),%1 \n"
3071 "cvtdq2ps %%xmm0,%%xmm0 \n"
3072 "mulps %%xmm4,%%xmm0 \n"
3073 "cvtps2dq %%xmm0,%%xmm0 \n"
3074 "packssdw %%xmm0,%%xmm0 \n"
3075 "packuswb %%xmm0,%%xmm0 \n"
3076 "movd %%xmm0,(%2) \n"
3077 "lea 0x4(%2),%2 \n"
3078 "sub $0x1,%3 \n"
3079 "jge 10b \n"
3080 "19: \n"
3081 : "+r"(topleft), // %0
3082 "+r"(botleft), // %1
3083 "+r"(dst), // %2
3084 "+rm"(count) // %3
3085 : "r"(static_cast<intptr_t>(width)), // %4
3086 "rm"(area) // %5
3087 : "memory", "cc"
3088#if defined(__SSE2__)
3089 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3090#endif
3091 );
3092}
3093#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
3094
3095
fbarchard@google.com2d11d432012-02-16 02:50:39 +00003096#endif // defined(__x86_64__) || defined(__i386__)
3097
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003098#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003099} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003100} // namespace libyuv
3101#endif