blob: 0fdb0923bdbbee0d06bf73eda1c3bffd20d43e97 [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
fbarchard@google.com2b9c2102012-03-22 22:36:44 +000011#include "source/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
fbarchard@google.com714050a2012-02-17 22:59:56 +000023// GCC 4.2 on OSX has link error when passing static or const to inline.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000025#ifdef __APPLE__
26#define CONST
27#else
28#define CONST static const
29#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000101// Shuffle table for converting ARGB to RGB24.
102CONST uvec8 kShuffleMaskARGBToRGB24 = {
103 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
104};
105
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000106// Shuffle table for converting ARGB to RAW.
107CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000108 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000109};
110
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000112 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000113 "pcmpeqb %%xmm5,%%xmm5 \n"
114 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000115 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000116 "1: \n"
117 "movq (%0),%%xmm0 \n"
118 "lea 0x8(%0),%0 \n"
119 "punpcklbw %%xmm0,%%xmm0 \n"
120 "movdqa %%xmm0,%%xmm1 \n"
121 "punpcklwd %%xmm0,%%xmm0 \n"
122 "punpckhwd %%xmm1,%%xmm1 \n"
123 "por %%xmm5,%%xmm0 \n"
124 "por %%xmm5,%%xmm1 \n"
125 "movdqa %%xmm0,(%1) \n"
126 "movdqa %%xmm1,0x10(%1) \n"
127 "lea 0x20(%1),%1 \n"
128 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000129 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000130 : "+r"(src_y), // %0
131 "+r"(dst_argb), // %1
132 "+r"(pix) // %2
133 :
134 : "memory", "cc"
135#if defined(__SSE2__)
136 , "xmm0", "xmm1", "xmm5"
137#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000138 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000139}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000140
141void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000142 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000143 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000144 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000145 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "1: \n"
147 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000148 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000150 "movdqa %%xmm0,(%0,%1,1) \n"
151 "lea 0x10(%0),%0 \n"
152 "jg 1b \n"
153
fbarchard@google.comb6149762011-11-07 21:58:52 +0000154 : "+r"(src_abgr), // %0
155 "+r"(dst_argb), // %1
156 "+r"(pix) // %2
157 : "m"(kShuffleMaskABGRToARGB) // %3
158 : "memory", "cc"
159#if defined(__SSE2__)
160 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000161#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000162 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163}
164
165void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000166 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000167 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000168 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000169 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000170 "1: \n"
171 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000172 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000173 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000174 "movdqa %%xmm0,(%0,%1,1) \n"
175 "lea 0x10(%0),%0 \n"
176 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000177 : "+r"(src_bgra), // %0
178 "+r"(dst_argb), // %1
179 "+r"(pix) // %2
180 : "m"(kShuffleMaskBGRAToARGB) // %3
181 : "memory", "cc"
182#if defined(__SSE2__)
183 , "xmm0", "xmm5"
184#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000185 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000186}
187
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000188void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000189 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000190 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
191 "pslld $0x18,%%xmm5 \n"
192 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000193 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000194 "1: \n"
195 "movdqu (%0),%%xmm0 \n"
196 "movdqu 0x10(%0),%%xmm1 \n"
197 "movdqu 0x20(%0),%%xmm3 \n"
198 "lea 0x30(%0),%0 \n"
199 "movdqa %%xmm3,%%xmm2 \n"
200 "palignr $0x8,%%xmm1,%%xmm2 \n"
201 "pshufb %%xmm4,%%xmm2 \n"
202 "por %%xmm5,%%xmm2 \n"
203 "palignr $0xc,%%xmm0,%%xmm1 \n"
204 "pshufb %%xmm4,%%xmm0 \n"
205 "movdqa %%xmm2,0x20(%1) \n"
206 "por %%xmm5,%%xmm0 \n"
207 "pshufb %%xmm4,%%xmm1 \n"
208 "movdqa %%xmm0,(%1) \n"
209 "por %%xmm5,%%xmm1 \n"
210 "palignr $0x4,%%xmm3,%%xmm3 \n"
211 "pshufb %%xmm4,%%xmm3 \n"
212 "movdqa %%xmm1,0x10(%1) \n"
213 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000214 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000215 "movdqa %%xmm3,0x30(%1) \n"
216 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000217 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000218 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000219 "+r"(dst_argb), // %1
220 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000221 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000222 : "memory", "cc"
223#if defined(__SSE2__)
224 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
225#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000226 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000227}
228
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000229void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000230 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000231 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
232 "pslld $0x18,%%xmm5 \n"
233 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000234 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000235 "1: \n"
236 "movdqu (%0),%%xmm0 \n"
237 "movdqu 0x10(%0),%%xmm1 \n"
238 "movdqu 0x20(%0),%%xmm3 \n"
239 "lea 0x30(%0),%0 \n"
240 "movdqa %%xmm3,%%xmm2 \n"
241 "palignr $0x8,%%xmm1,%%xmm2 \n"
242 "pshufb %%xmm4,%%xmm2 \n"
243 "por %%xmm5,%%xmm2 \n"
244 "palignr $0xc,%%xmm0,%%xmm1 \n"
245 "pshufb %%xmm4,%%xmm0 \n"
246 "movdqa %%xmm2,0x20(%1) \n"
247 "por %%xmm5,%%xmm0 \n"
248 "pshufb %%xmm4,%%xmm1 \n"
249 "movdqa %%xmm0,(%1) \n"
250 "por %%xmm5,%%xmm1 \n"
251 "palignr $0x4,%%xmm3,%%xmm3 \n"
252 "pshufb %%xmm4,%%xmm3 \n"
253 "movdqa %%xmm1,0x10(%1) \n"
254 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000255 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000256 "movdqa %%xmm3,0x30(%1) \n"
257 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000258 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000259 : "+r"(src_raw), // %0
260 "+r"(dst_argb), // %1
261 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000262 : "m"(kShuffleMaskRAWToARGB) // %3
263 : "memory", "cc"
264#if defined(__SSE2__)
265 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
266#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000267 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000268}
269
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000270void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000271 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000272 "mov $0x1080108,%%eax \n"
273 "movd %%eax,%%xmm5 \n"
274 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000275 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000276 "movd %%eax,%%xmm6 \n"
277 "pshufd $0x0,%%xmm6,%%xmm6 \n"
278 "pcmpeqb %%xmm3,%%xmm3 \n"
279 "psllw $0xb,%%xmm3 \n"
280 "pcmpeqb %%xmm4,%%xmm4 \n"
281 "psllw $0xa,%%xmm4 \n"
282 "psrlw $0x5,%%xmm4 \n"
283 "pcmpeqb %%xmm7,%%xmm7 \n"
284 "psllw $0x8,%%xmm7 \n"
285 "sub %0,%1 \n"
286 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000287 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000288 "1: \n"
289 "movdqu (%0),%%xmm0 \n"
290 "movdqa %%xmm0,%%xmm1 \n"
291 "movdqa %%xmm0,%%xmm2 \n"
292 "pand %%xmm3,%%xmm1 \n"
293 "psllw $0xb,%%xmm2 \n"
294 "pmulhuw %%xmm5,%%xmm1 \n"
295 "pmulhuw %%xmm5,%%xmm2 \n"
296 "psllw $0x8,%%xmm1 \n"
297 "por %%xmm2,%%xmm1 \n"
298 "pand %%xmm4,%%xmm0 \n"
299 "pmulhuw %%xmm6,%%xmm0 \n"
300 "por %%xmm7,%%xmm0 \n"
301 "movdqa %%xmm1,%%xmm2 \n"
302 "punpcklbw %%xmm0,%%xmm1 \n"
303 "punpckhbw %%xmm0,%%xmm2 \n"
304 "movdqa %%xmm1,(%1,%0,2) \n"
305 "movdqa %%xmm2,0x10(%1,%0,2) \n"
306 "lea 0x10(%0),%0 \n"
307 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000308 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000309 : "+r"(src), // %0
310 "+r"(dst), // %1
311 "+r"(pix) // %2
312 :
313 : "memory", "cc", "eax"
314#if defined(__SSE2__)
315 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
316#endif
317 );
318}
319
320void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000321 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000322 "mov $0x1080108,%%eax \n"
323 "movd %%eax,%%xmm5 \n"
324 "pshufd $0x0,%%xmm5,%%xmm5 \n"
325 "mov $0x42004200,%%eax \n"
326 "movd %%eax,%%xmm6 \n"
327 "pshufd $0x0,%%xmm6,%%xmm6 \n"
328 "pcmpeqb %%xmm3,%%xmm3 \n"
329 "psllw $0xb,%%xmm3 \n"
330 "movdqa %%xmm3,%%xmm4 \n"
331 "psrlw $0x6,%%xmm4 \n"
332 "pcmpeqb %%xmm7,%%xmm7 \n"
333 "psllw $0x8,%%xmm7 \n"
334 "sub %0,%1 \n"
335 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000336 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000337 "1: \n"
338 "movdqu (%0),%%xmm0 \n"
339 "movdqa %%xmm0,%%xmm1 \n"
340 "movdqa %%xmm0,%%xmm2 \n"
341 "psllw $0x1,%%xmm1 \n"
342 "psllw $0xb,%%xmm2 \n"
343 "pand %%xmm3,%%xmm1 \n"
344 "pmulhuw %%xmm5,%%xmm2 \n"
345 "pmulhuw %%xmm5,%%xmm1 \n"
346 "psllw $0x8,%%xmm1 \n"
347 "por %%xmm2,%%xmm1 \n"
348 "movdqa %%xmm0,%%xmm2 \n"
349 "pand %%xmm4,%%xmm0 \n"
350 "psraw $0x8,%%xmm2 \n"
351 "pmulhuw %%xmm6,%%xmm0 \n"
352 "pand %%xmm7,%%xmm2 \n"
353 "por %%xmm2,%%xmm0 \n"
354 "movdqa %%xmm1,%%xmm2 \n"
355 "punpcklbw %%xmm0,%%xmm1 \n"
356 "punpckhbw %%xmm0,%%xmm2 \n"
357 "movdqa %%xmm1,(%1,%0,2) \n"
358 "movdqa %%xmm2,0x10(%1,%0,2) \n"
359 "lea 0x10(%0),%0 \n"
360 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000361 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000362 : "+r"(src), // %0
363 "+r"(dst), // %1
364 "+r"(pix) // %2
365 :
366 : "memory", "cc", "eax"
367#if defined(__SSE2__)
368 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
369#endif
370 );
371}
372
373void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000374 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000375 "mov $0xf0f0f0f,%%eax \n"
376 "movd %%eax,%%xmm4 \n"
377 "pshufd $0x0,%%xmm4,%%xmm4 \n"
378 "movdqa %%xmm4,%%xmm5 \n"
379 "pslld $0x4,%%xmm5 \n"
380 "sub %0,%1 \n"
381 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000382 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000383 "1: \n"
384 "movdqu (%0),%%xmm0 \n"
385 "movdqa %%xmm0,%%xmm2 \n"
386 "pand %%xmm4,%%xmm0 \n"
387 "pand %%xmm5,%%xmm2 \n"
388 "movdqa %%xmm0,%%xmm1 \n"
389 "movdqa %%xmm2,%%xmm3 \n"
390 "psllw $0x4,%%xmm1 \n"
391 "psrlw $0x4,%%xmm3 \n"
392 "por %%xmm1,%%xmm0 \n"
393 "por %%xmm3,%%xmm2 \n"
394 "movdqa %%xmm0,%%xmm1 \n"
395 "punpcklbw %%xmm2,%%xmm0 \n"
396 "punpckhbw %%xmm2,%%xmm1 \n"
397 "movdqa %%xmm0,(%1,%0,2) \n"
398 "movdqa %%xmm1,0x10(%1,%0,2) \n"
399 "lea 0x10(%0),%0 \n"
400 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000401 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000402 : "+r"(src), // %0
403 "+r"(dst), // %1
404 "+r"(pix) // %2
405 :
406 : "memory", "cc", "eax"
407#if defined(__SSE2__)
408 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
409#endif
410 );
411}
412
413void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000414 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000415 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000416 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000417 "1: \n"
418 "movdqa (%0),%%xmm0 \n"
419 "movdqa 0x10(%0),%%xmm1 \n"
420 "movdqa 0x20(%0),%%xmm2 \n"
421 "movdqa 0x30(%0),%%xmm3 \n"
422 "lea 0x40(%0),%0 \n"
423 "pshufb %%xmm6,%%xmm0 \n"
424 "pshufb %%xmm6,%%xmm1 \n"
425 "pshufb %%xmm6,%%xmm2 \n"
426 "pshufb %%xmm6,%%xmm3 \n"
427 "movdqa %%xmm1,%%xmm4 \n"
428 "psrldq $0x4,%%xmm1 \n"
429 "pslldq $0xc,%%xmm4 \n"
430 "movdqa %%xmm2,%%xmm5 \n"
431 "por %%xmm4,%%xmm0 \n"
432 "pslldq $0x8,%%xmm5 \n"
433 "movdqa %%xmm0,(%1) \n"
434 "por %%xmm5,%%xmm1 \n"
435 "psrldq $0x8,%%xmm2 \n"
436 "pslldq $0x4,%%xmm3 \n"
437 "por %%xmm3,%%xmm2 \n"
438 "movdqa %%xmm1,0x10(%1) \n"
439 "movdqa %%xmm2,0x20(%1) \n"
440 "lea 0x30(%1),%1 \n"
441 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000442 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000443 : "+r"(src), // %0
444 "+r"(dst), // %1
445 "+r"(pix) // %2
446 : "m"(kShuffleMaskARGBToRGB24) // %3
447 : "memory", "cc"
448#if defined(__SSE2__)
449 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
450#endif
451 );
452}
453
454void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000455 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000456 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000457 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000458 "1: \n"
459 "movdqa (%0),%%xmm0 \n"
460 "movdqa 0x10(%0),%%xmm1 \n"
461 "movdqa 0x20(%0),%%xmm2 \n"
462 "movdqa 0x30(%0),%%xmm3 \n"
463 "lea 0x40(%0),%0 \n"
464 "pshufb %%xmm6,%%xmm0 \n"
465 "pshufb %%xmm6,%%xmm1 \n"
466 "pshufb %%xmm6,%%xmm2 \n"
467 "pshufb %%xmm6,%%xmm3 \n"
468 "movdqa %%xmm1,%%xmm4 \n"
469 "psrldq $0x4,%%xmm1 \n"
470 "pslldq $0xc,%%xmm4 \n"
471 "movdqa %%xmm2,%%xmm5 \n"
472 "por %%xmm4,%%xmm0 \n"
473 "pslldq $0x8,%%xmm5 \n"
474 "movdqa %%xmm0,(%1) \n"
475 "por %%xmm5,%%xmm1 \n"
476 "psrldq $0x8,%%xmm2 \n"
477 "pslldq $0x4,%%xmm3 \n"
478 "por %%xmm3,%%xmm2 \n"
479 "movdqa %%xmm1,0x10(%1) \n"
480 "movdqa %%xmm2,0x20(%1) \n"
481 "lea 0x30(%1),%1 \n"
482 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000483 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000484 : "+r"(src), // %0
485 "+r"(dst), // %1
486 "+r"(pix) // %2
487 : "m"(kShuffleMaskARGBToRAW) // %3
488 : "memory", "cc"
489#if defined(__SSE2__)
490 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
491#endif
492 );
493}
494
495void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000496 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000497 "pcmpeqb %%xmm3,%%xmm3 \n"
498 "psrld $0x1b,%%xmm3 \n"
499 "pcmpeqb %%xmm4,%%xmm4 \n"
500 "psrld $0x1a,%%xmm4 \n"
501 "pslld $0x5,%%xmm4 \n"
502 "pcmpeqb %%xmm5,%%xmm5 \n"
503 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000504 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000505 "1: \n"
506 "movdqa (%0),%%xmm0 \n"
507 "movdqa %%xmm0,%%xmm1 \n"
508 "movdqa %%xmm0,%%xmm2 \n"
509 "pslld $0x8,%%xmm0 \n"
510 "psrld $0x3,%%xmm1 \n"
511 "psrld $0x5,%%xmm2 \n"
512 "psrad $0x10,%%xmm0 \n"
513 "pand %%xmm3,%%xmm1 \n"
514 "pand %%xmm4,%%xmm2 \n"
515 "pand %%xmm5,%%xmm0 \n"
516 "por %%xmm2,%%xmm1 \n"
517 "por %%xmm1,%%xmm0 \n"
518 "packssdw %%xmm0,%%xmm0 \n"
519 "lea 0x10(%0),%0 \n"
520 "movq %%xmm0,(%1) \n"
521 "lea 0x8(%1),%1 \n"
522 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000523 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000524 : "+r"(src), // %0
525 "+r"(dst), // %1
526 "+r"(pix) // %2
527 :
528 : "memory", "cc"
529#if defined(__SSE2__)
530 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
531#endif
532 );
533}
534
535void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000536 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000537 "pcmpeqb %%xmm4,%%xmm4 \n"
538 "psrld $0x1b,%%xmm4 \n"
539 "movdqa %%xmm4,%%xmm5 \n"
540 "pslld $0x5,%%xmm5 \n"
541 "movdqa %%xmm4,%%xmm6 \n"
542 "pslld $0xa,%%xmm6 \n"
543 "pcmpeqb %%xmm7,%%xmm7 \n"
544 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000545 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000546 "1: \n"
547 "movdqa (%0),%%xmm0 \n"
548 "movdqa %%xmm0,%%xmm1 \n"
549 "movdqa %%xmm0,%%xmm2 \n"
550 "movdqa %%xmm0,%%xmm3 \n"
551 "psrad $0x10,%%xmm0 \n"
552 "psrld $0x3,%%xmm1 \n"
553 "psrld $0x6,%%xmm2 \n"
554 "psrld $0x9,%%xmm3 \n"
555 "pand %%xmm7,%%xmm0 \n"
556 "pand %%xmm4,%%xmm1 \n"
557 "pand %%xmm5,%%xmm2 \n"
558 "pand %%xmm6,%%xmm3 \n"
559 "por %%xmm1,%%xmm0 \n"
560 "por %%xmm3,%%xmm2 \n"
561 "por %%xmm2,%%xmm0 \n"
562 "packssdw %%xmm0,%%xmm0 \n"
563 "lea 0x10(%0),%0 \n"
564 "movq %%xmm0,(%1) \n"
565 "lea 0x8(%1),%1 \n"
566 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000567 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000568 : "+r"(src), // %0
569 "+r"(dst), // %1
570 "+r"(pix) // %2
571 :
572 : "memory", "cc"
573#if defined(__SSE2__)
574 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
575#endif
576 );
577}
578
579void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000580 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000581 "pcmpeqb %%xmm4,%%xmm4 \n"
582 "psllw $0xc,%%xmm4 \n"
583 "movdqa %%xmm4,%%xmm3 \n"
584 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000585 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000586 "1: \n"
587 "movdqa (%0),%%xmm0 \n"
588 "movdqa %%xmm0,%%xmm1 \n"
589 "pand %%xmm3,%%xmm0 \n"
590 "pand %%xmm4,%%xmm1 \n"
591 "psrlq $0x4,%%xmm0 \n"
592 "psrlq $0x8,%%xmm1 \n"
593 "por %%xmm1,%%xmm0 \n"
594 "packuswb %%xmm0,%%xmm0 \n"
595 "lea 0x10(%0),%0 \n"
596 "movq %%xmm0,(%1) \n"
597 "lea 0x8(%1),%1 \n"
598 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000599 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000600 : "+r"(src), // %0
601 "+r"(dst), // %1
602 "+r"(pix) // %2
603 :
604 : "memory", "cc"
605#if defined(__SSE2__)
606 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
607#endif
608 );
609}
610
fbarchard@google.comb6149762011-11-07 21:58:52 +0000611void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000612 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000613 "movdqa %4,%%xmm5 \n"
614 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000615 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000616 "1: \n"
617 "movdqa (%0),%%xmm0 \n"
618 "movdqa 0x10(%0),%%xmm1 \n"
619 "movdqa 0x20(%0),%%xmm2 \n"
620 "movdqa 0x30(%0),%%xmm3 \n"
621 "pmaddubsw %%xmm4,%%xmm0 \n"
622 "pmaddubsw %%xmm4,%%xmm1 \n"
623 "pmaddubsw %%xmm4,%%xmm2 \n"
624 "pmaddubsw %%xmm4,%%xmm3 \n"
625 "lea 0x40(%0),%0 \n"
626 "phaddw %%xmm1,%%xmm0 \n"
627 "phaddw %%xmm3,%%xmm2 \n"
628 "psrlw $0x7,%%xmm0 \n"
629 "psrlw $0x7,%%xmm2 \n"
630 "packuswb %%xmm2,%%xmm0 \n"
631 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000632 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000633 "movdqa %%xmm0,(%1) \n"
634 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000635 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000636 : "+r"(src_argb), // %0
637 "+r"(dst_y), // %1
638 "+r"(pix) // %2
639 : "m"(kARGBToY), // %3
640 "m"(kAddY16) // %4
641 : "memory", "cc"
642#if defined(__SSE2__)
643 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
644#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000645 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000646}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000647
648void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000649 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000650 "movdqa %4,%%xmm5 \n"
651 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000652 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000653 "1: \n"
654 "movdqu (%0),%%xmm0 \n"
655 "movdqu 0x10(%0),%%xmm1 \n"
656 "movdqu 0x20(%0),%%xmm2 \n"
657 "movdqu 0x30(%0),%%xmm3 \n"
658 "pmaddubsw %%xmm4,%%xmm0 \n"
659 "pmaddubsw %%xmm4,%%xmm1 \n"
660 "pmaddubsw %%xmm4,%%xmm2 \n"
661 "pmaddubsw %%xmm4,%%xmm3 \n"
662 "lea 0x40(%0),%0 \n"
663 "phaddw %%xmm1,%%xmm0 \n"
664 "phaddw %%xmm3,%%xmm2 \n"
665 "psrlw $0x7,%%xmm0 \n"
666 "psrlw $0x7,%%xmm2 \n"
667 "packuswb %%xmm2,%%xmm0 \n"
668 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000669 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000670 "movdqu %%xmm0,(%1) \n"
671 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000672 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000673 : "+r"(src_argb), // %0
674 "+r"(dst_y), // %1
675 "+r"(pix) // %2
676 : "m"(kARGBToY), // %3
677 "m"(kAddY16) // %4
678 : "memory", "cc"
679#if defined(__SSE2__)
680 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
681#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000682 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000683}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000684
fbarchard@google.com714050a2012-02-17 22:59:56 +0000685// TODO(fbarchard): pass xmm constants to single block of assembly.
686// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
687// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
688// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
689// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000690void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
691 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000692 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000693 "movdqa %0,%%xmm4 \n"
694 "movdqa %1,%%xmm3 \n"
695 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000696 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000697 : "m"(kARGBToU), // %0
698 "m"(kARGBToV), // %1
699 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000700 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000701 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000702 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000703 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000704 "1: \n"
705 "movdqa (%0),%%xmm0 \n"
706 "movdqa 0x10(%0),%%xmm1 \n"
707 "movdqa 0x20(%0),%%xmm2 \n"
708 "movdqa 0x30(%0),%%xmm6 \n"
709 "pavgb (%0,%4,1),%%xmm0 \n"
710 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
711 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
712 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
713 "lea 0x40(%0),%0 \n"
714 "movdqa %%xmm0,%%xmm7 \n"
715 "shufps $0x88,%%xmm1,%%xmm0 \n"
716 "shufps $0xdd,%%xmm1,%%xmm7 \n"
717 "pavgb %%xmm7,%%xmm0 \n"
718 "movdqa %%xmm2,%%xmm7 \n"
719 "shufps $0x88,%%xmm6,%%xmm2 \n"
720 "shufps $0xdd,%%xmm6,%%xmm7 \n"
721 "pavgb %%xmm7,%%xmm2 \n"
722 "movdqa %%xmm0,%%xmm1 \n"
723 "movdqa %%xmm2,%%xmm6 \n"
724 "pmaddubsw %%xmm4,%%xmm0 \n"
725 "pmaddubsw %%xmm4,%%xmm2 \n"
726 "pmaddubsw %%xmm3,%%xmm1 \n"
727 "pmaddubsw %%xmm3,%%xmm6 \n"
728 "phaddw %%xmm2,%%xmm0 \n"
729 "phaddw %%xmm6,%%xmm1 \n"
730 "psraw $0x8,%%xmm0 \n"
731 "psraw $0x8,%%xmm1 \n"
732 "packsswb %%xmm1,%%xmm0 \n"
733 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000734 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000735 "movlps %%xmm0,(%1) \n"
736 "movhps %%xmm0,(%1,%2,1) \n"
737 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000738 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000739 : "+r"(src_argb0), // %0
740 "+r"(dst_u), // %1
741 "+r"(dst_v), // %2
742 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000743 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000744 : "memory", "cc"
745#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000746 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000747#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000748 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000749}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000750
751void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
752 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000753 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000754 "movdqa %0,%%xmm4 \n"
755 "movdqa %1,%%xmm3 \n"
756 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000757 :
758 : "m"(kARGBToU), // %0
759 "m"(kARGBToV), // %1
760 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000761 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000762 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000763 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000764 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000765 "1: \n"
766 "movdqu (%0),%%xmm0 \n"
767 "movdqu 0x10(%0),%%xmm1 \n"
768 "movdqu 0x20(%0),%%xmm2 \n"
769 "movdqu 0x30(%0),%%xmm6 \n"
770 "movdqu (%0,%4,1),%%xmm7 \n"
771 "pavgb %%xmm7,%%xmm0 \n"
772 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
773 "pavgb %%xmm7,%%xmm1 \n"
774 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
775 "pavgb %%xmm7,%%xmm2 \n"
776 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
777 "pavgb %%xmm7,%%xmm6 \n"
778 "lea 0x40(%0),%0 \n"
779 "movdqa %%xmm0,%%xmm7 \n"
780 "shufps $0x88,%%xmm1,%%xmm0 \n"
781 "shufps $0xdd,%%xmm1,%%xmm7 \n"
782 "pavgb %%xmm7,%%xmm0 \n"
783 "movdqa %%xmm2,%%xmm7 \n"
784 "shufps $0x88,%%xmm6,%%xmm2 \n"
785 "shufps $0xdd,%%xmm6,%%xmm7 \n"
786 "pavgb %%xmm7,%%xmm2 \n"
787 "movdqa %%xmm0,%%xmm1 \n"
788 "movdqa %%xmm2,%%xmm6 \n"
789 "pmaddubsw %%xmm4,%%xmm0 \n"
790 "pmaddubsw %%xmm4,%%xmm2 \n"
791 "pmaddubsw %%xmm3,%%xmm1 \n"
792 "pmaddubsw %%xmm3,%%xmm6 \n"
793 "phaddw %%xmm2,%%xmm0 \n"
794 "phaddw %%xmm6,%%xmm1 \n"
795 "psraw $0x8,%%xmm0 \n"
796 "psraw $0x8,%%xmm1 \n"
797 "packsswb %%xmm1,%%xmm0 \n"
798 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000799 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000800 "movlps %%xmm0,(%1) \n"
801 "movhps %%xmm0,(%1,%2,1) \n"
802 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000803 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000804 : "+r"(src_argb0), // %0
805 "+r"(dst_u), // %1
806 "+r"(dst_v), // %2
807 "+rm"(width) // %3
808 : "r"(static_cast<intptr_t>(src_stride_argb))
809 : "memory", "cc"
810#if defined(__SSE2__)
811 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
812#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000813 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000814}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000815
fbarchard@google.com714050a2012-02-17 22:59:56 +0000816void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000817 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000818 "movdqa %4,%%xmm5 \n"
819 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000820 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000821 "1: \n"
822 "movdqa (%0),%%xmm0 \n"
823 "movdqa 0x10(%0),%%xmm1 \n"
824 "movdqa 0x20(%0),%%xmm2 \n"
825 "movdqa 0x30(%0),%%xmm3 \n"
826 "pmaddubsw %%xmm4,%%xmm0 \n"
827 "pmaddubsw %%xmm4,%%xmm1 \n"
828 "pmaddubsw %%xmm4,%%xmm2 \n"
829 "pmaddubsw %%xmm4,%%xmm3 \n"
830 "lea 0x40(%0),%0 \n"
831 "phaddw %%xmm1,%%xmm0 \n"
832 "phaddw %%xmm3,%%xmm2 \n"
833 "psrlw $0x7,%%xmm0 \n"
834 "psrlw $0x7,%%xmm2 \n"
835 "packuswb %%xmm2,%%xmm0 \n"
836 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000837 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000838 "movdqa %%xmm0,(%1) \n"
839 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000840 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000841 : "+r"(src_bgra), // %0
842 "+r"(dst_y), // %1
843 "+r"(pix) // %2
844 : "m"(kBGRAToY), // %3
845 "m"(kAddY16) // %4
846 : "memory", "cc"
847#if defined(__SSE2__)
848 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000849#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000850 );
851}
852
853void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000854 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000855 "movdqa %4,%%xmm5 \n"
856 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000857 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000858 "1: \n"
859 "movdqu (%0),%%xmm0 \n"
860 "movdqu 0x10(%0),%%xmm1 \n"
861 "movdqu 0x20(%0),%%xmm2 \n"
862 "movdqu 0x30(%0),%%xmm3 \n"
863 "pmaddubsw %%xmm4,%%xmm0 \n"
864 "pmaddubsw %%xmm4,%%xmm1 \n"
865 "pmaddubsw %%xmm4,%%xmm2 \n"
866 "pmaddubsw %%xmm4,%%xmm3 \n"
867 "lea 0x40(%0),%0 \n"
868 "phaddw %%xmm1,%%xmm0 \n"
869 "phaddw %%xmm3,%%xmm2 \n"
870 "psrlw $0x7,%%xmm0 \n"
871 "psrlw $0x7,%%xmm2 \n"
872 "packuswb %%xmm2,%%xmm0 \n"
873 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000874 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000875 "movdqu %%xmm0,(%1) \n"
876 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000877 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000878 : "+r"(src_bgra), // %0
879 "+r"(dst_y), // %1
880 "+r"(pix) // %2
881 : "m"(kBGRAToY), // %3
882 "m"(kAddY16) // %4
883 : "memory", "cc"
884#if defined(__SSE2__)
885 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
886#endif
887 );
888}
889
890void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
891 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000892 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000893 "movdqa %0,%%xmm4 \n"
894 "movdqa %1,%%xmm3 \n"
895 "movdqa %2,%%xmm5 \n"
896 :
897 : "m"(kBGRAToU), // %0
898 "m"(kBGRAToV), // %1
899 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000900 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000901 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000902 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000903 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000904 "1: \n"
905 "movdqa (%0),%%xmm0 \n"
906 "movdqa 0x10(%0),%%xmm1 \n"
907 "movdqa 0x20(%0),%%xmm2 \n"
908 "movdqa 0x30(%0),%%xmm6 \n"
909 "pavgb (%0,%4,1),%%xmm0 \n"
910 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
911 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
912 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
913 "lea 0x40(%0),%0 \n"
914 "movdqa %%xmm0,%%xmm7 \n"
915 "shufps $0x88,%%xmm1,%%xmm0 \n"
916 "shufps $0xdd,%%xmm1,%%xmm7 \n"
917 "pavgb %%xmm7,%%xmm0 \n"
918 "movdqa %%xmm2,%%xmm7 \n"
919 "shufps $0x88,%%xmm6,%%xmm2 \n"
920 "shufps $0xdd,%%xmm6,%%xmm7 \n"
921 "pavgb %%xmm7,%%xmm2 \n"
922 "movdqa %%xmm0,%%xmm1 \n"
923 "movdqa %%xmm2,%%xmm6 \n"
924 "pmaddubsw %%xmm4,%%xmm0 \n"
925 "pmaddubsw %%xmm4,%%xmm2 \n"
926 "pmaddubsw %%xmm3,%%xmm1 \n"
927 "pmaddubsw %%xmm3,%%xmm6 \n"
928 "phaddw %%xmm2,%%xmm0 \n"
929 "phaddw %%xmm6,%%xmm1 \n"
930 "psraw $0x8,%%xmm0 \n"
931 "psraw $0x8,%%xmm1 \n"
932 "packsswb %%xmm1,%%xmm0 \n"
933 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000934 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000935 "movlps %%xmm0,(%1) \n"
936 "movhps %%xmm0,(%1,%2,1) \n"
937 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000938 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000939 : "+r"(src_bgra0), // %0
940 "+r"(dst_u), // %1
941 "+r"(dst_v), // %2
942 "+rm"(width) // %3
943 : "r"(static_cast<intptr_t>(src_stride_bgra))
944 : "memory", "cc"
945#if defined(__SSE2__)
946 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
947#endif
948 );
949}
950
951void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
952 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000953 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000954 "movdqa %0,%%xmm4 \n"
955 "movdqa %1,%%xmm3 \n"
956 "movdqa %2,%%xmm5 \n"
957 :
958 : "m"(kBGRAToU), // %0
959 "m"(kBGRAToV), // %1
960 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000961 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000962 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000963 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000964 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000965 "1: \n"
966 "movdqu (%0),%%xmm0 \n"
967 "movdqu 0x10(%0),%%xmm1 \n"
968 "movdqu 0x20(%0),%%xmm2 \n"
969 "movdqu 0x30(%0),%%xmm6 \n"
970 "movdqu (%0,%4,1),%%xmm7 \n"
971 "pavgb %%xmm7,%%xmm0 \n"
972 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
973 "pavgb %%xmm7,%%xmm1 \n"
974 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
975 "pavgb %%xmm7,%%xmm2 \n"
976 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
977 "pavgb %%xmm7,%%xmm6 \n"
978 "lea 0x40(%0),%0 \n"
979 "movdqa %%xmm0,%%xmm7 \n"
980 "shufps $0x88,%%xmm1,%%xmm0 \n"
981 "shufps $0xdd,%%xmm1,%%xmm7 \n"
982 "pavgb %%xmm7,%%xmm0 \n"
983 "movdqa %%xmm2,%%xmm7 \n"
984 "shufps $0x88,%%xmm6,%%xmm2 \n"
985 "shufps $0xdd,%%xmm6,%%xmm7 \n"
986 "pavgb %%xmm7,%%xmm2 \n"
987 "movdqa %%xmm0,%%xmm1 \n"
988 "movdqa %%xmm2,%%xmm6 \n"
989 "pmaddubsw %%xmm4,%%xmm0 \n"
990 "pmaddubsw %%xmm4,%%xmm2 \n"
991 "pmaddubsw %%xmm3,%%xmm1 \n"
992 "pmaddubsw %%xmm3,%%xmm6 \n"
993 "phaddw %%xmm2,%%xmm0 \n"
994 "phaddw %%xmm6,%%xmm1 \n"
995 "psraw $0x8,%%xmm0 \n"
996 "psraw $0x8,%%xmm1 \n"
997 "packsswb %%xmm1,%%xmm0 \n"
998 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000999 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001000 "movlps %%xmm0,(%1) \n"
1001 "movhps %%xmm0,(%1,%2,1) \n"
1002 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001003 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001004 : "+r"(src_bgra0), // %0
1005 "+r"(dst_u), // %1
1006 "+r"(dst_v), // %2
1007 "+rm"(width) // %3
1008 : "r"(static_cast<intptr_t>(src_stride_bgra))
1009 : "memory", "cc"
1010#if defined(__SSE2__)
1011 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1012#endif
1013 );
1014}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001015
1016void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001017 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001018 "movdqa %4,%%xmm5 \n"
1019 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001020 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001021 "1: \n"
1022 "movdqa (%0),%%xmm0 \n"
1023 "movdqa 0x10(%0),%%xmm1 \n"
1024 "movdqa 0x20(%0),%%xmm2 \n"
1025 "movdqa 0x30(%0),%%xmm3 \n"
1026 "pmaddubsw %%xmm4,%%xmm0 \n"
1027 "pmaddubsw %%xmm4,%%xmm1 \n"
1028 "pmaddubsw %%xmm4,%%xmm2 \n"
1029 "pmaddubsw %%xmm4,%%xmm3 \n"
1030 "lea 0x40(%0),%0 \n"
1031 "phaddw %%xmm1,%%xmm0 \n"
1032 "phaddw %%xmm3,%%xmm2 \n"
1033 "psrlw $0x7,%%xmm0 \n"
1034 "psrlw $0x7,%%xmm2 \n"
1035 "packuswb %%xmm2,%%xmm0 \n"
1036 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001037 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001038 "movdqa %%xmm0,(%1) \n"
1039 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001040 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001041 : "+r"(src_abgr), // %0
1042 "+r"(dst_y), // %1
1043 "+r"(pix) // %2
1044 : "m"(kABGRToY), // %3
1045 "m"(kAddY16) // %4
1046 : "memory", "cc"
1047#if defined(__SSE2__)
1048 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1049#endif
1050 );
1051}
1052
1053void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001054 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001055 "movdqa %4,%%xmm5 \n"
1056 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001057 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001058 "1: \n"
1059 "movdqu (%0),%%xmm0 \n"
1060 "movdqu 0x10(%0),%%xmm1 \n"
1061 "movdqu 0x20(%0),%%xmm2 \n"
1062 "movdqu 0x30(%0),%%xmm3 \n"
1063 "pmaddubsw %%xmm4,%%xmm0 \n"
1064 "pmaddubsw %%xmm4,%%xmm1 \n"
1065 "pmaddubsw %%xmm4,%%xmm2 \n"
1066 "pmaddubsw %%xmm4,%%xmm3 \n"
1067 "lea 0x40(%0),%0 \n"
1068 "phaddw %%xmm1,%%xmm0 \n"
1069 "phaddw %%xmm3,%%xmm2 \n"
1070 "psrlw $0x7,%%xmm0 \n"
1071 "psrlw $0x7,%%xmm2 \n"
1072 "packuswb %%xmm2,%%xmm0 \n"
1073 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001074 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001075 "movdqu %%xmm0,(%1) \n"
1076 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001077 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001078 : "+r"(src_abgr), // %0
1079 "+r"(dst_y), // %1
1080 "+r"(pix) // %2
1081 : "m"(kABGRToY), // %3
1082 "m"(kAddY16) // %4
1083 : "memory", "cc"
1084#if defined(__SSE2__)
1085 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1086#endif
1087 );
1088}
1089
1090void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1091 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001092 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001093 "movdqa %0,%%xmm4 \n"
1094 "movdqa %1,%%xmm3 \n"
1095 "movdqa %2,%%xmm5 \n"
1096 :
1097 : "m"(kABGRToU), // %0
1098 "m"(kABGRToV), // %1
1099 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001100 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001101 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001102 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001103 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001104 "1: \n"
1105 "movdqa (%0),%%xmm0 \n"
1106 "movdqa 0x10(%0),%%xmm1 \n"
1107 "movdqa 0x20(%0),%%xmm2 \n"
1108 "movdqa 0x30(%0),%%xmm6 \n"
1109 "pavgb (%0,%4,1),%%xmm0 \n"
1110 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1111 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1112 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1113 "lea 0x40(%0),%0 \n"
1114 "movdqa %%xmm0,%%xmm7 \n"
1115 "shufps $0x88,%%xmm1,%%xmm0 \n"
1116 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1117 "pavgb %%xmm7,%%xmm0 \n"
1118 "movdqa %%xmm2,%%xmm7 \n"
1119 "shufps $0x88,%%xmm6,%%xmm2 \n"
1120 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1121 "pavgb %%xmm7,%%xmm2 \n"
1122 "movdqa %%xmm0,%%xmm1 \n"
1123 "movdqa %%xmm2,%%xmm6 \n"
1124 "pmaddubsw %%xmm4,%%xmm0 \n"
1125 "pmaddubsw %%xmm4,%%xmm2 \n"
1126 "pmaddubsw %%xmm3,%%xmm1 \n"
1127 "pmaddubsw %%xmm3,%%xmm6 \n"
1128 "phaddw %%xmm2,%%xmm0 \n"
1129 "phaddw %%xmm6,%%xmm1 \n"
1130 "psraw $0x8,%%xmm0 \n"
1131 "psraw $0x8,%%xmm1 \n"
1132 "packsswb %%xmm1,%%xmm0 \n"
1133 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001134 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001135 "movlps %%xmm0,(%1) \n"
1136 "movhps %%xmm0,(%1,%2,1) \n"
1137 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001138 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001139 : "+r"(src_abgr0), // %0
1140 "+r"(dst_u), // %1
1141 "+r"(dst_v), // %2
1142 "+rm"(width) // %3
1143 : "r"(static_cast<intptr_t>(src_stride_abgr))
1144 : "memory", "cc"
1145#if defined(__SSE2__)
1146 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1147#endif
1148 );
1149}
1150
1151void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1152 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001153 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001154 "movdqa %0,%%xmm4 \n"
1155 "movdqa %1,%%xmm3 \n"
1156 "movdqa %2,%%xmm5 \n"
1157 :
1158 : "m"(kABGRToU), // %0
1159 "m"(kABGRToV), // %1
1160 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001161 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001162 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001163 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001164 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001165 "1: \n"
1166 "movdqu (%0),%%xmm0 \n"
1167 "movdqu 0x10(%0),%%xmm1 \n"
1168 "movdqu 0x20(%0),%%xmm2 \n"
1169 "movdqu 0x30(%0),%%xmm6 \n"
1170 "movdqu (%0,%4,1),%%xmm7 \n"
1171 "pavgb %%xmm7,%%xmm0 \n"
1172 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1173 "pavgb %%xmm7,%%xmm1 \n"
1174 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1175 "pavgb %%xmm7,%%xmm2 \n"
1176 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1177 "pavgb %%xmm7,%%xmm6 \n"
1178 "lea 0x40(%0),%0 \n"
1179 "movdqa %%xmm0,%%xmm7 \n"
1180 "shufps $0x88,%%xmm1,%%xmm0 \n"
1181 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1182 "pavgb %%xmm7,%%xmm0 \n"
1183 "movdqa %%xmm2,%%xmm7 \n"
1184 "shufps $0x88,%%xmm6,%%xmm2 \n"
1185 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1186 "pavgb %%xmm7,%%xmm2 \n"
1187 "movdqa %%xmm0,%%xmm1 \n"
1188 "movdqa %%xmm2,%%xmm6 \n"
1189 "pmaddubsw %%xmm4,%%xmm0 \n"
1190 "pmaddubsw %%xmm4,%%xmm2 \n"
1191 "pmaddubsw %%xmm3,%%xmm1 \n"
1192 "pmaddubsw %%xmm3,%%xmm6 \n"
1193 "phaddw %%xmm2,%%xmm0 \n"
1194 "phaddw %%xmm6,%%xmm1 \n"
1195 "psraw $0x8,%%xmm0 \n"
1196 "psraw $0x8,%%xmm1 \n"
1197 "packsswb %%xmm1,%%xmm0 \n"
1198 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001199 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001200 "movlps %%xmm0,(%1) \n"
1201 "movhps %%xmm0,(%1,%2,1) \n"
1202 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001203 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001204 : "+r"(src_abgr0), // %0
1205 "+r"(dst_u), // %1
1206 "+r"(dst_v), // %2
1207 "+rm"(width) // %3
1208 : "r"(static_cast<intptr_t>(src_stride_abgr))
1209 : "memory", "cc"
1210#if defined(__SSE2__)
1211 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1212#endif
1213 );
1214}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001215#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001216
fbarchard@google.come214fe32012-06-04 23:47:11 +00001217#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients, as signed 8-bit values in units of 1/64.
// The U-to-blue factor 2.018 * 64 does not fit in a signed byte, so it is
// clamped to 127.
#define UB 127 /* min(127, static_cast<int>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the contribution of U = V = 128 to each channel. Parenthesized so the
// macros expand safely inside larger expressions.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001232
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001233struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001234 vec8 kUVToB; // 0
1235 vec8 kUVToG; // 16
1236 vec8 kUVToR; // 32
1237 vec16 kUVBiasB; // 48
1238 vec16 kUVBiasG; // 64
1239 vec16 kUVBiasR; // 80
1240 vec16 kYSub16; // 96
1241 vec16 kYToRgb; // 112
1242 vec8 kVUToB; // 128
1243 vec8 kVUToG; // 144
1244 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001245} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001246 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1247 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1248 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1249 { BB, BB, BB, BB, BB, BB, BB, BB },
1250 { BG, BG, BG, BG, BG, BG, BG, BG },
1251 { BR, BR, BR, BR, BR, BR, BR, BR },
1252 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001253 { YG, YG, YG, YG, YG, YG, YG, YG },
1254 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1255 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1256 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001257};
1258
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001259
// Read 8 UV from 444.
// Loads 8 U bytes from [u_buf] and 8 V bytes from [u_buf] + [v_buf] ([v_buf]
// holds the V-minus-U pointer offset), advances [u_buf] by 8 and interleaves
// them into xmm0 as U,V byte pairs. Clobbers xmm1.
#define READYUV444                                                             \
    "movq       (%[u_buf]),%%xmm0              \n"                             \
    "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x8(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"

// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 U bytes from [u_buf] and 4 V bytes from [u_buf] + [v_buf] ([v_buf]
// holds the V-minus-U pointer offset), advances [u_buf] by 4, interleaves
// them into U,V byte pairs and duplicates each pair (punpcklwd) so xmm0 holds
// 8 pairs. Clobbers xmm1.
#define READYUV422                                                             \
    "movd       (%[u_buf]),%%xmm0              \n"                             \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x4(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"

// Read 2 UV from 411, upsample to 8 UV.
// Interleaves U and V bytes into pairs, then duplicates each pair four times
// (punpcklwd followed by punpckldq) so xmm0 holds 8 pairs. Clobbers xmm1.
// NOTE: movd loads 4 bytes from each plane although [u_buf] only advances by
// 2, so each iteration reads 2 bytes beyond the samples it consumes; the
// planes must stay readable that far past the end of the row.
#define READYUV411                                                             \
    "movd       (%[u_buf]),%%xmm0              \n"                             \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x2(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
    "punpckldq  %%xmm0,%%xmm0                  \n"

// Read 4 UV from NV12, upsample to 8 UV.
// The chroma plane is already interleaved, so the 8 bytes loaded by movq are
// 4 two-byte pairs. punpcklwd of xmm0 with itself duplicates each 16-bit pair,
// giving 8 pairs in xmm0. No other register is read or written, so the result
// does not depend on stale xmm1 contents.
#define READNV12                                                               \
    "movq       (%[uv_buf]),%%xmm0             \n"                             \
    "lea        0x8(%[uv_buf]),%[uv_buf]       \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"

// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 U,V byte pairs; xmm4 must be zero (used to widen Y to words);
//      [kYuvConstants] points at the constant table; [y_buf] points at 8 Y.
// Out: xmm0, xmm1, xmm2 = the three channels computed from the table entries
//      at offsets 0, 16 and 32 respectively, each as 8 bytes packed with
//      unsigned saturation into the low half (duplicated in the high half).
// Advances [y_buf] by 8. Clobbers xmm3.
#define YUVTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                  \n"                             \
    "movdqa     %%xmm0,%%xmm2                  \n"                             \
    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n"                             \
    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n"                             \
    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n"                             \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
    "movq       (%[y_buf]),%%xmm3              \n"                             \
    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
    "paddsw     %%xmm3,%%xmm0                  \n"                             \
    "paddsw     %%xmm3,%%xmm1                  \n"                             \
    "paddsw     %%xmm3,%%xmm2                  \n"                             \
    "psraw      $0x6,%%xmm0                    \n"                             \
    "psraw      $0x6,%%xmm1                    \n"                             \
    "psraw      $0x6,%%xmm2                    \n"                             \
    "packuswb   %%xmm0,%%xmm0                  \n"                             \
    "packuswb   %%xmm1,%%xmm1                  \n"                             \
    "packuswb   %%xmm2,%%xmm2                  \n"

// Convert 8 pixels: 8 VU and 8 Y.
// Same computation as the UV variant, but the chroma multipliers come from
// the table entries at offsets 128, 144 and 160, which hold the coefficient
// pairs in V,U order.
// In:  xmm0 = 8 V,U byte pairs; xmm4 must be zero (used to widen Y to words);
//      [kYuvConstants] points at the constant table; [y_buf] points at 8 Y.
// Out: xmm0, xmm1, xmm2 = the three channels, each as 8 bytes packed with
//      unsigned saturation into the low half (duplicated in the high half).
// Advances [y_buf] by 8. Clobbers xmm3.
#define YVUTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                  \n"                             \
    "movdqa     %%xmm0,%%xmm2                  \n"                             \
    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n"                             \
    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n"                             \
    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n"                             \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
    "movq       (%[y_buf]),%%xmm3              \n"                             \
    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
    "paddsw     %%xmm3,%%xmm0                  \n"                             \
    "paddsw     %%xmm3,%%xmm1                  \n"                             \
    "paddsw     %%xmm3,%%xmm2                  \n"                             \
    "psraw      $0x6,%%xmm0                    \n"                             \
    "psraw      $0x6,%%xmm1                    \n"                             \
    "psraw      $0x6,%%xmm2                    \n"                             \
    "packuswb   %%xmm0,%%xmm0                  \n"                             \
    "packuswb   %%xmm1,%%xmm1                  \n"                             \
    "packuswb   %%xmm2,%%xmm2                  \n"

1340void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001341 const uint8* u_buf,
1342 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001343 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001344 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001345 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001346 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001347 "pcmpeqb %%xmm5,%%xmm5 \n"
1348 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001349 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001350 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001351 READYUV444
1352 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001353 "punpcklbw %%xmm1,%%xmm0 \n"
1354 "punpcklbw %%xmm5,%%xmm2 \n"
1355 "movdqa %%xmm0,%%xmm1 \n"
1356 "punpcklwd %%xmm2,%%xmm0 \n"
1357 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001358 "movdqa %%xmm0,(%[argb_buf]) \n"
1359 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1360 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1361 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001362 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001363 : [y_buf]"+r"(y_buf), // %[y_buf]
1364 [u_buf]"+r"(u_buf), // %[u_buf]
1365 [v_buf]"+r"(v_buf), // %[v_buf]
1366 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1367 [width]"+rm"(width) // %[width]
1368 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001369 : "memory", "cc"
1370#if defined(__SSE2__)
1371 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1372#endif
1373 );
1374}
1375
fbarchard@google.come214fe32012-06-04 23:47:11 +00001376void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001377 const uint8* u_buf,
1378 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001379 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001380 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001381 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001382 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001383 "pcmpeqb %%xmm5,%%xmm5 \n"
1384 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001385 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001386 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001387 READYUV422
1388 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001389 "punpcklbw %%xmm1,%%xmm0 \n"
1390 "punpcklbw %%xmm5,%%xmm2 \n"
1391 "movdqa %%xmm0,%%xmm1 \n"
1392 "punpcklwd %%xmm2,%%xmm0 \n"
1393 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001394 "movdqa %%xmm0,(%[argb_buf]) \n"
1395 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1396 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1397 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001398 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001399 : [y_buf]"+r"(y_buf), // %[y_buf]
1400 [u_buf]"+r"(u_buf), // %[u_buf]
1401 [v_buf]"+r"(v_buf), // %[v_buf]
1402 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1403 [width]"+rm"(width) // %[width]
1404 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001405 : "memory", "cc"
1406#if defined(__SSE2__)
1407 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1408#endif
1409 );
1410}
1411
1412void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1413 const uint8* u_buf,
1414 const uint8* v_buf,
1415 uint8* argb_buf,
1416 int width) {
1417 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001418 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001419 "pcmpeqb %%xmm5,%%xmm5 \n"
1420 "pxor %%xmm4,%%xmm4 \n"
1421 ".p2align 4 \n"
1422 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001423 READYUV411
1424 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001425 "punpcklbw %%xmm1,%%xmm0 \n"
1426 "punpcklbw %%xmm5,%%xmm2 \n"
1427 "movdqa %%xmm0,%%xmm1 \n"
1428 "punpcklwd %%xmm2,%%xmm0 \n"
1429 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001430 "movdqa %%xmm0,(%[argb_buf]) \n"
1431 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1432 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1433 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001434 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001435 : [y_buf]"+r"(y_buf), // %[y_buf]
1436 [u_buf]"+r"(u_buf), // %[u_buf]
1437 [v_buf]"+r"(v_buf), // %[v_buf]
1438 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1439 [width]"+rm"(width) // %[width]
1440 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1441 : "memory", "cc"
1442#if defined(__SSE2__)
1443 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1444#endif
1445 );
1446}
1447
1448void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1449 const uint8* uv_buf,
1450 uint8* argb_buf,
1451 int width) {
1452 asm volatile (
1453 "pcmpeqb %%xmm5,%%xmm5 \n"
1454 "pxor %%xmm4,%%xmm4 \n"
1455 ".p2align 4 \n"
1456 "1: \n"
1457 READNV12
1458 YUVTORGB
1459 "punpcklbw %%xmm1,%%xmm0 \n"
1460 "punpcklbw %%xmm5,%%xmm2 \n"
1461 "movdqa %%xmm0,%%xmm1 \n"
1462 "punpcklwd %%xmm2,%%xmm0 \n"
1463 "punpckhwd %%xmm2,%%xmm1 \n"
1464 "movdqa %%xmm0,(%[argb_buf]) \n"
1465 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1466 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1467 "sub $0x8,%[width] \n"
1468 "jg 1b \n"
1469 : [y_buf]"+r"(y_buf), // %[y_buf]
1470 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1471 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1472 [width]"+rm"(width) // %[width]
1473 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1474 : "memory", "cc"
1475#if defined(__SSE2__)
1476 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1477#endif
1478 );
1479}
1480
1481void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1482 const uint8* vu_buf,
1483 uint8* argb_buf,
1484 int width) {
1485 asm volatile (
1486 "pcmpeqb %%xmm5,%%xmm5 \n"
1487 "pxor %%xmm4,%%xmm4 \n"
1488 ".p2align 4 \n"
1489 "1: \n"
1490 READNV12
1491 YVUTORGB
1492 "punpcklbw %%xmm1,%%xmm0 \n"
1493 "punpcklbw %%xmm5,%%xmm2 \n"
1494 "movdqa %%xmm0,%%xmm1 \n"
1495 "punpcklwd %%xmm2,%%xmm0 \n"
1496 "punpckhwd %%xmm2,%%xmm1 \n"
1497 "movdqa %%xmm0,(%[argb_buf]) \n"
1498 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1499 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1500 "sub $0x8,%[width] \n"
1501 "jg 1b \n"
1502 : [y_buf]"+r"(y_buf), // %[y_buf]
1503 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1504 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1505 [width]"+rm"(width) // %[width]
1506 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001507 : "memory", "cc"
1508#if defined(__SSE2__)
1509 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1510#endif
1511 );
1512}
1513
1514void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1515 const uint8* u_buf,
1516 const uint8* v_buf,
1517 uint8* argb_buf,
1518 int width) {
1519 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001520 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001521 "pcmpeqb %%xmm5,%%xmm5 \n"
1522 "pxor %%xmm4,%%xmm4 \n"
1523 ".p2align 4 \n"
1524 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001525 READYUV444
1526 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001527 "punpcklbw %%xmm1,%%xmm0 \n"
1528 "punpcklbw %%xmm5,%%xmm2 \n"
1529 "movdqa %%xmm0,%%xmm1 \n"
1530 "punpcklwd %%xmm2,%%xmm0 \n"
1531 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001532 "movdqu %%xmm0,(%[argb_buf]) \n"
1533 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1534 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1535 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001536 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001537 : [y_buf]"+r"(y_buf), // %[y_buf]
1538 [u_buf]"+r"(u_buf), // %[u_buf]
1539 [v_buf]"+r"(v_buf), // %[v_buf]
1540 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1541 [width]"+rm"(width) // %[width]
1542 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001543 : "memory", "cc"
1544#if defined(__SSE2__)
1545 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1546#endif
1547 );
1548}
1549
1550void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1551 const uint8* u_buf,
1552 const uint8* v_buf,
1553 uint8* argb_buf,
1554 int width) {
1555 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001556 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001557 "pcmpeqb %%xmm5,%%xmm5 \n"
1558 "pxor %%xmm4,%%xmm4 \n"
1559 ".p2align 4 \n"
1560 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001561 READYUV422
1562 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001563 "punpcklbw %%xmm1,%%xmm0 \n"
1564 "punpcklbw %%xmm5,%%xmm2 \n"
1565 "movdqa %%xmm0,%%xmm1 \n"
1566 "punpcklwd %%xmm2,%%xmm0 \n"
1567 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001568 "movdqu %%xmm0,(%[argb_buf]) \n"
1569 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1570 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1571 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001572 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001573 : [y_buf]"+r"(y_buf), // %[y_buf]
1574 [u_buf]"+r"(u_buf), // %[u_buf]
1575 [v_buf]"+r"(v_buf), // %[v_buf]
1576 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1577 [width]"+rm"(width) // %[width]
1578 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001579 : "memory", "cc"
1580#if defined(__SSE2__)
1581 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1582#endif
1583 );
1584}
1585
1586void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1587 const uint8* u_buf,
1588 const uint8* v_buf,
1589 uint8* argb_buf,
1590 int width) {
1591 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001592 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001593 "pcmpeqb %%xmm5,%%xmm5 \n"
1594 "pxor %%xmm4,%%xmm4 \n"
1595 ".p2align 4 \n"
1596 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001597 READYUV411
1598 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001599 "punpcklbw %%xmm1,%%xmm0 \n"
1600 "punpcklbw %%xmm5,%%xmm2 \n"
1601 "movdqa %%xmm0,%%xmm1 \n"
1602 "punpcklwd %%xmm2,%%xmm0 \n"
1603 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001604 "movdqu %%xmm0,(%[argb_buf]) \n"
1605 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1606 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1607 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001608 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001609 : [y_buf]"+r"(y_buf), // %[y_buf]
1610 [u_buf]"+r"(u_buf), // %[u_buf]
1611 [v_buf]"+r"(v_buf), // %[v_buf]
1612 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1613 [width]"+rm"(width) // %[width]
1614 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1615 : "memory", "cc"
1616#if defined(__SSE2__)
1617 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1618#endif
1619 );
1620}
1621
1622void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1623 const uint8* uv_buf,
1624 uint8* argb_buf,
1625 int width) {
1626 asm volatile (
1627 "pcmpeqb %%xmm5,%%xmm5 \n"
1628 "pxor %%xmm4,%%xmm4 \n"
1629 ".p2align 4 \n"
1630 "1: \n"
1631 READNV12
1632 YUVTORGB
1633 "punpcklbw %%xmm1,%%xmm0 \n"
1634 "punpcklbw %%xmm5,%%xmm2 \n"
1635 "movdqa %%xmm0,%%xmm1 \n"
1636 "punpcklwd %%xmm2,%%xmm0 \n"
1637 "punpckhwd %%xmm2,%%xmm1 \n"
1638 "movdqu %%xmm0,(%[argb_buf]) \n"
1639 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1640 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1641 "sub $0x8,%[width] \n"
1642 "jg 1b \n"
1643 : [y_buf]"+r"(y_buf), // %[y_buf]
1644 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1645 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1646 [width]"+rm"(width) // %[width]
1647 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1648 : "memory", "cc"
1649#if defined(__SSE2__)
1650 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1651#endif
1652 );
1653}
1654
1655void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1656 const uint8* vu_buf,
1657 uint8* argb_buf,
1658 int width) {
1659 asm volatile (
1660 "pcmpeqb %%xmm5,%%xmm5 \n"
1661 "pxor %%xmm4,%%xmm4 \n"
1662 ".p2align 4 \n"
1663 "1: \n"
1664 READNV12
1665 YVUTORGB
1666 "punpcklbw %%xmm1,%%xmm0 \n"
1667 "punpcklbw %%xmm5,%%xmm2 \n"
1668 "movdqa %%xmm0,%%xmm1 \n"
1669 "punpcklwd %%xmm2,%%xmm0 \n"
1670 "punpckhwd %%xmm2,%%xmm1 \n"
1671 "movdqu %%xmm0,(%[argb_buf]) \n"
1672 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1673 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1674 "sub $0x8,%[width] \n"
1675 "jg 1b \n"
1676 : [y_buf]"+r"(y_buf), // %[y_buf]
1677 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1678 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1679 [width]"+rm"(width) // %[width]
1680 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001681 : "memory", "cc"
1682#if defined(__SSE2__)
1683 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1684#endif
1685 );
1686}
1687
1688void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1689 const uint8* u_buf,
1690 const uint8* v_buf,
1691 uint8* bgra_buf,
1692 int width) {
1693 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001694 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001695 "pcmpeqb %%xmm5,%%xmm5 \n"
1696 "pxor %%xmm4,%%xmm4 \n"
1697 ".p2align 4 \n"
1698 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001699 READYUV422
1700 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001701 "pcmpeqb %%xmm5,%%xmm5 \n"
1702 "punpcklbw %%xmm0,%%xmm1 \n"
1703 "punpcklbw %%xmm2,%%xmm5 \n"
1704 "movdqa %%xmm5,%%xmm0 \n"
1705 "punpcklwd %%xmm1,%%xmm5 \n"
1706 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001707 "movdqa %%xmm5,(%[argb_buf]) \n"
1708 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1709 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1710 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001711 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001712 : [y_buf]"+r"(y_buf), // %[y_buf]
1713 [u_buf]"+r"(u_buf), // %[u_buf]
1714 [v_buf]"+r"(v_buf), // %[v_buf]
1715 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1716 [width]"+rm"(width) // %[width]
1717 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001718 : "memory", "cc"
1719#if defined(__SSE2__)
1720 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1721#endif
1722 );
1723}
1724
fbarchard@google.come214fe32012-06-04 23:47:11 +00001725void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001726 const uint8* u_buf,
1727 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001728 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001729 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001730 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001731 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001732 "pcmpeqb %%xmm5,%%xmm5 \n"
1733 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001734 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001735 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001736 READYUV422
1737 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001738 "punpcklbw %%xmm1,%%xmm2 \n"
1739 "punpcklbw %%xmm5,%%xmm0 \n"
1740 "movdqa %%xmm2,%%xmm1 \n"
1741 "punpcklwd %%xmm0,%%xmm2 \n"
1742 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001743 "movdqa %%xmm2,(%[argb_buf]) \n"
1744 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1745 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1746 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001747 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001748 : [y_buf]"+r"(y_buf), // %[y_buf]
1749 [u_buf]"+r"(u_buf), // %[u_buf]
1750 [v_buf]"+r"(v_buf), // %[v_buf]
1751 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1752 [width]"+rm"(width) // %[width]
1753 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001754 : "memory", "cc"
1755#if defined(__SSE2__)
1756 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1757#endif
1758 );
1759}
1760
fbarchard@google.come214fe32012-06-04 23:47:11 +00001761void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001762 const uint8* u_buf,
1763 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001764 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001765 int width) {
1766 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001767 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001768 "pcmpeqb %%xmm5,%%xmm5 \n"
1769 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001770 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001771 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001772 READYUV422
1773 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001774 "pcmpeqb %%xmm5,%%xmm5 \n"
1775 "punpcklbw %%xmm0,%%xmm1 \n"
1776 "punpcklbw %%xmm2,%%xmm5 \n"
1777 "movdqa %%xmm5,%%xmm0 \n"
1778 "punpcklwd %%xmm1,%%xmm5 \n"
1779 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001780 "movdqu %%xmm5,(%[argb_buf]) \n"
1781 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
1782 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1783 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001784 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001785 : [y_buf]"+r"(y_buf), // %[y_buf]
1786 [u_buf]"+r"(u_buf), // %[u_buf]
1787 [v_buf]"+r"(v_buf), // %[v_buf]
1788 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1789 [width]"+rm"(width) // %[width]
1790 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001791 : "memory", "cc"
1792#if defined(__SSE2__)
1793 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1794#endif
1795 );
1796}
1797
fbarchard@google.come214fe32012-06-04 23:47:11 +00001798void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001799 const uint8* u_buf,
1800 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001801 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001802 int width) {
1803 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001804 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001805 "pcmpeqb %%xmm5,%%xmm5 \n"
1806 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001807 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001808 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001809 READYUV422
1810 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001811 "punpcklbw %%xmm1,%%xmm2 \n"
1812 "punpcklbw %%xmm5,%%xmm0 \n"
1813 "movdqa %%xmm2,%%xmm1 \n"
1814 "punpcklwd %%xmm0,%%xmm2 \n"
1815 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001816 "movdqu %%xmm2,(%[argb_buf]) \n"
1817 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1818 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1819 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001820 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001821 : [y_buf]"+r"(y_buf), // %[y_buf]
1822 [u_buf]"+r"(u_buf), // %[u_buf]
1823 [v_buf]"+r"(v_buf), // %[v_buf]
1824 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1825 [width]"+rm"(width) // %[width]
1826 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001827 : "memory", "cc"
1828#if defined(__SSE2__)
1829 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1830#endif
1831 );
1832}
fbarchard@google.come214fe32012-06-04 23:47:11 +00001833#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001834
1835#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001836void YToARGBRow_SSE2(const uint8* y_buf,
1837 uint8* rgb_buf,
1838 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001839 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001840 "pcmpeqb %%xmm4,%%xmm4 \n"
1841 "pslld $0x18,%%xmm4 \n"
1842 "mov $0x10001000,%%eax \n"
1843 "movd %%eax,%%xmm3 \n"
1844 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1845 "mov $0x012a012a,%%eax \n"
1846 "movd %%eax,%%xmm2 \n"
1847 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001848 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001849 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001850 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001851 "movq (%0),%%xmm0 \n"
1852 "lea 0x8(%0),%0 \n"
1853 "punpcklbw %%xmm0,%%xmm0 \n"
1854 "psubusw %%xmm3,%%xmm0 \n"
1855 "pmulhuw %%xmm2,%%xmm0 \n"
1856 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001857
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001858 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001859 "punpcklbw %%xmm0,%%xmm0 \n"
1860 "movdqa %%xmm0,%%xmm1 \n"
1861 "punpcklwd %%xmm0,%%xmm0 \n"
1862 "punpckhwd %%xmm1,%%xmm1 \n"
1863 "por %%xmm4,%%xmm0 \n"
1864 "por %%xmm4,%%xmm1 \n"
1865 "movdqa %%xmm0,(%1) \n"
1866 "movdqa %%xmm1,16(%1) \n"
1867 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001868
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001869 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001870 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00001871 : "+r"(y_buf), // %0
1872 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00001873 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001874 :
1875 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001876#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001877 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001878#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001879 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001880}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001881#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001882
fbarchard@google.com42831e02012-01-21 02:54:17 +00001883#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001884// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001885CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001886 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1887};
1888
fbarchard@google.com42831e02012-01-21 02:54:17 +00001889void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001890 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001891 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001892 "movdqa %3,%%xmm5 \n"
1893 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001894 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001895 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001896 "movdqa (%0,%2),%%xmm0 \n"
1897 "pshufb %%xmm5,%%xmm0 \n"
1898 "sub $0x10,%2 \n"
1899 "movdqa %%xmm0,(%1) \n"
1900 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001901 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00001902 : "+r"(src), // %0
1903 "+r"(dst), // %1
1904 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001905 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001906 : "memory", "cc"
1907#if defined(__SSE2__)
1908 , "xmm0", "xmm5"
1909#endif
1910 );
1911}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001912#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001913
fbarchard@google.com42831e02012-01-21 02:54:17 +00001914#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001915void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001916 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001917 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001918 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001919 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001920 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001921 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001922 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001923 "psllw $0x8,%%xmm0 \n"
1924 "psrlw $0x8,%%xmm1 \n"
1925 "por %%xmm1,%%xmm0 \n"
1926 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
1927 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
1928 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
1929 "sub $0x10,%2 \n"
1930 "movdqu %%xmm0,(%1) \n"
1931 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001932 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001933 : "+r"(src), // %0
1934 "+r"(dst), // %1
1935 "+r"(temp_width) // %2
1936 :
1937 : "memory", "cc"
1938#if defined(__SSE2__)
1939 , "xmm0", "xmm1"
1940#endif
1941 );
1942}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001943#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001944
fbarchard@google.com16a96642012-03-02 22:38:09 +00001945#ifdef HAS_MIRRORROW_UV_SSSE3
1946// Shuffle table for reversing the bytes of UV channels.
1947CONST uvec8 kShuffleMirrorUV = {
1948 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
1949};
1950void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
1951 int width) {
1952 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001953 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00001954 "movdqa %4,%%xmm1 \n"
1955 "lea -16(%0,%3,2),%0 \n"
1956 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001957 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001958 "1: \n"
1959 "movdqa (%0),%%xmm0 \n"
1960 "lea -16(%0),%0 \n"
1961 "pshufb %%xmm1,%%xmm0 \n"
1962 "sub $8,%3 \n"
1963 "movlpd %%xmm0,(%1) \n"
1964 "movhpd %%xmm0,(%1,%2) \n"
1965 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001966 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001967 : "+r"(src), // %0
1968 "+r"(dst_u), // %1
1969 "+r"(dst_v), // %2
1970 "+r"(temp_width) // %3
1971 : "m"(kShuffleMirrorUV) // %4
1972 : "memory", "cc"
1973#if defined(__SSE2__)
1974 , "xmm0", "xmm1"
1975#endif
1976 );
1977}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001978#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00001979
fbarchard@google.com55663022012-04-26 00:01:41 +00001980#ifdef HAS_ADDROW_SSE2
1981// dst and width aligned to 16
1982void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
1983 asm volatile (
1984 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001985 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001986 "1: \n"
1987 "movdqu (%0),%%xmm2 \n"
1988 "lea 0x10(%0),%0 \n"
1989 "movdqa (%1),%%xmm0 \n"
1990 "movdqa 0x10(%1),%%xmm1 \n"
1991 "movdqa %%xmm2,%%xmm3 \n"
1992 "punpcklbw %%xmm4,%%xmm2 \n"
1993 "punpckhbw %%xmm4,%%xmm3 \n"
1994 "paddusw %%xmm2,%%xmm0 \n"
1995 "paddusw %%xmm3,%%xmm1 \n"
1996 "sub $0x10,%2 \n"
1997 "movdqa %%xmm0,(%1) \n"
1998 "movdqa %%xmm1,0x10(%1) \n"
1999 "lea 0x20(%1),%1 \n"
2000 "jg 1b \n"
2001 : "+r"(src), // %0
2002 "+r"(dst), // %1
2003 "+r"(width) // %2
2004 :
2005 : "memory", "cc"
2006#if defined(__SSE2__)
2007 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2008#endif
2009 );
2010}
2011
2012// dst and width aligned to 16
2013void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
2014 asm volatile (
2015 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002016 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00002017 "1: \n"
2018 "movdqu (%0),%%xmm2 \n"
2019 "lea 0x10(%0),%0 \n"
2020 "movdqa (%1),%%xmm0 \n"
2021 "movdqa 0x10(%1),%%xmm1 \n"
2022 "movdqa %%xmm2,%%xmm3 \n"
2023 "punpcklbw %%xmm4,%%xmm2 \n"
2024 "punpckhbw %%xmm4,%%xmm3 \n"
2025 "psubusw %%xmm2,%%xmm0 \n"
2026 "psubusw %%xmm3,%%xmm1 \n"
2027 "sub $0x10,%2 \n"
2028 "movdqa %%xmm0,(%1) \n"
2029 "movdqa %%xmm1,0x10(%1) \n"
2030 "lea 0x20(%1),%1 \n"
2031 "jg 1b \n"
2032 : "+r"(src), // %0
2033 "+r"(dst), // %1
2034 "+r"(width) // %2
2035 :
2036 : "memory", "cc"
2037#if defined(__SSE2__)
2038 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2039#endif
2040 );
2041}
2042#endif // HAS_ADDROW_SSE2
2043
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002044#ifdef HAS_SPLITUV_SSE2
2045void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002046 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002047 "pcmpeqb %%xmm5,%%xmm5 \n"
2048 "psrlw $0x8,%%xmm5 \n"
2049 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002050 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002051 "1: \n"
2052 "movdqa (%0),%%xmm0 \n"
2053 "movdqa 0x10(%0),%%xmm1 \n"
2054 "lea 0x20(%0),%0 \n"
2055 "movdqa %%xmm0,%%xmm2 \n"
2056 "movdqa %%xmm1,%%xmm3 \n"
2057 "pand %%xmm5,%%xmm0 \n"
2058 "pand %%xmm5,%%xmm1 \n"
2059 "packuswb %%xmm1,%%xmm0 \n"
2060 "psrlw $0x8,%%xmm2 \n"
2061 "psrlw $0x8,%%xmm3 \n"
2062 "packuswb %%xmm3,%%xmm2 \n"
2063 "movdqa %%xmm0,(%1) \n"
2064 "movdqa %%xmm2,(%1,%2) \n"
2065 "lea 0x10(%1),%1 \n"
2066 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002067 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002068 : "+r"(src_uv), // %0
2069 "+r"(dst_u), // %1
2070 "+r"(dst_v), // %2
2071 "+r"(pix) // %3
2072 :
2073 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002074#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002075 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002076#endif
2077 );
2078}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002079#endif // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002080
fbarchard@google.com19932f82012-02-16 22:19:14 +00002081#ifdef HAS_COPYROW_SSE2
2082void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002083 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002084 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002085 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002086 "1: \n"
2087 "movdqa (%0),%%xmm0 \n"
2088 "movdqa 0x10(%0),%%xmm1 \n"
2089 "movdqa %%xmm0,(%0,%1) \n"
2090 "movdqa %%xmm1,0x10(%0,%1) \n"
2091 "lea 0x20(%0),%0 \n"
2092 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002093 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002094 : "+r"(src), // %0
2095 "+r"(dst), // %1
2096 "+r"(count) // %2
2097 :
2098 : "memory", "cc"
2099#if defined(__SSE2__)
2100 , "xmm0", "xmm1"
2101#endif
2102 );
2103}
2104#endif // HAS_COPYROW_SSE2
2105
2106#ifdef HAS_COPYROW_X86
2107void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2108 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002109 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002110 "shr $0x2,%2 \n"
2111 "rep movsl \n"
2112 : "+S"(src), // %0
2113 "+D"(dst), // %1
2114 "+c"(width_tmp) // %2
2115 :
2116 : "memory", "cc"
2117 );
2118}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002119#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002120
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002121#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002122void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002123 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002124 "pcmpeqb %%xmm5,%%xmm5 \n"
2125 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002126 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002127 "1: \n"
2128 "movdqa (%0),%%xmm0 \n"
2129 "movdqa 0x10(%0),%%xmm1 \n"
2130 "lea 0x20(%0),%0 \n"
2131 "pand %%xmm5,%%xmm0 \n"
2132 "pand %%xmm5,%%xmm1 \n"
2133 "packuswb %%xmm1,%%xmm0 \n"
2134 "movdqa %%xmm0,(%1) \n"
2135 "lea 0x10(%1),%1 \n"
2136 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002137 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002138 : "+r"(src_yuy2), // %0
2139 "+r"(dst_y), // %1
2140 "+r"(pix) // %2
2141 :
2142 : "memory", "cc"
2143#if defined(__SSE2__)
2144 , "xmm0", "xmm1", "xmm5"
2145#endif
2146 );
2147}
2148
2149void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2150 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002151 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002152 "pcmpeqb %%xmm5,%%xmm5 \n"
2153 "psrlw $0x8,%%xmm5 \n"
2154 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002155 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002156 "1: \n"
2157 "movdqa (%0),%%xmm0 \n"
2158 "movdqa 0x10(%0),%%xmm1 \n"
2159 "movdqa (%0,%4,1),%%xmm2 \n"
2160 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2161 "lea 0x20(%0),%0 \n"
2162 "pavgb %%xmm2,%%xmm0 \n"
2163 "pavgb %%xmm3,%%xmm1 \n"
2164 "psrlw $0x8,%%xmm0 \n"
2165 "psrlw $0x8,%%xmm1 \n"
2166 "packuswb %%xmm1,%%xmm0 \n"
2167 "movdqa %%xmm0,%%xmm1 \n"
2168 "pand %%xmm5,%%xmm0 \n"
2169 "packuswb %%xmm0,%%xmm0 \n"
2170 "psrlw $0x8,%%xmm1 \n"
2171 "packuswb %%xmm1,%%xmm1 \n"
2172 "movq %%xmm0,(%1) \n"
2173 "movq %%xmm1,(%1,%2) \n"
2174 "lea 0x8(%1),%1 \n"
2175 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002176 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002177 : "+r"(src_yuy2), // %0
2178 "+r"(dst_u), // %1
2179 "+r"(dst_y), // %2
2180 "+r"(pix) // %3
2181 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2182 : "memory", "cc"
2183#if defined(__SSE2__)
2184 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2185#endif
2186 );
2187}
2188
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002189
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002190void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2191 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002192 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002193 "pcmpeqb %%xmm5,%%xmm5 \n"
2194 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002195 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002196 "1: \n"
2197 "movdqu (%0),%%xmm0 \n"
2198 "movdqu 0x10(%0),%%xmm1 \n"
2199 "lea 0x20(%0),%0 \n"
2200 "pand %%xmm5,%%xmm0 \n"
2201 "pand %%xmm5,%%xmm1 \n"
2202 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002203 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002204 "movdqu %%xmm0,(%1) \n"
2205 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002206 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002207 : "+r"(src_yuy2), // %0
2208 "+r"(dst_y), // %1
2209 "+r"(pix) // %2
2210 :
2211 : "memory", "cc"
2212#if defined(__SSE2__)
2213 , "xmm0", "xmm1", "xmm5"
2214#endif
2215 );
2216}
2217
2218void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2219 int stride_yuy2,
2220 uint8* dst_u, uint8* dst_y,
2221 int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002222 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002223 "pcmpeqb %%xmm5,%%xmm5 \n"
2224 "psrlw $0x8,%%xmm5 \n"
2225 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002226 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002227 "1: \n"
2228 "movdqu (%0),%%xmm0 \n"
2229 "movdqu 0x10(%0),%%xmm1 \n"
2230 "movdqu (%0,%4,1),%%xmm2 \n"
2231 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2232 "lea 0x20(%0),%0 \n"
2233 "pavgb %%xmm2,%%xmm0 \n"
2234 "pavgb %%xmm3,%%xmm1 \n"
2235 "psrlw $0x8,%%xmm0 \n"
2236 "psrlw $0x8,%%xmm1 \n"
2237 "packuswb %%xmm1,%%xmm0 \n"
2238 "movdqa %%xmm0,%%xmm1 \n"
2239 "pand %%xmm5,%%xmm0 \n"
2240 "packuswb %%xmm0,%%xmm0 \n"
2241 "psrlw $0x8,%%xmm1 \n"
2242 "packuswb %%xmm1,%%xmm1 \n"
2243 "movq %%xmm0,(%1) \n"
2244 "movq %%xmm1,(%1,%2) \n"
2245 "lea 0x8(%1),%1 \n"
2246 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002247 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002248 : "+r"(src_yuy2), // %0
2249 "+r"(dst_u), // %1
2250 "+r"(dst_y), // %2
2251 "+r"(pix) // %3
2252 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2253 : "memory", "cc"
2254#if defined(__SSE2__)
2255 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2256#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002257 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002258}
2259
2260void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002261 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002262 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002263 "1: \n"
2264 "movdqa (%0),%%xmm0 \n"
2265 "movdqa 0x10(%0),%%xmm1 \n"
2266 "lea 0x20(%0),%0 \n"
2267 "psrlw $0x8,%%xmm0 \n"
2268 "psrlw $0x8,%%xmm1 \n"
2269 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002270 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002271 "movdqa %%xmm0,(%1) \n"
2272 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002273 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002274 : "+r"(src_uyvy), // %0
2275 "+r"(dst_y), // %1
2276 "+r"(pix) // %2
2277 :
2278 : "memory", "cc"
2279#if defined(__SSE2__)
2280 , "xmm0", "xmm1"
2281#endif
2282 );
2283}
2284
2285void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2286 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002287 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002288 "pcmpeqb %%xmm5,%%xmm5 \n"
2289 "psrlw $0x8,%%xmm5 \n"
2290 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002291 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002292 "1: \n"
2293 "movdqa (%0),%%xmm0 \n"
2294 "movdqa 0x10(%0),%%xmm1 \n"
2295 "movdqa (%0,%4,1),%%xmm2 \n"
2296 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2297 "lea 0x20(%0),%0 \n"
2298 "pavgb %%xmm2,%%xmm0 \n"
2299 "pavgb %%xmm3,%%xmm1 \n"
2300 "pand %%xmm5,%%xmm0 \n"
2301 "pand %%xmm5,%%xmm1 \n"
2302 "packuswb %%xmm1,%%xmm0 \n"
2303 "movdqa %%xmm0,%%xmm1 \n"
2304 "pand %%xmm5,%%xmm0 \n"
2305 "packuswb %%xmm0,%%xmm0 \n"
2306 "psrlw $0x8,%%xmm1 \n"
2307 "packuswb %%xmm1,%%xmm1 \n"
2308 "movq %%xmm0,(%1) \n"
2309 "movq %%xmm1,(%1,%2) \n"
2310 "lea 0x8(%1),%1 \n"
2311 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002312 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002313 : "+r"(src_uyvy), // %0
2314 "+r"(dst_u), // %1
2315 "+r"(dst_y), // %2
2316 "+r"(pix) // %3
2317 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2318 : "memory", "cc"
2319#if defined(__SSE2__)
2320 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2321#endif
2322 );
2323}
2324
2325void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2326 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002327 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002328 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002329 "1: \n"
2330 "movdqu (%0),%%xmm0 \n"
2331 "movdqu 0x10(%0),%%xmm1 \n"
2332 "lea 0x20(%0),%0 \n"
2333 "psrlw $0x8,%%xmm0 \n"
2334 "psrlw $0x8,%%xmm1 \n"
2335 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002336 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002337 "movdqu %%xmm0,(%1) \n"
2338 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002339 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002340 : "+r"(src_uyvy), // %0
2341 "+r"(dst_y), // %1
2342 "+r"(pix) // %2
2343 :
2344 : "memory", "cc"
2345#if defined(__SSE2__)
2346 , "xmm0", "xmm1"
2347#endif
2348 );
2349}
2350
2351void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2352 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002353 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002354 "pcmpeqb %%xmm5,%%xmm5 \n"
2355 "psrlw $0x8,%%xmm5 \n"
2356 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002357 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002358 "1: \n"
2359 "movdqu (%0),%%xmm0 \n"
2360 "movdqu 0x10(%0),%%xmm1 \n"
2361 "movdqu (%0,%4,1),%%xmm2 \n"
2362 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2363 "lea 0x20(%0),%0 \n"
2364 "pavgb %%xmm2,%%xmm0 \n"
2365 "pavgb %%xmm3,%%xmm1 \n"
2366 "pand %%xmm5,%%xmm0 \n"
2367 "pand %%xmm5,%%xmm1 \n"
2368 "packuswb %%xmm1,%%xmm0 \n"
2369 "movdqa %%xmm0,%%xmm1 \n"
2370 "pand %%xmm5,%%xmm0 \n"
2371 "packuswb %%xmm0,%%xmm0 \n"
2372 "psrlw $0x8,%%xmm1 \n"
2373 "packuswb %%xmm1,%%xmm1 \n"
2374 "movq %%xmm0,(%1) \n"
2375 "movq %%xmm1,(%1,%2) \n"
2376 "lea 0x8(%1),%1 \n"
2377 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002378 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002379 : "+r"(src_uyvy), // %0
2380 "+r"(dst_u), // %1
2381 "+r"(dst_y), // %2
2382 "+r"(pix) // %3
2383 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2384 : "memory", "cc"
2385#if defined(__SSE2__)
2386 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2387#endif
2388 );
2389}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002390#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002391
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002392#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002393// Blend 8 pixels at a time.
2394// src_argb0 unaligned.
2395// src_argb1 and dst_argb aligned to 16 bytes.
2396// width must be multiple of 4 pixels.
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002397void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.comc757f302012-04-03 00:49:16 +00002398 uint8* dst_argb, int width) {
2399 asm volatile (
2400 "pcmpeqb %%xmm7,%%xmm7 \n"
2401 "psrlw $0xf,%%xmm7 \n"
2402 "pcmpeqb %%xmm6,%%xmm6 \n"
2403 "psrlw $0x8,%%xmm6 \n"
2404 "pcmpeqb %%xmm5,%%xmm5 \n"
2405 "psllw $0x8,%%xmm5 \n"
2406 "pcmpeqb %%xmm4,%%xmm4 \n"
2407 "pslld $0x18,%%xmm4 \n"
2408
2409 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002410 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002411 "1: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002412 "movdqu (%0),%%xmm3 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002413 "movdqa %%xmm3,%%xmm0 \n"
2414 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002415 "movdqu (%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002416 "psrlw $0x8,%%xmm3 \n"
2417 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2418 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2419 "pand %%xmm6,%%xmm2 \n"
2420 "paddw %%xmm7,%%xmm3 \n"
2421 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002422 "movdqu (%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002423 "psrlw $0x8,%%xmm1 \n"
2424 "por %%xmm4,%%xmm0 \n"
2425 "pmullw %%xmm3,%%xmm1 \n"
2426 "movdqu 0x10(%0),%%xmm3 \n"
2427 "lea 0x20(%0),%0 \n"
2428 "psrlw $0x8,%%xmm2 \n"
2429 "paddusb %%xmm2,%%xmm0 \n"
2430 "pand %%xmm5,%%xmm1 \n"
2431 "paddusb %%xmm1,%%xmm0 \n"
2432 "sub $0x4,%3 \n"
2433 "movdqa %%xmm0,(%2) \n"
2434 "jle 9f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002435 "movdqa %%xmm3,%%xmm0 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002436 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002437 "movdqu 0x10(%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002438 "psrlw $0x8,%%xmm3 \n"
2439 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2440 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2441 "pand %%xmm6,%%xmm2 \n"
2442 "paddw %%xmm7,%%xmm3 \n"
2443 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002444 "movdqu 0x10(%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002445 "lea 0x20(%1),%1 \n"
2446 "psrlw $0x8,%%xmm1 \n"
2447 "por %%xmm4,%%xmm0 \n"
2448 "pmullw %%xmm3,%%xmm1 \n"
2449 "psrlw $0x8,%%xmm2 \n"
2450 "paddusb %%xmm2,%%xmm0 \n"
2451 "pand %%xmm5,%%xmm1 \n"
2452 "paddusb %%xmm1,%%xmm0 \n"
2453 "sub $0x4,%3 \n"
2454 "movdqa %%xmm0,0x10(%2) \n"
2455 "lea 0x20(%2),%2 \n"
2456 "jg 1b \n"
2457 "9: \n"
2458 : "+r"(src_argb0), // %0
2459 "+r"(src_argb1), // %1
2460 "+r"(dst_argb), // %2
2461 "+r"(width) // %3
2462 :
2463 : "memory", "cc"
2464#if defined(__SSE2__)
2465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2466#endif
2467 );
2468}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002469#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002470
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002471#ifdef HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002472// Blend 1 pixel at a time, unaligned
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002473void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com96af8702012-04-06 18:22:27 +00002474 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002475 asm volatile (
2476 "pcmpeqb %%xmm7,%%xmm7 \n"
2477 "psrlw $0xf,%%xmm7 \n"
2478 "pcmpeqb %%xmm6,%%xmm6 \n"
2479 "psrlw $0x8,%%xmm6 \n"
2480 "pcmpeqb %%xmm5,%%xmm5 \n"
2481 "psllw $0x8,%%xmm5 \n"
2482 "pcmpeqb %%xmm4,%%xmm4 \n"
2483 "pslld $0x18,%%xmm4 \n"
2484
2485 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002486 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002487 "1: \n"
2488 "movd (%0),%%xmm3 \n"
2489 "lea 0x4(%0),%0 \n"
2490 "movdqa %%xmm3,%%xmm0 \n"
2491 "pxor %%xmm4,%%xmm3 \n"
2492 "movd (%1),%%xmm2 \n"
2493 "psrlw $0x8,%%xmm3 \n"
2494 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2495 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2496 "pand %%xmm6,%%xmm2 \n"
2497 "paddw %%xmm7,%%xmm3 \n"
2498 "pmullw %%xmm3,%%xmm2 \n"
2499 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002500 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002501 "psrlw $0x8,%%xmm1 \n"
2502 "por %%xmm4,%%xmm0 \n"
2503 "pmullw %%xmm3,%%xmm1 \n"
2504 "psrlw $0x8,%%xmm2 \n"
2505 "paddusb %%xmm2,%%xmm0 \n"
2506 "pand %%xmm5,%%xmm1 \n"
2507 "paddusb %%xmm1,%%xmm0 \n"
2508 "sub $0x1,%3 \n"
2509 "movd %%xmm0,(%2) \n"
2510 "lea 0x4(%2),%2 \n"
2511 "jg 1b \n"
2512 : "+r"(src_argb0), // %0
2513 "+r"(src_argb1), // %1
2514 "+r"(dst_argb), // %2
2515 "+r"(width) // %3
2516 :
2517 : "memory", "cc"
2518#if defined(__SSE2__)
2519 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2520#endif
2521 );
2522}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002523#endif // HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002524
fbarchard@google.com96af8702012-04-06 18:22:27 +00002525#ifdef HAS_ARGBBLENDROW_SSSE3
// pshufb table that broadcasts each pixel's alpha byte (source offsets 3, 7,
// 11, 15) into the low byte of both 16-bit lanes belonging to that pixel.
// The 0x80 entries make pshufb write zero, clearing the high byte of each lane.
CONST uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
2531void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002532 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002533 asm volatile (
2534 "pcmpeqb %%xmm7,%%xmm7 \n"
2535 "psrlw $0xf,%%xmm7 \n"
2536 "pcmpeqb %%xmm6,%%xmm6 \n"
2537 "psrlw $0x8,%%xmm6 \n"
2538 "pcmpeqb %%xmm5,%%xmm5 \n"
2539 "psllw $0x8,%%xmm5 \n"
2540 "pcmpeqb %%xmm4,%%xmm4 \n"
2541 "pslld $0x18,%%xmm4 \n"
2542
2543 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002544 ".p2align 4 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002545 "1: \n"
2546 "movdqu (%0),%%xmm3 \n"
2547 "movdqa %%xmm3,%%xmm0 \n"
2548 "pxor %%xmm4,%%xmm3 \n"
2549 "pshufb %4,%%xmm3 \n"
2550 "movdqu (%1),%%xmm2 \n"
2551 "pand %%xmm6,%%xmm2 \n"
2552 "paddw %%xmm7,%%xmm3 \n"
2553 "pmullw %%xmm3,%%xmm2 \n"
2554 "movdqu (%1),%%xmm1 \n"
2555 "psrlw $0x8,%%xmm1 \n"
2556 "por %%xmm4,%%xmm0 \n"
2557 "pmullw %%xmm3,%%xmm1 \n"
2558 "movdqu 0x10(%0),%%xmm3 \n"
2559 "lea 0x20(%0),%0 \n"
2560 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002561 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002562 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002563 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002564 "sub $0x4,%3 \n"
2565 "movdqa %%xmm0,(%2) \n"
2566 "jle 9f \n"
2567 "movdqa %%xmm3,%%xmm0 \n"
2568 "pxor %%xmm4,%%xmm3 \n"
2569 "movdqu 0x10(%1),%%xmm2 \n"
2570 "pshufb %4,%%xmm3 \n"
2571 "pand %%xmm6,%%xmm2 \n"
2572 "paddw %%xmm7,%%xmm3 \n"
2573 "pmullw %%xmm3,%%xmm2 \n"
2574 "movdqu 0x10(%1),%%xmm1 \n"
2575 "lea 0x20(%1),%1 \n"
2576 "psrlw $0x8,%%xmm1 \n"
2577 "por %%xmm4,%%xmm0 \n"
2578 "pmullw %%xmm3,%%xmm1 \n"
2579 "psrlw $0x8,%%xmm2 \n"
2580 "paddusb %%xmm2,%%xmm0 \n"
2581 "pand %%xmm5,%%xmm1 \n"
2582 "paddusb %%xmm1,%%xmm0 \n"
2583 "sub $0x4,%3 \n"
2584 "movdqa %%xmm0,0x10(%2) \n"
2585 "lea 0x20(%2),%2 \n"
2586 "jg 1b \n"
2587 "9: \n"
2588 : "+r"(src_argb0), // %0
2589 "+r"(src_argb1), // %1
2590 "+r"(dst_argb), // %2
2591 "+r"(width) // %3
2592 : "m"(kShuffleAlpha) // %4
2593 : "memory", "cc"
2594#if defined(__SSE2__)
2595 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2596#endif
2597 );
2598}
2599#endif // HAS_ARGBBLENDROW_SSSE3
2600
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002601
2602#ifdef HAS_ARGBBLENDROW1_SSSE3
2603// Blend 1 pixel at a time, unaligned
2604void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2605 uint8* dst_argb, int width) {
2606 asm volatile (
2607 "pcmpeqb %%xmm7,%%xmm7 \n"
2608 "psrlw $0xf,%%xmm7 \n"
2609 "pcmpeqb %%xmm6,%%xmm6 \n"
2610 "psrlw $0x8,%%xmm6 \n"
2611 "pcmpeqb %%xmm5,%%xmm5 \n"
2612 "psllw $0x8,%%xmm5 \n"
2613 "pcmpeqb %%xmm4,%%xmm4 \n"
2614 "pslld $0x18,%%xmm4 \n"
2615
2616 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002617 ".p2align 4 \n"
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002618 "1: \n"
2619 "movd (%0),%%xmm3 \n"
2620 "lea 0x4(%0),%0 \n"
2621 "movdqa %%xmm3,%%xmm0 \n"
2622 "pxor %%xmm4,%%xmm3 \n"
2623 "movd (%1),%%xmm2 \n"
2624 "pshufb %4,%%xmm3 \n"
2625 "pand %%xmm6,%%xmm2 \n"
2626 "paddw %%xmm7,%%xmm3 \n"
2627 "pmullw %%xmm3,%%xmm2 \n"
2628 "movd (%1),%%xmm1 \n"
2629 "lea 0x4(%1),%1 \n"
2630 "psrlw $0x8,%%xmm1 \n"
2631 "por %%xmm4,%%xmm0 \n"
2632 "pmullw %%xmm3,%%xmm1 \n"
2633 "psrlw $0x8,%%xmm2 \n"
2634 "paddusb %%xmm2,%%xmm0 \n"
2635 "pand %%xmm5,%%xmm1 \n"
2636 "paddusb %%xmm1,%%xmm0 \n"
2637 "sub $0x1,%3 \n"
2638 "movd %%xmm0,(%2) \n"
2639 "lea 0x4(%2),%2 \n"
2640 "jg 1b \n"
2641 : "+r"(src_argb0), // %0
2642 "+r"(src_argb1), // %1
2643 "+r"(dst_argb), // %2
2644 "+r"(width) // %3
2645 : "m"(kShuffleAlpha) // %4
2646 : "memory", "cc"
2647#if defined(__SSE2__)
2648 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2649#endif
2650 );
2651}
2652#endif // HAS_ARGBBLENDROW1_SSSE3
2653
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002654#ifdef HAS_ARGBATTENUATE_SSE2
2655// Attenuate 4 pixels at a time.
2656// aligned to 16 bytes
2657void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2658 asm volatile (
2659 "sub %0,%1 \n"
2660 "pcmpeqb %%xmm4,%%xmm4 \n"
2661 "pslld $0x18,%%xmm4 \n"
2662 "pcmpeqb %%xmm5,%%xmm5 \n"
2663 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002664
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002665 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002666 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002667 "1: \n"
2668 "movdqa (%0),%%xmm0 \n"
2669 "punpcklbw %%xmm0,%%xmm0 \n"
2670 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2671 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2672 "pmulhuw %%xmm2,%%xmm0 \n"
2673 "movdqa (%0),%%xmm1 \n"
2674 "punpckhbw %%xmm1,%%xmm1 \n"
2675 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2676 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2677 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002678 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002679 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002680 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002681 "psrlw $0x8,%%xmm1 \n"
2682 "packuswb %%xmm1,%%xmm0 \n"
2683 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002684 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002685 "sub $0x4,%2 \n"
2686 "movdqa %%xmm0,(%0,%1,1) \n"
2687 "lea 0x10(%0),%0 \n"
2688 "jg 1b \n"
2689 : "+r"(src_argb), // %0
2690 "+r"(dst_argb), // %1
2691 "+r"(width) // %2
2692 :
2693 : "memory", "cc"
2694#if defined(__SSE2__)
2695 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2696#endif
2697 );
2698}
2699#endif // HAS_ARGBATTENUATE_SSE2
2700
fbarchard@google.com810cd912012-04-20 20:15:27 +00002701#ifdef HAS_ARGBATTENUATE_SSSE3
// pshufb tables that duplicate alpha for ARGBAttenuateRow_SSSE3.
// For each pixel the alpha byte is copied into both bytes of the three color
// word lanes; the 128u entries make pshufb write zero, so the fourth (alpha)
// word lane is cleared.
// kShuffleAlpha0 uses the alpha bytes at offsets 3 and 7 (pixels 0 and 1).
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
// kShuffleAlpha1 uses the alpha bytes at offsets 11 and 15 (pixels 2 and 3).
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
2710// Attenuate 4 pixels at a time.
2711// aligned to 16 bytes
2712void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2713 asm volatile (
2714 "sub %0,%1 \n"
2715 "pcmpeqb %%xmm3,%%xmm3 \n"
2716 "pslld $0x18,%%xmm3 \n"
2717 "movdqa %3,%%xmm4 \n"
2718 "movdqa %4,%%xmm5 \n"
2719
2720 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002721 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002722 "1: \n"
2723 "movdqa (%0),%%xmm0 \n"
2724 "pshufb %%xmm4,%%xmm0 \n"
2725 "movdqa (%0),%%xmm1 \n"
2726 "punpcklbw %%xmm1,%%xmm1 \n"
2727 "pmulhuw %%xmm1,%%xmm0 \n"
2728 "movdqa (%0),%%xmm1 \n"
2729 "pshufb %%xmm5,%%xmm1 \n"
2730 "movdqa (%0),%%xmm2 \n"
2731 "punpckhbw %%xmm2,%%xmm2 \n"
2732 "pmulhuw %%xmm2,%%xmm1 \n"
2733 "movdqa (%0),%%xmm2 \n"
2734 "pand %%xmm3,%%xmm2 \n"
2735 "psrlw $0x8,%%xmm0 \n"
2736 "psrlw $0x8,%%xmm1 \n"
2737 "packuswb %%xmm1,%%xmm0 \n"
2738 "por %%xmm2,%%xmm0 \n"
2739 "sub $0x4,%2 \n"
2740 "movdqa %%xmm0,(%0,%1,1) \n"
2741 "lea 0x10(%0),%0 \n"
2742 "jg 1b \n"
2743 : "+r"(src_argb), // %0
2744 "+r"(dst_argb), // %1
2745 "+r"(width) // %2
2746 : "m"(kShuffleAlpha0), // %3
2747 "m"(kShuffleAlpha1) // %4
2748 : "memory", "cc"
2749#if defined(__SSE2__)
2750 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2751#endif
2752 );
2753}
2754#endif // HAS_ARGBATTENUATE_SSSE3
2755
2756#ifdef HAS_ARGBUNATTENUATE_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00002757// Unattenuate 4 pixels at a time.
2758// aligned to 16 bytes
2759void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2760 int width) {
2761 uintptr_t alpha = 0;
2762 asm volatile (
2763 "sub %0,%1 \n"
2764 "pcmpeqb %%xmm4,%%xmm4 \n"
2765 "pslld $0x18,%%xmm4 \n"
2766
2767 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002768 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002769 "1: \n"
2770 "movdqa (%0),%%xmm0 \n"
2771 "movzb 0x3(%0),%3 \n"
2772 "punpcklbw %%xmm0,%%xmm0 \n"
2773 "movd 0x0(%4,%3,4),%%xmm2 \n"
2774 "movzb 0x7(%0),%3 \n"
2775 "movd 0x0(%4,%3,4),%%xmm3 \n"
2776 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2777 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2778 "movlhps %%xmm3,%%xmm2 \n"
2779 "pmulhuw %%xmm2,%%xmm0 \n"
2780 "movdqa (%0),%%xmm1 \n"
2781 "movzb 0xb(%0),%3 \n"
2782 "punpckhbw %%xmm1,%%xmm1 \n"
2783 "movd 0x0(%4,%3,4),%%xmm2 \n"
2784 "movzb 0xf(%0),%3 \n"
2785 "movd 0x0(%4,%3,4),%%xmm3 \n"
2786 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2787 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2788 "movlhps %%xmm3,%%xmm2 \n"
2789 "pmulhuw %%xmm2,%%xmm1 \n"
2790 "movdqa (%0),%%xmm2 \n"
2791 "pand %%xmm4,%%xmm2 \n"
2792 "packuswb %%xmm1,%%xmm0 \n"
2793 "por %%xmm2,%%xmm0 \n"
2794 "sub $0x4,%2 \n"
2795 "movdqa %%xmm0,(%0,%1,1) \n"
2796 "lea 0x10(%0),%0 \n"
2797 "jg 1b \n"
2798 : "+r"(src_argb), // %0
2799 "+r"(dst_argb), // %1
2800 "+r"(width), // %2
2801 "+r"(alpha) // %3
2802 : "r"(fixed_invtbl8) // %4
2803 : "memory", "cc"
2804#if defined(__SSE2__)
2805 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2806#endif
2807 );
2808}
2809#endif // HAS_ARGBUNATTENUATE_SSE2
2810
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002811#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale.  0.11 * B + 0.59 * G + 0.30 * R
// The weights are 7-bit fixed point in B, G, R, A byte order (14 + 76 + 38 =
// 128, alpha weight 0); the consumer shifts the weighted sum right by 7.
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};
2816
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002817// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
2818void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) {
2819 asm volatile (
2820 "movdqa %2,%%xmm4 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002821 // 8 pixel loop \n"
2822 ".p2align 4 \n"
2823 "1: \n"
2824 "movdqa (%0),%%xmm0 \n"
2825 "movdqa 0x10(%0),%%xmm1 \n"
2826 "pmaddubsw %%xmm4,%%xmm0 \n"
2827 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002828 "phaddw %%xmm1,%%xmm0 \n"
2829 "psrlw $0x7,%%xmm0 \n"
2830 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002831 "movdqa (%0),%%xmm2 \n"
2832 "movdqa 0x10(%0),%%xmm3 \n"
2833 "psrld $0x18,%%xmm2 \n"
2834 "psrld $0x18,%%xmm3 \n"
2835 "packuswb %%xmm3,%%xmm2 \n"
2836 "packuswb %%xmm2,%%xmm2 \n"
2837 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002838 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002839 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002840 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002841 "punpcklwd %%xmm3,%%xmm0 \n"
2842 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002843 "sub $0x8,%1 \n"
2844 "movdqa %%xmm0,(%0) \n"
2845 "movdqa %%xmm1,0x10(%0) \n"
2846 "lea 0x20(%0),%0 \n"
2847 "jg 1b \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002848 : "+r"(dst_argb), // %0
2849 "+r"(width) // %1
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002850 : "m"(kARGBToGray) // %2
2851 : "memory", "cc"
2852#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00002853 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002854#endif
2855 );
2856}
2857#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00002858
2859#ifdef HAS_ARGBSEPIAROW_SSSE3
// Sepia tone formulas (7-bit fixed point):
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone.  Each table holds the weights for
// one output channel, laid out in B, G, R, A byte order with alpha weight 0.

// Weights producing the output B channel.
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

// Weights producing the output G channel.
CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

// Weights producing the output R channel.
CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};
2875
2876// Convert 8 ARGB pixels (64 bytes) to 8 Sepia ARGB pixels
2877void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
2878 asm volatile (
2879 "movdqa %2,%%xmm2 \n"
2880 "movdqa %3,%%xmm3 \n"
2881 "movdqa %4,%%xmm4 \n"
2882 // 8 pixel loop \n"
2883 ".p2align 4 \n"
2884 "1: \n"
2885 "movdqa (%0),%%xmm0 \n"
2886 "movdqa 0x10(%0),%%xmm6 \n"
2887 "pmaddubsw %%xmm2,%%xmm0 \n"
2888 "pmaddubsw %%xmm2,%%xmm6 \n"
2889 "phaddw %%xmm6,%%xmm0 \n"
2890 "psrlw $0x7,%%xmm0 \n"
2891 "packuswb %%xmm0,%%xmm0 \n"
2892 "movdqa (%0),%%xmm5 \n"
2893 "movdqa 0x10(%0),%%xmm1 \n"
2894 "pmaddubsw %%xmm3,%%xmm5 \n"
2895 "pmaddubsw %%xmm3,%%xmm1 \n"
2896 "phaddw %%xmm1,%%xmm5 \n"
2897 "psrlw $0x7,%%xmm5 \n"
2898 "packuswb %%xmm5,%%xmm5 \n"
2899 "punpcklbw %%xmm5,%%xmm0 \n"
2900 "movdqa (%0),%%xmm5 \n"
2901 "movdqa 0x10(%0),%%xmm1 \n"
2902 "pmaddubsw %%xmm4,%%xmm5 \n"
2903 "pmaddubsw %%xmm4,%%xmm1 \n"
2904 "phaddw %%xmm1,%%xmm5 \n"
2905 "psrlw $0x7,%%xmm5 \n"
2906 "packuswb %%xmm5,%%xmm5 \n"
2907 "movdqa (%0),%%xmm6 \n"
2908 "movdqa 0x10(%0),%%xmm1 \n"
2909 "psrld $0x18,%%xmm6 \n"
2910 "psrld $0x18,%%xmm1 \n"
2911 "packuswb %%xmm1,%%xmm6 \n"
2912 "packuswb %%xmm6,%%xmm6 \n"
2913 "punpcklbw %%xmm6,%%xmm5 \n"
2914 "movdqa %%xmm0,%%xmm1 \n"
2915 "punpcklwd %%xmm5,%%xmm0 \n"
2916 "punpckhwd %%xmm5,%%xmm1 \n"
2917 "sub $0x8,%1 \n"
2918 "movdqa %%xmm0,(%0) \n"
2919 "movdqa %%xmm1,0x10(%0) \n"
2920 "lea 0x20(%0),%0 \n"
2921 "jg 1b \n"
2922 : "+r"(dst_argb), // %0
2923 "+r"(width) // %1
2924 : "m"(kARGBToSepiaB), // %2
2925 "m"(kARGBToSepiaG), // %3
2926 "m"(kARGBToSepiaR) // %4
2927 : "memory", "cc"
2928#if defined(__SSE2__)
2929 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2930#endif
2931 );
2932}
2933#endif // HAS_ARGBSEPIAROW_SSSE3
2934
fbarchard@google.comf51e8792012-06-10 02:40:04 +00002935#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
2936// Creates a table of cumulative sums where each value is a sum of all values
2937// above and to the left of the value, inclusive of the value.
2938void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
2939 int32* previous_cumsum, int width) {
2940 asm volatile (
2941 "sub %1,%2 \n"
2942 "pxor %%xmm0,%%xmm0 \n"
2943 "pxor %%xmm1,%%xmm1 \n"
2944 "sub $0x4,%3 \n"
2945 "jl 49f \n"
2946 "test $0xf,%1 \n"
2947 "jne 49f \n"
2948
2949 // 4 pixel loop \n"
2950 ".p2align 2 \n"
2951 "40: \n"
2952 "movdqu (%0),%%xmm2 \n"
2953 "lea 0x10(%0),%0 \n"
2954 "movdqa %%xmm2,%%xmm4 \n"
2955 "punpcklbw %%xmm1,%%xmm2 \n"
2956 "movdqa %%xmm2,%%xmm3 \n"
2957 "punpcklwd %%xmm1,%%xmm2 \n"
2958 "punpckhwd %%xmm1,%%xmm3 \n"
2959 "punpckhbw %%xmm1,%%xmm4 \n"
2960 "movdqa %%xmm4,%%xmm5 \n"
2961 "punpcklwd %%xmm1,%%xmm4 \n"
2962 "punpckhwd %%xmm1,%%xmm5 \n"
2963 "paddd %%xmm2,%%xmm0 \n"
2964 "movdqa (%1,%2,1),%%xmm2 \n"
2965 "paddd %%xmm0,%%xmm2 \n"
2966 "paddd %%xmm3,%%xmm0 \n"
2967 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
2968 "paddd %%xmm0,%%xmm3 \n"
2969 "paddd %%xmm4,%%xmm0 \n"
2970 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
2971 "paddd %%xmm0,%%xmm4 \n"
2972 "paddd %%xmm5,%%xmm0 \n"
2973 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
2974 "paddd %%xmm0,%%xmm5 \n"
2975 "movdqa %%xmm2,(%1) \n"
2976 "movdqa %%xmm3,0x10(%1) \n"
2977 "movdqa %%xmm4,0x20(%1) \n"
2978 "movdqa %%xmm5,0x30(%1) \n"
2979 "lea 0x40(%1),%1 \n"
2980 "sub $0x4,%3 \n"
2981 "jge 40b \n"
2982
2983 "49: \n"
2984 "add $0x3,%3 \n"
2985 "jl 19f \n"
2986
2987 // 1 pixel loop \n"
2988 ".p2align 2 \n"
2989 "10: \n"
2990 "movd (%0),%%xmm2 \n"
2991 "lea 0x4(%0),%0 \n"
fbarchard@google.comf38aefe2012-06-12 00:29:29 +00002992 "punpcklbw %%xmm1,%%xmm2 \n"
2993 "punpcklwd %%xmm1,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00002994 "paddd %%xmm2,%%xmm0 \n"
2995 "movdqu (%1,%2,1),%%xmm2 \n"
2996 "paddd %%xmm0,%%xmm2 \n"
2997 "movdqu %%xmm2,(%1) \n"
2998 "lea 0x10(%1),%1 \n"
2999 "sub $0x1,%3 \n"
3000 "jge 10b \n"
3001
3002 "19: \n"
3003 : "+r"(row), // %0
3004 "+r"(cumsum), // %1
3005 "+r"(previous_cumsum), // %2
3006 "+r"(width) // %3
3007 :
3008 : "memory", "cc"
3009#if defined(__SSE2__)
3010 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3011#endif
3012 );
3013}
3014#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
3015
3016#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
3017void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3018 int width, int area, uint8* dst, int count) {
3019 asm volatile (
3020 "movd %5,%%xmm4 \n"
3021 "cvtdq2ps %%xmm4,%%xmm4 \n"
3022 "rcpss %%xmm4,%%xmm4 \n"
3023 "pshufd $0x0,%%xmm4,%%xmm4 \n"
3024 "sub $0x4,%3 \n"
3025 "jl 49f \n"
3026
3027 // 4 pixel loop \n"
3028 ".p2align 2 \n"
3029 "40: \n"
3030 "movdqa (%0),%%xmm0 \n"
3031 "movdqa 0x10(%0),%%xmm1 \n"
3032 "movdqa 0x20(%0),%%xmm2 \n"
3033 "movdqa 0x30(%0),%%xmm3 \n"
3034 "psubd (%0,%4,4),%%xmm0 \n"
3035 "psubd 0x10(%0,%4,4),%%xmm1 \n"
3036 "psubd 0x20(%0,%4,4),%%xmm2 \n"
3037 "psubd 0x30(%0,%4,4),%%xmm3 \n"
3038 "lea 0x40(%0),%0 \n"
3039 "psubd (%1),%%xmm0 \n"
3040 "psubd 0x10(%1),%%xmm1 \n"
3041 "psubd 0x20(%1),%%xmm2 \n"
3042 "psubd 0x30(%1),%%xmm3 \n"
3043 "paddd (%1,%4,4),%%xmm0 \n"
3044 "paddd 0x10(%1,%4,4),%%xmm1 \n"
3045 "paddd 0x20(%1,%4,4),%%xmm2 \n"
3046 "paddd 0x30(%1,%4,4),%%xmm3 \n"
3047 "lea 0x40(%1),%1 \n"
3048 "cvtdq2ps %%xmm0,%%xmm0 \n"
3049 "cvtdq2ps %%xmm1,%%xmm1 \n"
3050 "mulps %%xmm4,%%xmm0 \n"
3051 "mulps %%xmm4,%%xmm1 \n"
3052 "cvtdq2ps %%xmm2,%%xmm2 \n"
3053 "cvtdq2ps %%xmm3,%%xmm3 \n"
3054 "mulps %%xmm4,%%xmm2 \n"
3055 "mulps %%xmm4,%%xmm3 \n"
3056 "cvtps2dq %%xmm0,%%xmm0 \n"
3057 "cvtps2dq %%xmm1,%%xmm1 \n"
3058 "cvtps2dq %%xmm2,%%xmm2 \n"
3059 "cvtps2dq %%xmm3,%%xmm3 \n"
3060 "packssdw %%xmm1,%%xmm0 \n"
3061 "packssdw %%xmm3,%%xmm2 \n"
3062 "packuswb %%xmm2,%%xmm0 \n"
3063 "movdqu %%xmm0,(%2) \n"
3064 "lea 0x10(%2),%2 \n"
3065 "sub $0x4,%3 \n"
3066 "jge 40b \n"
3067
3068 "49: \n"
3069 "add $0x3,%3 \n"
3070 "jl 19f \n"
3071
3072 // 1 pixel loop \n"
3073 ".p2align 2 \n"
3074 "10: \n"
3075 "movdqa (%0),%%xmm0 \n"
3076 "psubd (%0,%4,4),%%xmm0 \n"
3077 "lea 0x10(%0),%0 \n"
3078 "psubd (%1),%%xmm0 \n"
3079 "paddd (%1,%4,4),%%xmm0 \n"
3080 "lea 0x10(%1),%1 \n"
3081 "cvtdq2ps %%xmm0,%%xmm0 \n"
3082 "mulps %%xmm4,%%xmm0 \n"
3083 "cvtps2dq %%xmm0,%%xmm0 \n"
3084 "packssdw %%xmm0,%%xmm0 \n"
3085 "packuswb %%xmm0,%%xmm0 \n"
3086 "movd %%xmm0,(%2) \n"
3087 "lea 0x4(%2),%2 \n"
3088 "sub $0x1,%3 \n"
3089 "jge 10b \n"
3090 "19: \n"
3091 : "+r"(topleft), // %0
3092 "+r"(botleft), // %1
3093 "+r"(dst), // %2
3094 "+rm"(count) // %3
3095 : "r"(static_cast<intptr_t>(width)), // %4
3096 "rm"(area) // %5
3097 : "memory", "cc"
3098#if defined(__SSE2__)
3099 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3100#endif
3101 );
3102}
3103#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
3104
3105
fbarchard@google.com2d11d432012-02-16 02:50:39 +00003106#endif // defined(__x86_64__) || defined(__i386__)
3107
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003108#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003109} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003110} // namespace libyuv
3111#endif