/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "source/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC 4.2 on OSX produces a link error when static or const data is passed
// to inline assembly.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Constants for BGRA
CONST vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

CONST vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

CONST vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
CONST vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

CONST vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

CONST vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
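
// Added illustrative sketch (these helpers are ours, not used by the SIMD
// rows below): scalar reference of the fixed-point math encoded by
// kARGBToY/kARGBToU/kARGBToV and the kAddY16/kAddUV128 biases. The vector
// code computes the same values with pmaddubsw/phaddw on 16 pixels at a
// time, and the UV rows additionally average 2x2 blocks of pixels first.
static inline uint8 RGBToYRef(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((33 * r + 65 * g + 13 * b) >> 7) + 16);
}
static inline uint8 RGBToURef(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static inline uint8 RGBToVRef(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}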

// Shuffle table for converting RGB24 to ARGB.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ARGB to RGB24.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

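// Added illustrative sketch (our own helper, not part of the original file):
// how the shuffle tables above are applied. pshufb writes
// dst[i] = src[mask[i] & 15], or 0 when the mask byte has its high bit set
// (the 128u entries). For example, kShuffleMaskABGRToARGB swaps bytes 0 and 2
// of every 4-byte pixel, exchanging the R and B channels.
static inline void ShuffleBytesRef(const uint8* src, const uint8* mask,
                                   uint8* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}
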
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    ".p2align 4 \n"
    "1: \n"
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "lea 0x20(%1),%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),    // %0
    "+r"(dst_argb), // %1
    "+r"(pix)       // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_abgr), // %0
    "+r"(dst_argb), // %1
    "+r"(pix)       // %2
  : "m"(kShuffleMaskABGRToARGB) // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm5"
#endif
  );
}

void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_bgra), // %0
    "+r"(dst_argb), // %1
    "+r"(pix)       // %2
  : "m"(kShuffleMaskBGRAToARGB) // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm5"
#endif
  );
}

void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_rgb24), // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRGB24ToARGB) // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_raw),  // %0
    "+r"(dst_argb), // %1
    "+r"(pix)       // %2
  : "m"(kShuffleMaskRAWToARGB) // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

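// Added illustrative sketch (our own helper, not used by the row functions):
// scalar equivalent of the RGB565 unpack above. A 5-bit channel widens to
// 8 bits by bit replication, (x << 3) | (x >> 2), and a 6-bit channel by
// (x << 2) | (x >> 4); the pmulhuw constants 0x0108 and 0x2080 perform the
// same replication in the vector path.
static inline void RGB565ToARGBRef(uint16 rgb565,
                                   uint8* b, uint8* g, uint8* r) {
  uint8 b5 = rgb565 & 0x1f;
  uint8 g6 = (rgb565 >> 5) & 0x3f;
  uint8 r5 = (rgb565 >> 11) & 0x1f;
  *b = static_cast<uint8>((b5 << 3) | (b5 >> 2));
  *g = static_cast<uint8>((g6 << 2) | (g6 >> 4));
  *r = static_cast<uint8>((r5 << 3) | (r5 >> 2));
}
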
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%1,%0,2) \n"
    "movdqa %%xmm1,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix)  // %2
  : "m"(kShuffleMaskARGBToRGB24) // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix)  // %2
  : "m"(kShuffleMaskARGBToRAW) // %3
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y),    // %1
    "+r"(pix)       // %2
  : "m"(kARGBToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y),    // %1
    "+r"(pix)       // %2
  : "m"(kARGBToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// TODO(fbarchard): Pass the xmm constants to a single block of assembly.
// With fpic, GCC 4.2 for OSX runs out of GPR registers: "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if the stack frame is disabled. Using 2 assembly blocks is a
// workaround, and is considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra), // %0
    "+r"(dst_y),    // %1
    "+r"(pix)       // %2
  : "m"(kBGRAToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra), // %0
    "+r"(dst_y),    // %1
    "+r"(pix)       // %2
  : "m"(kBGRAToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr), // %0
    "+r"(dst_y),    // %1
    "+r"(pix)       // %2
  : "m"(kABGRToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr), // %0
    "+r"(dst_y),    // %1
    "+r"(pix)       // %2
  : "m"(kABGRToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I422TOARGBROW_SSSE3
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

struct {
  vec8 kUVToB;
  vec8 kUVToG;
  vec8 kUVToR;
  vec16 kUVBiasB;
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;
  vec16 kYToRgb;
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};

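// Added illustrative sketch (our own helpers, not used by the row functions):
// scalar equivalent of the YUV*TORGB macros below, using the same 6-bit
// fixed-point constants. Each channel is (y - 16) * YG plus the U/V terms
// minus the 128 bias, shifted right by 6 and clamped to 0..255 (packuswb
// does the clamping in the vector code; the sketch ignores the intermediate
// 16-bit saturation of psubsw/paddsw).
static inline uint8 YuvClampRef(int v) {
  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static inline void YuvToRgbRef(uint8 y, uint8 u, uint8 v,
                               uint8* r, uint8* g, uint8* b) {
  int y1 = (static_cast<int>(y) - 16) * YG;
  *b = YuvClampRef((y1 + UB * (u - 128) + VB * (v - 128)) >> 6);
  *g = YuvClampRef((y1 + UG * (u - 128) + VG * (v - 128)) >> 6);
  *r = YuvClampRef((y1 + UR * (u - 128) + VR * (v - 128)) >> 6);
}
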
// Convert 8 pixels: 8 UV and 8 Y
#define YUV444TORGB \
    "movq (%1),%%xmm0 \n" \
    "movq (%1,%2,1),%%xmm1 \n" \
    "lea 0x8(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

// Convert 8 pixels: 4 UV and 8 Y
#define YUV422TORGB \
    "movd (%1),%%xmm0 \n" \
    "movd (%1,%2,1),%%xmm1 \n" \
    "lea 0x4(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

// Convert 8 pixels: 2 UV and 8 Y
#define YUV411TORGB \
    "movd (%1),%%xmm0 \n" \
    "movd (%1,%2,1),%%xmm1 \n" \
    "lea 0x2(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUV444TORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(argb_buf), // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUV422TORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(argb_buf), // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUV411TORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(argb_buf), // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* argb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUV444TORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%3) \n"
    "movdqu %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(argb_buf), // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

1484void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1485 const uint8* u_buf,
1486 const uint8* v_buf,
1487 uint8* argb_buf,
1488 int width) {
1489 asm volatile (
1490 "sub %1,%2 \n"
1491 "pcmpeqb %%xmm5,%%xmm5 \n"
1492 "pxor %%xmm4,%%xmm4 \n"
1493 ".p2align 4 \n"
1494 "1: \n"
1495 YUV422TORGB
1496 "punpcklbw %%xmm1,%%xmm0 \n"
1497 "punpcklbw %%xmm5,%%xmm2 \n"
1498 "movdqa %%xmm0,%%xmm1 \n"
1499 "punpcklwd %%xmm2,%%xmm0 \n"
1500 "punpckhwd %%xmm2,%%xmm1 \n"
1501 "movdqu %%xmm0,(%3) \n"
1502 "movdqu %%xmm1,0x10(%3) \n"
1503 "lea 0x20(%3),%3 \n"
1504 "sub $0x8,%4 \n"
1505 "jg 1b \n"
1506 : "+r"(y_buf), // %0
1507 "+r"(u_buf), // %1
1508 "+r"(v_buf), // %2
1509 "+r"(argb_buf), // %3
1510 "+rm"(width) // %4
1511 : "r"(&kYuvConstants.kUVToB) // %5
1512 : "memory", "cc"
1513#if defined(__SSE2__)
1514 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1515#endif
1516 );
1517}
1518
1519void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1520 const uint8* u_buf,
1521 const uint8* v_buf,
1522 uint8* argb_buf,
1523 int width) {
1524 asm volatile (
1525 "sub %1,%2 \n"
1526 "pcmpeqb %%xmm5,%%xmm5 \n"
1527 "pxor %%xmm4,%%xmm4 \n"
1528 ".p2align 4 \n"
1529 "1: \n"
1530 YUV411TORGB
1531 "punpcklbw %%xmm1,%%xmm0 \n"
1532 "punpcklbw %%xmm5,%%xmm2 \n"
1533 "movdqa %%xmm0,%%xmm1 \n"
1534 "punpcklwd %%xmm2,%%xmm0 \n"
1535 "punpckhwd %%xmm2,%%xmm1 \n"
1536 "movdqu %%xmm0,(%3) \n"
1537 "movdqu %%xmm1,0x10(%3) \n"
1538 "lea 0x20(%3),%3 \n"
1539 "sub $0x8,%4 \n"
1540 "jg 1b \n"
1541 : "+r"(y_buf), // %0
1542 "+r"(u_buf), // %1
1543 "+r"(v_buf), // %2
1544 "+r"(argb_buf), // %3
1545 "+rm"(width) // %4
1546 : "r"(&kYuvConstants.kUVToB) // %5
1547 : "memory", "cc"
1548#if defined(__SSE2__)
1549 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1550#endif
1551 );
1552}
1553
1554void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1555 const uint8* u_buf,
1556 const uint8* v_buf,
1557 uint8* bgra_buf,
1558 int width) {
1559 asm volatile (
1560 "sub %1,%2 \n"
1561 "pcmpeqb %%xmm5,%%xmm5 \n"
1562 "pxor %%xmm4,%%xmm4 \n"
1563 ".p2align 4 \n"
1564 "1: \n"
1565 YUV422TORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001566 "pcmpeqb %%xmm5,%%xmm5 \n"
1567 "punpcklbw %%xmm0,%%xmm1 \n"
1568 "punpcklbw %%xmm2,%%xmm5 \n"
1569 "movdqa %%xmm5,%%xmm0 \n"
1570 "punpcklwd %%xmm1,%%xmm5 \n"
1571 "punpckhwd %%xmm1,%%xmm0 \n"
1572 "movdqa %%xmm5,(%3) \n"
1573 "movdqa %%xmm0,0x10(%3) \n"
1574 "lea 0x20(%3),%3 \n"
1575 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001576 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001577 : "+r"(y_buf), // %0
1578 "+r"(u_buf), // %1
1579 "+r"(v_buf), // %2
fbarchard@google.come214fe32012-06-04 23:47:11 +00001580 "+r"(bgra_buf), // %3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001581 "+rm"(width) // %4
1582 : "r"(&kYuvConstants.kUVToB) // %5
1583 : "memory", "cc"
1584#if defined(__SSE2__)
1585 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1586#endif
1587 );
1588}
1589
fbarchard@google.come214fe32012-06-04 23:47:11 +00001590void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001591 const uint8* u_buf,
1592 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001593 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001594 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001595 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001596 "sub %1,%2 \n"
1597 "pcmpeqb %%xmm5,%%xmm5 \n"
1598 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001599 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001600 "1: \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001601 YUV422TORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001602 "punpcklbw %%xmm1,%%xmm2 \n"
1603 "punpcklbw %%xmm5,%%xmm0 \n"
1604 "movdqa %%xmm2,%%xmm1 \n"
1605 "punpcklwd %%xmm0,%%xmm2 \n"
1606 "punpckhwd %%xmm0,%%xmm1 \n"
1607 "movdqa %%xmm2,(%3) \n"
1608 "movdqa %%xmm1,0x10(%3) \n"
1609 "lea 0x20(%3),%3 \n"
1610 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001611 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001612 : "+r"(y_buf), // %0
1613 "+r"(u_buf), // %1
1614 "+r"(v_buf), // %2
fbarchard@google.come214fe32012-06-04 23:47:11 +00001615 "+r"(abgr_buf), // %3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001616 "+rm"(width) // %4
1617 : "r"(&kYuvConstants.kUVToB) // %5
1618 : "memory", "cc"
1619#if defined(__SSE2__)
1620 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1621#endif
1622 );
1623}
1624
fbarchard@google.come214fe32012-06-04 23:47:11 +00001625void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001626 const uint8* u_buf,
1627 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001628 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001629 int width) {
1630 asm volatile (
1631 "sub %1,%2 \n"
1632 "pcmpeqb %%xmm5,%%xmm5 \n"
1633 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001634 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001635 "1: \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001636 YUV422TORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001637 "pcmpeqb %%xmm5,%%xmm5 \n"
1638 "punpcklbw %%xmm0,%%xmm1 \n"
1639 "punpcklbw %%xmm2,%%xmm5 \n"
1640 "movdqa %%xmm5,%%xmm0 \n"
1641 "punpcklwd %%xmm1,%%xmm5 \n"
1642 "punpckhwd %%xmm1,%%xmm0 \n"
1643 "movdqu %%xmm5,(%3) \n"
1644 "movdqu %%xmm0,0x10(%3) \n"
1645 "lea 0x20(%3),%3 \n"
1646 "sub $0x8,%4 \n"
1647 "jg 1b \n"
1648 : "+r"(y_buf), // %0
1649 "+r"(u_buf), // %1
1650 "+r"(v_buf), // %2
fbarchard@google.come214fe32012-06-04 23:47:11 +00001651 "+r"(bgra_buf), // %3
fbarchard@google.com952a5072012-03-30 18:10:50 +00001652 "+rm"(width) // %4
1653 : "r"(&kYuvConstants.kUVToB) // %5
1654 : "memory", "cc"
1655#if defined(__SSE2__)
1656 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1657#endif
1658 );
1659}
1660
fbarchard@google.come214fe32012-06-04 23:47:11 +00001661void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001662 const uint8* u_buf,
1663 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001664 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001665 int width) {
1666 asm volatile (
1667 "sub %1,%2 \n"
1668 "pcmpeqb %%xmm5,%%xmm5 \n"
1669 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001670 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001671 "1: \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001672 YUV422TORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001673 "punpcklbw %%xmm1,%%xmm2 \n"
1674 "punpcklbw %%xmm5,%%xmm0 \n"
1675 "movdqa %%xmm2,%%xmm1 \n"
1676 "punpcklwd %%xmm0,%%xmm2 \n"
1677 "punpckhwd %%xmm0,%%xmm1 \n"
1678 "movdqu %%xmm2,(%3) \n"
1679 "movdqu %%xmm1,0x10(%3) \n"
1680 "lea 0x20(%3),%3 \n"
1681 "sub $0x8,%4 \n"
1682 "jg 1b \n"
1683 : "+r"(y_buf), // %0
1684 "+r"(u_buf), // %1
1685 "+r"(v_buf), // %2
fbarchard@google.come214fe32012-06-04 23:47:11 +00001686 "+r"(abgr_buf), // %3
fbarchard@google.com952a5072012-03-30 18:10:50 +00001687 "+rm"(width) // %4
1688 : "r"(&kYuvConstants.kUVToB) // %5
1689 : "memory", "cc"
1690#if defined(__SSE2__)
1691 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1692#endif
1693 );
1694}
1695
fbarchard@google.come214fe32012-06-04 23:47:11 +00001696#endif // HAS_I422TOARGBROW_SSSE3
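// A scalar sketch of the conversion the I4xxToARGBRow functions above
// implement, assuming the usual BT.601 studio-swing coefficients
// (1.164, 1.596, 0.813, 0.391, 2.018) in 8.8 fixed point. The exact
// constants used by the YUV422TORGB macro live in kYuvConstants earlier in
// this file; the *_C_Sketch names here are illustrative and not part of the
// library. ARGB is stored in memory as B, G, R, A.
static inline uint8 ClampToByte(int v) {
  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// I422: two pixels share each U/V sample. I444 gives every pixel its own
// sample and I411 shares one sample across four pixels; otherwise the
// arithmetic is identical.
static void I422ToARGBRow_C_Sketch(const uint8* y_buf, const uint8* u_buf,
                                   const uint8* v_buf, uint8* argb_buf,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    int y = (y_buf[x] - 16) * 298;   // 298/256 ~= 1.164
    int u = u_buf[x >> 1] - 128;
    int v = v_buf[x >> 1] - 128;
    argb_buf[x * 4 + 0] = ClampToByte((y + 516 * u + 128) >> 8);            // B
    argb_buf[x * 4 + 1] = ClampToByte((y - 100 * u - 208 * v + 128) >> 8);  // G
    argb_buf[x * 4 + 2] = ClampToByte((y + 409 * v + 128) >> 8);            // R
    argb_buf[x * 4 + 3] = 255;                                              // A
  }
}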
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001697
1698#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001699void YToARGBRow_SSE2(const uint8* y_buf,
1700 uint8* rgb_buf,
1701 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001702 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001703 "pcmpeqb %%xmm4,%%xmm4 \n"
1704 "pslld $0x18,%%xmm4 \n"
1705 "mov $0x10001000,%%eax \n"
1706 "movd %%eax,%%xmm3 \n"
1707 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1708 "mov $0x012a012a,%%eax \n"
1709 "movd %%eax,%%xmm2 \n"
1710 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001711 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001712 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001713 // Step 1: Scale Y contribution to 8 gray values. gray = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001714 "movq (%0),%%xmm0 \n"
1715 "lea 0x8(%0),%0 \n"
1716 "punpcklbw %%xmm0,%%xmm0 \n"
1717 "psubusw %%xmm3,%%xmm0 \n"
1718 "pmulhuw %%xmm2,%%xmm0 \n"
1719 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001720
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001721 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001722 "punpcklbw %%xmm0,%%xmm0 \n"
1723 "movdqa %%xmm0,%%xmm1 \n"
1724 "punpcklwd %%xmm0,%%xmm0 \n"
1725 "punpckhwd %%xmm1,%%xmm1 \n"
1726 "por %%xmm4,%%xmm0 \n"
1727 "por %%xmm4,%%xmm1 \n"
1728 "movdqa %%xmm0,(%1) \n"
1729 "movdqa %%xmm1,16(%1) \n"
1730 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001731
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001732 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001733 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00001734 : "+r"(y_buf), // %0
1735 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00001736 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001737 :
1738 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001739#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001740 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001741#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001742 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001743}
fbarchard@google.comb6149762011-11-07 21:58:52 +00001744#endif
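// A scalar sketch of the Y-only expansion above: gray = (y - 16) * 1.164,
// written to B, G and R with alpha forced to 255. The SSE2 code reaches
// approximately the same result 8 pixels at a time with psubusw/pmulhuw and
// the 0x1010/0x012a constants. Illustrative only; not the library's C path.
static void YToARGBRow_C_Sketch(const uint8* y_buf, uint8* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    int gray = ((y_buf[x] - 16) * 298) >> 8;  // 298/256 ~= 1.164
    if (gray < 0) gray = 0;
    if (gray > 255) gray = 255;
    rgb_buf[x * 4 + 0] = static_cast<uint8>(gray);  // B
    rgb_buf[x * 4 + 1] = static_cast<uint8>(gray);  // G
    rgb_buf[x * 4 + 2] = static_cast<uint8>(gray);  // R
    rgb_buf[x * 4 + 3] = 255;                       // A
  }
}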
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001745
fbarchard@google.com42831e02012-01-21 02:54:17 +00001746#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001747// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001748CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001749 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1750};
1751
fbarchard@google.com42831e02012-01-21 02:54:17 +00001752void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001753 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001754 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001755 "movdqa %3,%%xmm5 \n"
1756 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001757 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001758 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001759 "movdqa (%0,%2),%%xmm0 \n"
1760 "pshufb %%xmm5,%%xmm0 \n"
1761 "sub $0x10,%2 \n"
1762 "movdqa %%xmm0,(%1) \n"
1763 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001764 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00001765 : "+r"(src), // %0
1766 "+r"(dst), // %1
1767 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001768 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001769 : "memory", "cc"
1770#if defined(__SSE2__)
1771 , "xmm0", "xmm5"
1772#endif
1773 );
1774}
1775#endif
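// A scalar sketch of MirrorRow: the SSSE3 version reverses 16 bytes per
// iteration with pshufb and kShuffleMirror, and the SSE2 variant below does
// the same with shifts and word shuffles. Illustrative only.
static void MirrorRow_C_Sketch(const uint8* src, uint8* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}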
1776
fbarchard@google.com42831e02012-01-21 02:54:17 +00001777#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001778void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001779 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001780 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001781 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001782 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001783 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001784 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001785 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001786 "psllw $0x8,%%xmm0 \n"
1787 "psrlw $0x8,%%xmm1 \n"
1788 "por %%xmm1,%%xmm0 \n"
1789 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
1790 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
1791 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
1792 "sub $0x10,%2 \n"
1793 "movdqu %%xmm0,(%1) \n"
1794 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001795 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001796 : "+r"(src), // %0
1797 "+r"(dst), // %1
1798 "+r"(temp_width) // %2
1799 :
1800 : "memory", "cc"
1801#if defined(__SSE2__)
1802 , "xmm0", "xmm1"
1803#endif
1804 );
1805}
1806#endif
1807
fbarchard@google.com16a96642012-03-02 22:38:09 +00001808#ifdef HAS_MIRRORROW_UV_SSSE3
1809// Shuffle table for reversing the bytes of UV channels.
1810CONST uvec8 kShuffleMirrorUV = {
1811 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
1812};
1813void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
1814 int width) {
1815 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001816 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00001817 "movdqa %4,%%xmm1 \n"
1818 "lea -16(%0,%3,2),%0 \n"
1819 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001820 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001821 "1: \n"
1822 "movdqa (%0),%%xmm0 \n"
1823 "lea -16(%0),%0 \n"
1824 "pshufb %%xmm1,%%xmm0 \n"
1825 "sub $8,%3 \n"
1826 "movlpd %%xmm0,(%1) \n"
1827 "movhpd %%xmm0,(%1,%2) \n"
1828 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001829 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001830 : "+r"(src), // %0
1831 "+r"(dst_u), // %1
1832 "+r"(dst_v), // %2
1833 "+r"(temp_width) // %3
1834 : "m"(kShuffleMirrorUV) // %4
1835 : "memory", "cc"
1836#if defined(__SSE2__)
1837 , "xmm0", "xmm1"
1838#endif
1839 );
1840}
1841#endif
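// A scalar sketch of MirrorRowUV: the source row is interleaved U,V pairs;
// it is reversed pair-wise and split into separate U and V planes, which is
// what kShuffleMirrorUV plus movlpd/movhpd do 8 pairs at a time. width
// counts UV pairs. Illustrative only.
static void MirrorRowUV_C_Sketch(const uint8* src, uint8* dst_u, uint8* dst_v,
                                 int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src[(width - 1 - x) * 2 + 0];
    dst_v[x] = src[(width - 1 - x) * 2 + 1];
  }
}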
1842
fbarchard@google.com55663022012-04-26 00:01:41 +00001843#ifdef HAS_ADDROW_SSE2
 1844// dst must be aligned to 16 bytes and width must be a multiple of 16.
1845void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
1846 asm volatile (
1847 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001848 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001849 "1: \n"
1850 "movdqu (%0),%%xmm2 \n"
1851 "lea 0x10(%0),%0 \n"
1852 "movdqa (%1),%%xmm0 \n"
1853 "movdqa 0x10(%1),%%xmm1 \n"
1854 "movdqa %%xmm2,%%xmm3 \n"
1855 "punpcklbw %%xmm4,%%xmm2 \n"
1856 "punpckhbw %%xmm4,%%xmm3 \n"
1857 "paddusw %%xmm2,%%xmm0 \n"
1858 "paddusw %%xmm3,%%xmm1 \n"
1859 "sub $0x10,%2 \n"
1860 "movdqa %%xmm0,(%1) \n"
1861 "movdqa %%xmm1,0x10(%1) \n"
1862 "lea 0x20(%1),%1 \n"
1863 "jg 1b \n"
1864 : "+r"(src), // %0
1865 "+r"(dst), // %1
1866 "+r"(width) // %2
1867 :
1868 : "memory", "cc"
1869#if defined(__SSE2__)
1870 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1871#endif
1872 );
1873}
1874
 1875// dst must be aligned to 16 bytes and width must be a multiple of 16.
1876void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
1877 asm volatile (
1878 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001879 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001880 "1: \n"
1881 "movdqu (%0),%%xmm2 \n"
1882 "lea 0x10(%0),%0 \n"
1883 "movdqa (%1),%%xmm0 \n"
1884 "movdqa 0x10(%1),%%xmm1 \n"
1885 "movdqa %%xmm2,%%xmm3 \n"
1886 "punpcklbw %%xmm4,%%xmm2 \n"
1887 "punpckhbw %%xmm4,%%xmm3 \n"
1888 "psubusw %%xmm2,%%xmm0 \n"
1889 "psubusw %%xmm3,%%xmm1 \n"
1890 "sub $0x10,%2 \n"
1891 "movdqa %%xmm0,(%1) \n"
1892 "movdqa %%xmm1,0x10(%1) \n"
1893 "lea 0x20(%1),%1 \n"
1894 "jg 1b \n"
1895 : "+r"(src), // %0
1896 "+r"(dst), // %1
1897 "+r"(width) // %2
1898 :
1899 : "memory", "cc"
1900#if defined(__SSE2__)
1901 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1902#endif
1903 );
1904}
1905#endif // HAS_ADDROW_SSE2
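// A scalar sketch of AddRow/SubRow: each 8-bit source sample is added to
// (or subtracted from) a 16-bit accumulator row with unsigned saturation,
// which is what punpck[lh]bw plus paddusw/psubusw do 16 samples at a time.
// Illustrative only.
static void AddRow_C_Sketch(const uint8* src, uint16* dst, int width) {
  for (int x = 0; x < width; ++x) {
    uint32 sum = static_cast<uint32>(dst[x]) + src[x];
    dst[x] = static_cast<uint16>(sum > 65535 ? 65535 : sum);
  }
}

static void SubRow_C_Sketch(const uint8* src, uint16* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = static_cast<uint16>(dst[x] > src[x] ? dst[x] - src[x] : 0);
  }
}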
1906
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001907#ifdef HAS_SPLITUV_SSE2
1908void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001909 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001910 "pcmpeqb %%xmm5,%%xmm5 \n"
1911 "psrlw $0x8,%%xmm5 \n"
1912 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001913 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001914 "1: \n"
1915 "movdqa (%0),%%xmm0 \n"
1916 "movdqa 0x10(%0),%%xmm1 \n"
1917 "lea 0x20(%0),%0 \n"
1918 "movdqa %%xmm0,%%xmm2 \n"
1919 "movdqa %%xmm1,%%xmm3 \n"
1920 "pand %%xmm5,%%xmm0 \n"
1921 "pand %%xmm5,%%xmm1 \n"
1922 "packuswb %%xmm1,%%xmm0 \n"
1923 "psrlw $0x8,%%xmm2 \n"
1924 "psrlw $0x8,%%xmm3 \n"
1925 "packuswb %%xmm3,%%xmm2 \n"
1926 "movdqa %%xmm0,(%1) \n"
1927 "movdqa %%xmm2,(%1,%2) \n"
1928 "lea 0x10(%1),%1 \n"
1929 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001930 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001931 : "+r"(src_uv), // %0
1932 "+r"(dst_u), // %1
1933 "+r"(dst_v), // %2
1934 "+r"(pix) // %3
1935 :
1936 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001937#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001938 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001939#endif
1940 );
1941}
1942#endif
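// A scalar sketch of SplitUV: deinterleave a packed UV plane, sending the
// even bytes to dst_u and the odd bytes to dst_v. The SSE2 version does this
// 16 pairs at a time by masking with 0x00FF and shifting right by 8.
// pix counts UV pairs. Illustrative only.
static void SplitUV_C_Sketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];
    dst_v[x] = src_uv[x * 2 + 1];
  }
}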
1943
fbarchard@google.com19932f82012-02-16 22:19:14 +00001944#ifdef HAS_COPYROW_SSE2
1945void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001946 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001947 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00001948 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001949 "1: \n"
1950 "movdqa (%0),%%xmm0 \n"
1951 "movdqa 0x10(%0),%%xmm1 \n"
1952 "movdqa %%xmm0,(%0,%1) \n"
1953 "movdqa %%xmm1,0x10(%0,%1) \n"
1954 "lea 0x20(%0),%0 \n"
1955 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001956 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001957 : "+r"(src), // %0
1958 "+r"(dst), // %1
1959 "+r"(count) // %2
1960 :
1961 : "memory", "cc"
1962#if defined(__SSE2__)
1963 , "xmm0", "xmm1"
1964#endif
1965 );
1966}
1967#endif // HAS_COPYROW_SSE2
1968
1969#ifdef HAS_COPYROW_X86
1970void CopyRow_X86(const uint8* src, uint8* dst, int width) {
1971 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001972 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00001973 "shr $0x2,%2 \n"
1974 "rep movsl \n"
1975 : "+S"(src), // %0
1976 "+D"(dst), // %1
1977 "+c"(width_tmp) // %2
1978 :
1979 : "memory", "cc"
1980 );
1981}
1982#endif
1983
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001984#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001985void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001986 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001987 "pcmpeqb %%xmm5,%%xmm5 \n"
1988 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001989 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001990 "1: \n"
1991 "movdqa (%0),%%xmm0 \n"
1992 "movdqa 0x10(%0),%%xmm1 \n"
1993 "lea 0x20(%0),%0 \n"
1994 "pand %%xmm5,%%xmm0 \n"
1995 "pand %%xmm5,%%xmm1 \n"
1996 "packuswb %%xmm1,%%xmm0 \n"
1997 "movdqa %%xmm0,(%1) \n"
1998 "lea 0x10(%1),%1 \n"
1999 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002000 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002001 : "+r"(src_yuy2), // %0
2002 "+r"(dst_y), // %1
2003 "+r"(pix) // %2
2004 :
2005 : "memory", "cc"
2006#if defined(__SSE2__)
2007 , "xmm0", "xmm1", "xmm5"
2008#endif
2009 );
2010}
2011
2012void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2013 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002014 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002015 "pcmpeqb %%xmm5,%%xmm5 \n"
2016 "psrlw $0x8,%%xmm5 \n"
2017 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002018 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002019 "1: \n"
2020 "movdqa (%0),%%xmm0 \n"
2021 "movdqa 0x10(%0),%%xmm1 \n"
2022 "movdqa (%0,%4,1),%%xmm2 \n"
2023 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2024 "lea 0x20(%0),%0 \n"
2025 "pavgb %%xmm2,%%xmm0 \n"
2026 "pavgb %%xmm3,%%xmm1 \n"
2027 "psrlw $0x8,%%xmm0 \n"
2028 "psrlw $0x8,%%xmm1 \n"
2029 "packuswb %%xmm1,%%xmm0 \n"
2030 "movdqa %%xmm0,%%xmm1 \n"
2031 "pand %%xmm5,%%xmm0 \n"
2032 "packuswb %%xmm0,%%xmm0 \n"
2033 "psrlw $0x8,%%xmm1 \n"
2034 "packuswb %%xmm1,%%xmm1 \n"
2035 "movq %%xmm0,(%1) \n"
2036 "movq %%xmm1,(%1,%2) \n"
2037 "lea 0x8(%1),%1 \n"
2038 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002039 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002040 : "+r"(src_yuy2), // %0
2041 "+r"(dst_u), // %1
2042 "+r"(dst_y), // %2
2043 "+r"(pix) // %3
2044 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2045 : "memory", "cc"
2046#if defined(__SSE2__)
2047 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2048#endif
2049 );
2050}
2051
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002052
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002053void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2054 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002055 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002056 "pcmpeqb %%xmm5,%%xmm5 \n"
2057 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002058 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002059 "1: \n"
2060 "movdqu (%0),%%xmm0 \n"
2061 "movdqu 0x10(%0),%%xmm1 \n"
2062 "lea 0x20(%0),%0 \n"
2063 "pand %%xmm5,%%xmm0 \n"
2064 "pand %%xmm5,%%xmm1 \n"
2065 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002066 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002067 "movdqu %%xmm0,(%1) \n"
2068 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002069 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002070 : "+r"(src_yuy2), // %0
2071 "+r"(dst_y), // %1
2072 "+r"(pix) // %2
2073 :
2074 : "memory", "cc"
2075#if defined(__SSE2__)
2076 , "xmm0", "xmm1", "xmm5"
2077#endif
2078 );
2079}
2080
2081void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2082 int stride_yuy2,
2083 uint8* dst_u, uint8* dst_y,
2084 int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002085 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002086 "pcmpeqb %%xmm5,%%xmm5 \n"
2087 "psrlw $0x8,%%xmm5 \n"
2088 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002089 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002090 "1: \n"
2091 "movdqu (%0),%%xmm0 \n"
2092 "movdqu 0x10(%0),%%xmm1 \n"
2093 "movdqu (%0,%4,1),%%xmm2 \n"
2094 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2095 "lea 0x20(%0),%0 \n"
2096 "pavgb %%xmm2,%%xmm0 \n"
2097 "pavgb %%xmm3,%%xmm1 \n"
2098 "psrlw $0x8,%%xmm0 \n"
2099 "psrlw $0x8,%%xmm1 \n"
2100 "packuswb %%xmm1,%%xmm0 \n"
2101 "movdqa %%xmm0,%%xmm1 \n"
2102 "pand %%xmm5,%%xmm0 \n"
2103 "packuswb %%xmm0,%%xmm0 \n"
2104 "psrlw $0x8,%%xmm1 \n"
2105 "packuswb %%xmm1,%%xmm1 \n"
2106 "movq %%xmm0,(%1) \n"
2107 "movq %%xmm1,(%1,%2) \n"
2108 "lea 0x8(%1),%1 \n"
2109 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002110 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002111 : "+r"(src_yuy2), // %0
2112 "+r"(dst_u), // %1
2113 "+r"(dst_y), // %2
2114 "+r"(pix) // %3
2115 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2116 : "memory", "cc"
2117#if defined(__SSE2__)
2118 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2119#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002120 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002121}
2122
2123void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002124 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002125 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002126 "1: \n"
2127 "movdqa (%0),%%xmm0 \n"
2128 "movdqa 0x10(%0),%%xmm1 \n"
2129 "lea 0x20(%0),%0 \n"
2130 "psrlw $0x8,%%xmm0 \n"
2131 "psrlw $0x8,%%xmm1 \n"
2132 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002133 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002134 "movdqa %%xmm0,(%1) \n"
2135 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002136 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002137 : "+r"(src_uyvy), // %0
2138 "+r"(dst_y), // %1
2139 "+r"(pix) // %2
2140 :
2141 : "memory", "cc"
2142#if defined(__SSE2__)
2143 , "xmm0", "xmm1"
2144#endif
2145 );
2146}
2147
2148void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2149 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002150 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002151 "pcmpeqb %%xmm5,%%xmm5 \n"
2152 "psrlw $0x8,%%xmm5 \n"
2153 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002154 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002155 "1: \n"
2156 "movdqa (%0),%%xmm0 \n"
2157 "movdqa 0x10(%0),%%xmm1 \n"
2158 "movdqa (%0,%4,1),%%xmm2 \n"
2159 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2160 "lea 0x20(%0),%0 \n"
2161 "pavgb %%xmm2,%%xmm0 \n"
2162 "pavgb %%xmm3,%%xmm1 \n"
2163 "pand %%xmm5,%%xmm0 \n"
2164 "pand %%xmm5,%%xmm1 \n"
2165 "packuswb %%xmm1,%%xmm0 \n"
2166 "movdqa %%xmm0,%%xmm1 \n"
2167 "pand %%xmm5,%%xmm0 \n"
2168 "packuswb %%xmm0,%%xmm0 \n"
2169 "psrlw $0x8,%%xmm1 \n"
2170 "packuswb %%xmm1,%%xmm1 \n"
2171 "movq %%xmm0,(%1) \n"
2172 "movq %%xmm1,(%1,%2) \n"
2173 "lea 0x8(%1),%1 \n"
2174 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002175 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002176 : "+r"(src_uyvy), // %0
2177 "+r"(dst_u), // %1
2178 "+r"(dst_y), // %2
2179 "+r"(pix) // %3
2180 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2181 : "memory", "cc"
2182#if defined(__SSE2__)
2183 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2184#endif
2185 );
2186}
2187
2188void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2189 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002190 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002191 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002192 "1: \n"
2193 "movdqu (%0),%%xmm0 \n"
2194 "movdqu 0x10(%0),%%xmm1 \n"
2195 "lea 0x20(%0),%0 \n"
2196 "psrlw $0x8,%%xmm0 \n"
2197 "psrlw $0x8,%%xmm1 \n"
2198 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002199 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002200 "movdqu %%xmm0,(%1) \n"
2201 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002202 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002203 : "+r"(src_uyvy), // %0
2204 "+r"(dst_y), // %1
2205 "+r"(pix) // %2
2206 :
2207 : "memory", "cc"
2208#if defined(__SSE2__)
2209 , "xmm0", "xmm1"
2210#endif
2211 );
2212}
2213
2214void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2215 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002216 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002217 "pcmpeqb %%xmm5,%%xmm5 \n"
2218 "psrlw $0x8,%%xmm5 \n"
2219 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002220 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002221 "1: \n"
2222 "movdqu (%0),%%xmm0 \n"
2223 "movdqu 0x10(%0),%%xmm1 \n"
2224 "movdqu (%0,%4,1),%%xmm2 \n"
2225 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2226 "lea 0x20(%0),%0 \n"
2227 "pavgb %%xmm2,%%xmm0 \n"
2228 "pavgb %%xmm3,%%xmm1 \n"
2229 "pand %%xmm5,%%xmm0 \n"
2230 "pand %%xmm5,%%xmm1 \n"
2231 "packuswb %%xmm1,%%xmm0 \n"
2232 "movdqa %%xmm0,%%xmm1 \n"
2233 "pand %%xmm5,%%xmm0 \n"
2234 "packuswb %%xmm0,%%xmm0 \n"
2235 "psrlw $0x8,%%xmm1 \n"
2236 "packuswb %%xmm1,%%xmm1 \n"
2237 "movq %%xmm0,(%1) \n"
2238 "movq %%xmm1,(%1,%2) \n"
2239 "lea 0x8(%1),%1 \n"
2240 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002241 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002242 : "+r"(src_uyvy), // %0
2243 "+r"(dst_u), // %1
2244 "+r"(dst_y), // %2
2245 "+r"(pix) // %3
2246 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2247 : "memory", "cc"
2248#if defined(__SSE2__)
2249 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2250#endif
2251 );
2252}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002253#endif // HAS_YUY2TOYROW_SSE2
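// A scalar sketch of the packed-YUV rows above. YUY2 stores pixels as
// Y0 U Y1 V (one U/V pair shared by two pixels); UYVY is the same data with
// chroma first, so its Y extraction keeps the odd bytes instead. The UV row
// functions average the chroma of two source rows with pavgb (which rounds),
// and their second output pointer, although named dst_y, receives the V
// plane. The names below are illustrative sketches, not library code.
static void YUY2ToYRow_C_Sketch(const uint8* src_yuy2, uint8* dst_y, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_y[x] = src_yuy2[x * 2];  // Y sits at the even offsets in YUY2
  }
}

static void YUY2ToUVRow_C_Sketch(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* next = src_yuy2 + stride_yuy2;  // the row being averaged in
  for (int x = 0; x < pix; x += 2) {
    int u = src_yuy2[x * 2 + 1] + next[x * 2 + 1] + 1;  // U of this pixel pair
    int v = src_yuy2[x * 2 + 3] + next[x * 2 + 3] + 1;  // V of this pixel pair
    dst_u[x / 2] = static_cast<uint8>(u >> 1);
    dst_v[x / 2] = static_cast<uint8>(v >> 1);
  }
}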
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002254
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002255#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002256// Blend 8 pixels at a time.
 2257// src_argb0 and src_argb1 may be unaligned (read with movdqu).
 2258// dst_argb must be aligned to 16 bytes.
 2259// width must be a multiple of 4 pixels.
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002260void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.comc757f302012-04-03 00:49:16 +00002261 uint8* dst_argb, int width) {
2262 asm volatile (
2263 "pcmpeqb %%xmm7,%%xmm7 \n"
2264 "psrlw $0xf,%%xmm7 \n"
2265 "pcmpeqb %%xmm6,%%xmm6 \n"
2266 "psrlw $0x8,%%xmm6 \n"
2267 "pcmpeqb %%xmm5,%%xmm5 \n"
2268 "psllw $0x8,%%xmm5 \n"
2269 "pcmpeqb %%xmm4,%%xmm4 \n"
2270 "pslld $0x18,%%xmm4 \n"
2271
2272 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002273 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002274 "1: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002275 "movdqu (%0),%%xmm3 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002276 "movdqa %%xmm3,%%xmm0 \n"
2277 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002278 "movdqu (%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002279 "psrlw $0x8,%%xmm3 \n"
2280 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2281 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2282 "pand %%xmm6,%%xmm2 \n"
2283 "paddw %%xmm7,%%xmm3 \n"
2284 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002285 "movdqu (%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002286 "psrlw $0x8,%%xmm1 \n"
2287 "por %%xmm4,%%xmm0 \n"
2288 "pmullw %%xmm3,%%xmm1 \n"
2289 "movdqu 0x10(%0),%%xmm3 \n"
2290 "lea 0x20(%0),%0 \n"
2291 "psrlw $0x8,%%xmm2 \n"
2292 "paddusb %%xmm2,%%xmm0 \n"
2293 "pand %%xmm5,%%xmm1 \n"
2294 "paddusb %%xmm1,%%xmm0 \n"
2295 "sub $0x4,%3 \n"
2296 "movdqa %%xmm0,(%2) \n"
2297 "jle 9f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002298 "movdqa %%xmm3,%%xmm0 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002299 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002300 "movdqu 0x10(%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002301 "psrlw $0x8,%%xmm3 \n"
2302 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2303 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2304 "pand %%xmm6,%%xmm2 \n"
2305 "paddw %%xmm7,%%xmm3 \n"
2306 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002307 "movdqu 0x10(%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002308 "lea 0x20(%1),%1 \n"
2309 "psrlw $0x8,%%xmm1 \n"
2310 "por %%xmm4,%%xmm0 \n"
2311 "pmullw %%xmm3,%%xmm1 \n"
2312 "psrlw $0x8,%%xmm2 \n"
2313 "paddusb %%xmm2,%%xmm0 \n"
2314 "pand %%xmm5,%%xmm1 \n"
2315 "paddusb %%xmm1,%%xmm0 \n"
2316 "sub $0x4,%3 \n"
2317 "movdqa %%xmm0,0x10(%2) \n"
2318 "lea 0x20(%2),%2 \n"
2319 "jg 1b \n"
2320 "9: \n"
2321 : "+r"(src_argb0), // %0
2322 "+r"(src_argb1), // %1
2323 "+r"(dst_argb), // %2
2324 "+r"(width) // %3
2325 :
2326 : "memory", "cc"
2327#if defined(__SSE2__)
2328 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2329#endif
2330 );
2331}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002332#endif // HAS_ARGBBLENDROW_SSE2
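// A scalar sketch of the blend above. From the instruction sequence the
// foreground (src_argb0) is treated as already attenuated (premultiplied):
// each color channel becomes fg + bg * (256 - fg_alpha) / 256 with
// saturation, and the result alpha is forced to 255. ARGBBlendRow1_SSE2
// below applies the same math one pixel at a time. Illustrative only.
static void ARGBBlendRow_C_Sketch(const uint8* src_argb0,
                                  const uint8* src_argb1,
                                  uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    const uint8* fg = src_argb0 + x * 4;
    const uint8* bg = src_argb1 + x * 4;
    int a = fg[3];
    for (int c = 0; c < 3; ++c) {  // B, G, R
      int v = fg[c] + ((bg[c] * (256 - a)) >> 8);
      dst_argb[x * 4 + c] = static_cast<uint8>(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = 255;  // result is fully opaque
  }
}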
fbarchard@google.comc757f302012-04-03 00:49:16 +00002333
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002334#ifdef HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002335// Blend 1 pixel at a time, unaligned
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002336void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com96af8702012-04-06 18:22:27 +00002337 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002338 asm volatile (
2339 "pcmpeqb %%xmm7,%%xmm7 \n"
2340 "psrlw $0xf,%%xmm7 \n"
2341 "pcmpeqb %%xmm6,%%xmm6 \n"
2342 "psrlw $0x8,%%xmm6 \n"
2343 "pcmpeqb %%xmm5,%%xmm5 \n"
2344 "psllw $0x8,%%xmm5 \n"
2345 "pcmpeqb %%xmm4,%%xmm4 \n"
2346 "pslld $0x18,%%xmm4 \n"
2347
2348 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002349 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002350 "1: \n"
2351 "movd (%0),%%xmm3 \n"
2352 "lea 0x4(%0),%0 \n"
2353 "movdqa %%xmm3,%%xmm0 \n"
2354 "pxor %%xmm4,%%xmm3 \n"
2355 "movd (%1),%%xmm2 \n"
2356 "psrlw $0x8,%%xmm3 \n"
2357 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2358 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2359 "pand %%xmm6,%%xmm2 \n"
2360 "paddw %%xmm7,%%xmm3 \n"
2361 "pmullw %%xmm3,%%xmm2 \n"
2362 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002363 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002364 "psrlw $0x8,%%xmm1 \n"
2365 "por %%xmm4,%%xmm0 \n"
2366 "pmullw %%xmm3,%%xmm1 \n"
2367 "psrlw $0x8,%%xmm2 \n"
2368 "paddusb %%xmm2,%%xmm0 \n"
2369 "pand %%xmm5,%%xmm1 \n"
2370 "paddusb %%xmm1,%%xmm0 \n"
2371 "sub $0x1,%3 \n"
2372 "movd %%xmm0,(%2) \n"
2373 "lea 0x4(%2),%2 \n"
2374 "jg 1b \n"
2375 : "+r"(src_argb0), // %0
2376 "+r"(src_argb1), // %1
2377 "+r"(dst_argb), // %2
2378 "+r"(width) // %3
2379 :
2380 : "memory", "cc"
2381#if defined(__SSE2__)
2382 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2383#endif
2384 );
2385}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002386#endif // HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002387
fbarchard@google.com96af8702012-04-06 18:22:27 +00002388#ifdef HAS_ARGBBLENDROW_SSSE3
 2389// Shuffle table for duplicating each pixel's alpha byte into both 16-bit lanes (0x80 zeroes the high bytes).
2390CONST uvec8 kShuffleAlpha = {
2391 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2392 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2393};
2394void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002395 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002396 asm volatile (
2397 "pcmpeqb %%xmm7,%%xmm7 \n"
2398 "psrlw $0xf,%%xmm7 \n"
2399 "pcmpeqb %%xmm6,%%xmm6 \n"
2400 "psrlw $0x8,%%xmm6 \n"
2401 "pcmpeqb %%xmm5,%%xmm5 \n"
2402 "psllw $0x8,%%xmm5 \n"
2403 "pcmpeqb %%xmm4,%%xmm4 \n"
2404 "pslld $0x18,%%xmm4 \n"
2405
2406 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002407 ".p2align 4 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002408 "1: \n"
2409 "movdqu (%0),%%xmm3 \n"
2410 "movdqa %%xmm3,%%xmm0 \n"
2411 "pxor %%xmm4,%%xmm3 \n"
2412 "pshufb %4,%%xmm3 \n"
2413 "movdqu (%1),%%xmm2 \n"
2414 "pand %%xmm6,%%xmm2 \n"
2415 "paddw %%xmm7,%%xmm3 \n"
2416 "pmullw %%xmm3,%%xmm2 \n"
2417 "movdqu (%1),%%xmm1 \n"
2418 "psrlw $0x8,%%xmm1 \n"
2419 "por %%xmm4,%%xmm0 \n"
2420 "pmullw %%xmm3,%%xmm1 \n"
2421 "movdqu 0x10(%0),%%xmm3 \n"
2422 "lea 0x20(%0),%0 \n"
2423 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002424 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002425 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002426 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002427 "sub $0x4,%3 \n"
2428 "movdqa %%xmm0,(%2) \n"
2429 "jle 9f \n"
2430 "movdqa %%xmm3,%%xmm0 \n"
2431 "pxor %%xmm4,%%xmm3 \n"
2432 "movdqu 0x10(%1),%%xmm2 \n"
2433 "pshufb %4,%%xmm3 \n"
2434 "pand %%xmm6,%%xmm2 \n"
2435 "paddw %%xmm7,%%xmm3 \n"
2436 "pmullw %%xmm3,%%xmm2 \n"
2437 "movdqu 0x10(%1),%%xmm1 \n"
2438 "lea 0x20(%1),%1 \n"
2439 "psrlw $0x8,%%xmm1 \n"
2440 "por %%xmm4,%%xmm0 \n"
2441 "pmullw %%xmm3,%%xmm1 \n"
2442 "psrlw $0x8,%%xmm2 \n"
2443 "paddusb %%xmm2,%%xmm0 \n"
2444 "pand %%xmm5,%%xmm1 \n"
2445 "paddusb %%xmm1,%%xmm0 \n"
2446 "sub $0x4,%3 \n"
2447 "movdqa %%xmm0,0x10(%2) \n"
2448 "lea 0x20(%2),%2 \n"
2449 "jg 1b \n"
2450 "9: \n"
2451 : "+r"(src_argb0), // %0
2452 "+r"(src_argb1), // %1
2453 "+r"(dst_argb), // %2
2454 "+r"(width) // %3
2455 : "m"(kShuffleAlpha) // %4
2456 : "memory", "cc"
2457#if defined(__SSE2__)
2458 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2459#endif
2460 );
2461}
2462#endif // HAS_ARGBBLENDROW_SSSE3
2463
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002464
2465#ifdef HAS_ARGBBLENDROW1_SSSE3
2466// Blend 1 pixel at a time, unaligned
2467void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2468 uint8* dst_argb, int width) {
2469 asm volatile (
2470 "pcmpeqb %%xmm7,%%xmm7 \n"
2471 "psrlw $0xf,%%xmm7 \n"
2472 "pcmpeqb %%xmm6,%%xmm6 \n"
2473 "psrlw $0x8,%%xmm6 \n"
2474 "pcmpeqb %%xmm5,%%xmm5 \n"
2475 "psllw $0x8,%%xmm5 \n"
2476 "pcmpeqb %%xmm4,%%xmm4 \n"
2477 "pslld $0x18,%%xmm4 \n"
2478
2479 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002480 ".p2align 4 \n"
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002481 "1: \n"
2482 "movd (%0),%%xmm3 \n"
2483 "lea 0x4(%0),%0 \n"
2484 "movdqa %%xmm3,%%xmm0 \n"
2485 "pxor %%xmm4,%%xmm3 \n"
2486 "movd (%1),%%xmm2 \n"
2487 "pshufb %4,%%xmm3 \n"
2488 "pand %%xmm6,%%xmm2 \n"
2489 "paddw %%xmm7,%%xmm3 \n"
2490 "pmullw %%xmm3,%%xmm2 \n"
2491 "movd (%1),%%xmm1 \n"
2492 "lea 0x4(%1),%1 \n"
2493 "psrlw $0x8,%%xmm1 \n"
2494 "por %%xmm4,%%xmm0 \n"
2495 "pmullw %%xmm3,%%xmm1 \n"
2496 "psrlw $0x8,%%xmm2 \n"
2497 "paddusb %%xmm2,%%xmm0 \n"
2498 "pand %%xmm5,%%xmm1 \n"
2499 "paddusb %%xmm1,%%xmm0 \n"
2500 "sub $0x1,%3 \n"
2501 "movd %%xmm0,(%2) \n"
2502 "lea 0x4(%2),%2 \n"
2503 "jg 1b \n"
2504 : "+r"(src_argb0), // %0
2505 "+r"(src_argb1), // %1
2506 "+r"(dst_argb), // %2
2507 "+r"(width) // %3
2508 : "m"(kShuffleAlpha) // %4
2509 : "memory", "cc"
2510#if defined(__SSE2__)
2511 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2512#endif
2513 );
2514}
2515#endif // HAS_ARGBBLENDROW1_SSSE3
2516
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002517#ifdef HAS_ARGBATTENUATE_SSE2
2518// Attenuate 4 pixels at a time.
 2519// src_argb and dst_argb must be aligned to 16 bytes.
2520void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2521 asm volatile (
2522 "sub %0,%1 \n"
2523 "pcmpeqb %%xmm4,%%xmm4 \n"
2524 "pslld $0x18,%%xmm4 \n"
2525 "pcmpeqb %%xmm5,%%xmm5 \n"
2526 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002527
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002528 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002529 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002530 "1: \n"
2531 "movdqa (%0),%%xmm0 \n"
2532 "punpcklbw %%xmm0,%%xmm0 \n"
2533 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2534 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2535 "pmulhuw %%xmm2,%%xmm0 \n"
2536 "movdqa (%0),%%xmm1 \n"
2537 "punpckhbw %%xmm1,%%xmm1 \n"
2538 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2539 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2540 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002541 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002542 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002543 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002544 "psrlw $0x8,%%xmm1 \n"
2545 "packuswb %%xmm1,%%xmm0 \n"
2546 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002547 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002548 "sub $0x4,%2 \n"
2549 "movdqa %%xmm0,(%0,%1,1) \n"
2550 "lea 0x10(%0),%0 \n"
2551 "jg 1b \n"
2552 : "+r"(src_argb), // %0
2553 "+r"(dst_argb), // %1
2554 "+r"(width) // %2
2555 :
2556 : "memory", "cc"
2557#if defined(__SSE2__)
2558 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2559#endif
2560 );
2561}
2562#endif // HAS_ARGBATTENUATE_SSE2
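// A scalar sketch of attenuation: each color channel is multiplied by the
// pixel's alpha while alpha itself is preserved. The SSE2 and SSSE3 versions
// approximate c * a / 255 with a 16-bit multiply-high on byte-duplicated
// values, so their rounding differs slightly from this sketch. Illustrative
// only.
static void ARGBAttenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  for (int x = 0; x < width; ++x) {
    const uint8* p = src_argb + x * 4;
    int a = p[3];
    dst_argb[x * 4 + 0] = static_cast<uint8>(p[0] * a / 255);  // B
    dst_argb[x * 4 + 1] = static_cast<uint8>(p[1] * a / 255);  // G
    dst_argb[x * 4 + 2] = static_cast<uint8>(p[2] * a / 255);  // R
    dst_argb[x * 4 + 3] = static_cast<uint8>(a);               // A unchanged
  }
}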
2563
fbarchard@google.com810cd912012-04-20 20:15:27 +00002564#ifdef HAS_ARGBATTENUATE_SSSE3
2565// Shuffle table duplicating alpha
2566CONST uvec8 kShuffleAlpha0 = {
2567 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2568};
2569CONST uvec8 kShuffleAlpha1 = {
2570 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2571 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2572};
2573// Attenuate 4 pixels at a time.
 2574// src_argb and dst_argb must be aligned to 16 bytes.
2575void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2576 asm volatile (
2577 "sub %0,%1 \n"
2578 "pcmpeqb %%xmm3,%%xmm3 \n"
2579 "pslld $0x18,%%xmm3 \n"
2580 "movdqa %3,%%xmm4 \n"
2581 "movdqa %4,%%xmm5 \n"
2582
2583 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002584 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002585 "1: \n"
2586 "movdqa (%0),%%xmm0 \n"
2587 "pshufb %%xmm4,%%xmm0 \n"
2588 "movdqa (%0),%%xmm1 \n"
2589 "punpcklbw %%xmm1,%%xmm1 \n"
2590 "pmulhuw %%xmm1,%%xmm0 \n"
2591 "movdqa (%0),%%xmm1 \n"
2592 "pshufb %%xmm5,%%xmm1 \n"
2593 "movdqa (%0),%%xmm2 \n"
2594 "punpckhbw %%xmm2,%%xmm2 \n"
2595 "pmulhuw %%xmm2,%%xmm1 \n"
2596 "movdqa (%0),%%xmm2 \n"
2597 "pand %%xmm3,%%xmm2 \n"
2598 "psrlw $0x8,%%xmm0 \n"
2599 "psrlw $0x8,%%xmm1 \n"
2600 "packuswb %%xmm1,%%xmm0 \n"
2601 "por %%xmm2,%%xmm0 \n"
2602 "sub $0x4,%2 \n"
2603 "movdqa %%xmm0,(%0,%1,1) \n"
2604 "lea 0x10(%0),%0 \n"
2605 "jg 1b \n"
2606 : "+r"(src_argb), // %0
2607 "+r"(dst_argb), // %1
2608 "+r"(width) // %2
2609 : "m"(kShuffleAlpha0), // %3
2610 "m"(kShuffleAlpha1) // %4
2611 : "memory", "cc"
2612#if defined(__SSE2__)
2613 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2614#endif
2615 );
2616}
2617#endif // HAS_ARGBATTENUATE_SSSE3
2618
2619#ifdef HAS_ARGBUNATTENUATE_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00002620// Unattenuate 4 pixels at a time.
 2621// src_argb and dst_argb must be aligned to 16 bytes.
2622void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2623 int width) {
2624 uintptr_t alpha = 0;
2625 asm volatile (
2626 "sub %0,%1 \n"
2627 "pcmpeqb %%xmm4,%%xmm4 \n"
2628 "pslld $0x18,%%xmm4 \n"
2629
2630 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002631 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002632 "1: \n"
2633 "movdqa (%0),%%xmm0 \n"
2634 "movzb 0x3(%0),%3 \n"
2635 "punpcklbw %%xmm0,%%xmm0 \n"
2636 "movd 0x0(%4,%3,4),%%xmm2 \n"
2637 "movzb 0x7(%0),%3 \n"
2638 "movd 0x0(%4,%3,4),%%xmm3 \n"
2639 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2640 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2641 "movlhps %%xmm3,%%xmm2 \n"
2642 "pmulhuw %%xmm2,%%xmm0 \n"
2643 "movdqa (%0),%%xmm1 \n"
2644 "movzb 0xb(%0),%3 \n"
2645 "punpckhbw %%xmm1,%%xmm1 \n"
2646 "movd 0x0(%4,%3,4),%%xmm2 \n"
2647 "movzb 0xf(%0),%3 \n"
2648 "movd 0x0(%4,%3,4),%%xmm3 \n"
2649 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2650 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2651 "movlhps %%xmm3,%%xmm2 \n"
2652 "pmulhuw %%xmm2,%%xmm1 \n"
2653 "movdqa (%0),%%xmm2 \n"
2654 "pand %%xmm4,%%xmm2 \n"
2655 "packuswb %%xmm1,%%xmm0 \n"
2656 "por %%xmm2,%%xmm0 \n"
2657 "sub $0x4,%2 \n"
2658 "movdqa %%xmm0,(%0,%1,1) \n"
2659 "lea 0x10(%0),%0 \n"
2660 "jg 1b \n"
2661 : "+r"(src_argb), // %0
2662 "+r"(dst_argb), // %1
2663 "+r"(width), // %2
2664 "+r"(alpha) // %3
2665 : "r"(fixed_invtbl8) // %4
2666 : "memory", "cc"
2667#if defined(__SSE2__)
2668 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2669#endif
2670 );
2671}
2672#endif // HAS_ARGBUNATTENUATE_SSE2
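// A scalar sketch of unattenuation, the inverse of attenuation: each color
// channel is scaled back up by 255 / alpha and clamped, and alpha is
// preserved. The SSE2 version gets the reciprocal from fixed_invtbl8, a
// fixed-point table defined elsewhere in the library and not shown in this
// file. Illustrative only.
static void ARGBUnattenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                        int width) {
  for (int x = 0; x < width; ++x) {
    const uint8* p = src_argb + x * 4;
    int a = p[3];
    for (int c = 0; c < 3; ++c) {  // B, G, R
      int v = a ? p[c] * 255 / a : p[c];
      dst_argb[x * 4 + c] = static_cast<uint8>(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = static_cast<uint8>(a);  // A unchanged
  }
}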
2673
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002674#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00002675// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
2676CONST vec8 kARGBToGray = {
2677 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
2678};
2679
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002680// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2681void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) {
2682 asm volatile (
2683 "movdqa %2,%%xmm4 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002684 // 8 pixel loop
2685 ".p2align 4 \n"
2686 "1: \n"
2687 "movdqa (%0),%%xmm0 \n"
2688 "movdqa 0x10(%0),%%xmm1 \n"
2689 "pmaddubsw %%xmm4,%%xmm0 \n"
2690 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002691 "phaddw %%xmm1,%%xmm0 \n"
2692 "psrlw $0x7,%%xmm0 \n"
2693 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002694 "movdqa (%0),%%xmm2 \n"
2695 "movdqa 0x10(%0),%%xmm3 \n"
2696 "psrld $0x18,%%xmm2 \n"
2697 "psrld $0x18,%%xmm3 \n"
2698 "packuswb %%xmm3,%%xmm2 \n"
2699 "packuswb %%xmm2,%%xmm2 \n"
2700 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002701 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002702 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002703 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002704 "punpcklwd %%xmm3,%%xmm0 \n"
2705 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002706 "sub $0x8,%1 \n"
2707 "movdqa %%xmm0,(%0) \n"
2708 "movdqa %%xmm1,0x10(%0) \n"
2709 "lea 0x20(%0),%0 \n"
2710 "jg 1b \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002711 : "+r"(dst_argb), // %0
2712 "+r"(width) // %1
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002713 : "m"(kARGBToGray) // %2
2714 : "memory", "cc"
2715#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00002716 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002717#endif
2718 );
2719}
2720#endif // HAS_ARGBGRAYROW_SSSE3
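// A scalar sketch of the gray conversion: kARGBToGray weights B, G and R by
// 14/128, 76/128 and 38/128 (roughly 0.11, 0.59 and 0.30), applied with
// pmaddubsw and a shift by 7; the gray value replaces B, G and R and alpha
// is preserved. In-place, like the SSSE3 version. Illustrative only.
static void ARGBGrayRow_C_Sketch(uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8* p = dst_argb + x * 4;
    uint8 gray = static_cast<uint8>((p[0] * 14 + p[1] * 76 + p[2] * 38) >> 7);
    p[0] = gray;  // B
    p[1] = gray;  // G
    p[2] = gray;  // R (p[3], the alpha, is left untouched)
  }
}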
fbarchard@google.com221e6022012-05-21 22:24:41 +00002721
2722#ifdef HAS_ARGBSEPIAROW_SSSE3
2723// b = (r * 35 + g * 68 + b * 17) >> 7
2724// g = (r * 45 + g * 88 + b * 22) >> 7
2725// r = (r * 50 + g * 98 + b * 24) >> 7
2726// Constant for ARGB color to sepia tone
2727CONST vec8 kARGBToSepiaB = {
2728 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
2729};
2730
2731CONST vec8 kARGBToSepiaG = {
2732 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
2733};
2734
2735CONST vec8 kARGBToSepiaR = {
2736 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
2737};
2738
 2739// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels
2740void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
2741 asm volatile (
2742 "movdqa %2,%%xmm2 \n"
2743 "movdqa %3,%%xmm3 \n"
2744 "movdqa %4,%%xmm4 \n"
 2745 // 8 pixel loop
2746 ".p2align 4 \n"
2747 "1: \n"
2748 "movdqa (%0),%%xmm0 \n"
2749 "movdqa 0x10(%0),%%xmm6 \n"
2750 "pmaddubsw %%xmm2,%%xmm0 \n"
2751 "pmaddubsw %%xmm2,%%xmm6 \n"
2752 "phaddw %%xmm6,%%xmm0 \n"
2753 "psrlw $0x7,%%xmm0 \n"
2754 "packuswb %%xmm0,%%xmm0 \n"
2755 "movdqa (%0),%%xmm5 \n"
2756 "movdqa 0x10(%0),%%xmm1 \n"
2757 "pmaddubsw %%xmm3,%%xmm5 \n"
2758 "pmaddubsw %%xmm3,%%xmm1 \n"
2759 "phaddw %%xmm1,%%xmm5 \n"
2760 "psrlw $0x7,%%xmm5 \n"
2761 "packuswb %%xmm5,%%xmm5 \n"
2762 "punpcklbw %%xmm5,%%xmm0 \n"
2763 "movdqa (%0),%%xmm5 \n"
2764 "movdqa 0x10(%0),%%xmm1 \n"
2765 "pmaddubsw %%xmm4,%%xmm5 \n"
2766 "pmaddubsw %%xmm4,%%xmm1 \n"
2767 "phaddw %%xmm1,%%xmm5 \n"
2768 "psrlw $0x7,%%xmm5 \n"
2769 "packuswb %%xmm5,%%xmm5 \n"
2770 "movdqa (%0),%%xmm6 \n"
2771 "movdqa 0x10(%0),%%xmm1 \n"
2772 "psrld $0x18,%%xmm6 \n"
2773 "psrld $0x18,%%xmm1 \n"
2774 "packuswb %%xmm1,%%xmm6 \n"
2775 "packuswb %%xmm6,%%xmm6 \n"
2776 "punpcklbw %%xmm6,%%xmm5 \n"
2777 "movdqa %%xmm0,%%xmm1 \n"
2778 "punpcklwd %%xmm5,%%xmm0 \n"
2779 "punpckhwd %%xmm5,%%xmm1 \n"
2780 "sub $0x8,%1 \n"
2781 "movdqa %%xmm0,(%0) \n"
2782 "movdqa %%xmm1,0x10(%0) \n"
2783 "lea 0x20(%0),%0 \n"
2784 "jg 1b \n"
2785 : "+r"(dst_argb), // %0
2786 "+r"(width) // %1
2787 : "m"(kARGBToSepiaB), // %2
2788 "m"(kARGBToSepiaG), // %3
2789 "m"(kARGBToSepiaR) // %4
2790 : "memory", "cc"
2791#if defined(__SSE2__)
2792 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2793#endif
2794 );
2795}
2796#endif // HAS_ARGBSEPIAROW_SSSE3
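// A scalar sketch of the sepia transform using the same weights as the
// kARGBToSepia* tables above. The G and R weight sums exceed 128, so the
// >> 7 results can pass 255 and are clamped, matching packuswb saturation.
// In-place, alpha preserved. Illustrative only.
static void ARGBSepiaRow_C_Sketch(uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8* p = dst_argb + x * 4;  // B, G, R, A
    int b = (p[2] * 35 + p[1] * 68 + p[0] * 17) >> 7;
    int g = (p[2] * 45 + p[1] * 88 + p[0] * 22) >> 7;
    int r = (p[2] * 50 + p[1] * 98 + p[0] * 24) >> 7;
    p[0] = static_cast<uint8>(b > 255 ? 255 : b);
    p[1] = static_cast<uint8>(g > 255 ? 255 : g);
    p[2] = static_cast<uint8>(r > 255 ? 255 : r);
  }
}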
2797
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002798#endif // defined(__x86_64__) || defined(__i386__)
2799
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002800#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002801} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002802} // namespace libyuv
2803#endif