blob: dadd7bc384b162424d5a90c5d389a1f306113083 [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
fbarchard@google.comb0c97972012-08-08 19:04:24 +00002 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
fbarchard@google.comcde58702013-01-28 00:02:35 +00007 * in the file PATENTS. All contributing project authors may
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00008 * be found in the AUTHORS file in the root of the source tree.
9 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
fbarchard@google.com714050a2012-02-17 22:59:56 +000023// GCC 4.2 on OSX has link error when passing static or const to inline.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
// CONST is the declaration prefix used by every constant table in this file.
// On Apple builds it expands to nothing, so the tables are plain (non-static,
// non-const) definitions; everywhere else they are `static const`.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000025#ifdef __APPLE__
26#define CONST
27#else
28#define CONST static const
29#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
// Signed per-byte weights, one 4-byte group per pixel, repeated for 4 pixels.
// ARGBToYRow_SSSE3 feeds kARGBToY to pmaddubsw; ARGBToUVRow_SSSE3 does the
// same with kARGBToU and kARGBToV. Byte 3 of every pixel has weight 0, so it
// never contributes to the result.
// NOTE(review): bytes 0..2 are presumably B, G, R in memory order - confirm.
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
// Same weight values as the kARGBTo* tables, but in fully reversed byte order
// within each 4-byte pixel: byte 0 carries the zero weight here.
// The code that consumes these tables is not in this part of the file.
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
// Same weight values as the kARGBTo* tables with the weights of bytes 0 and 2
// exchanged; byte 3 still carries the zero weight.
// The code that consumes these tables is not in this part of the file.
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
fbarchard@google.com4de0c432012-10-11 01:25:46 +000072// Constants for RGBA.
// Same weight values as the kARGBTo* tables shifted up by one byte: byte 0
// carries the zero weight and bytes 1..3 carry the kARGBTo* weights of
// bytes 0..2.
// The code that consumes these tables is not in this part of the file.
73CONST vec8 kRGBAToY = {
74 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
75};
76
77CONST vec8 kRGBAToU = {
78 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
79};
80
81CONST vec8 kRGBAToV = {
82 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
83};
84
// Byte-wise bias added (paddb) to every packed Y result in the ARGBToYRow
// functions.
fbarchard@google.com714050a2012-02-17 22:59:56 +000085CONST uvec8 kAddY16 = {
86 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000087};
fbarchard@google.com2430e042011-11-11 21:57:06 +000088
// Byte-wise bias added (paddb) to the packed signed U and V results in
// ARGBToUVRow_SSSE3, moving them from the signed range to 0..255.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000089CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
92};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000093
fbarchard@google.comba1f5262012-01-12 19:22:41 +000094// Shuffle table for converting RGB24 to ARGB.
// pshufb control: spreads the first 12 source bytes (four 3-byte pixels) over
// four 4-byte pixels. Byte 3 of each output pixel is taken from source bytes
// 12..15; RGB24ToARGBRow_SSSE3 then ORs 0xff into that byte, so the value
// picked here does not matter.
95CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000096 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
97};
98
99// Shuffle table for converting RAW to ARGB.
// Same layout as kShuffleMaskRGB24ToARGB, but the three colour bytes of each
// pixel are written in reverse order.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000100CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000101 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
102};
103
fbarchard@google.comb6149762011-11-07 21:58:52 +0000104// Shuffle table for converting ABGR to ARGB.
// pshufb control: within each 4-byte pixel, swaps bytes 0 and 2 and leaves
// bytes 1 and 3 in place.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000105CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000106 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
107};
108
109// Shuffle table for converting BGRA to ARGB.
// Reverses the 4 bytes of every pixel.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000110CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
112};
113
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000114// Shuffle table for converting RGBA to ARGB.
// Rotates each pixel by one byte: source bytes 1,2,3,0 become bytes 0,1,2,3.
115CONST uvec8 kShuffleMaskRGBAToARGB = {
116 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
117};
118
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000119// Shuffle table for converting ARGB to RGBA.
// Inverse rotation of kShuffleMaskRGBAToARGB: source bytes 3,0,1,2 become
// bytes 0,1,2,3.
120CONST uvec8 kShuffleMaskARGBToRGBA = {
121 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
122};
123
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000124// Shuffle table for converting ARGB to RGB24.
// pshufb control: keeps bytes 0..2 of each of the four 4-byte pixels, packing
// them into the low 12 bytes. The 128u entries have the high bit set, which
// makes pshufb write zero to those destination bytes.
125CONST uvec8 kShuffleMaskARGBToRGB24 = {
126 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
127};
128
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000129// Shuffle table for converting ARGB to RAW.
// Same packing as kShuffleMaskARGBToRGB24 with the three kept bytes of each
// pixel reversed.
130CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000131 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000132};
133
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000134// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
// The 12 packed bytes are split: 8 go to output bytes 0..7, bytes 8..11 are
// zeroed (128u), and the remaining 4 go to output bytes 12..15.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000135CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
137};
138
139// Shuffle table for converting ARGB to RAW, split like
// kShuffleMaskARGBToRGB24_0: first 8 bytes, a zeroed 4-byte gap, then the
// next 4 bytes.
140CONST uvec8 kShuffleMaskARGBToRAW_0 = {
141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142};
143
// Expands a row of single-byte samples into 4-byte pixels: each source byte
// is replicated into bytes 0..2 of its pixel and byte 3 is forced to 0xff.
//   src_y    - source bytes, read 8 at a time (movq, no alignment needed).
//   dst_argb - destination, written 32 bytes at a time with movdqa, so it
//              must be 16-byte aligned.
//   pix      - pixel count. The loop is bottom-tested and steps by 8, so it
//              always runs at least once and pix is effectively rounded up to
//              a multiple of 8.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000144void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000145 asm volatile (
// xmm5 = 0xff000000 in every dword (mask for byte 3 of each pixel).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pcmpeqb %%xmm5,%%xmm5 \n"
147 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000148 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "1: \n"
150 "movq (%0),%%xmm0 \n"
151 "lea 0x8(%0),%0 \n"
// Byte -> word -> dword self-interleave replicates every source byte 4 times:
// xmm0 gets pixels 0..3, xmm1 gets pixels 4..7.
152 "punpcklbw %%xmm0,%%xmm0 \n"
153 "movdqa %%xmm0,%%xmm1 \n"
154 "punpcklwd %%xmm0,%%xmm0 \n"
155 "punpckhwd %%xmm1,%%xmm1 \n"
156 "por %%xmm5,%%xmm0 \n"
157 "por %%xmm5,%%xmm1 \n"
158 "movdqa %%xmm0,(%1) \n"
159 "movdqa %%xmm1,0x10(%1) \n"
// lea does not touch the flags; jg below tests the result of the sub.
160 "lea 0x20(%1),%1 \n"
161 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000162 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163 : "+r"(src_y), // %0
164 "+r"(dst_argb), // %1
165 "+r"(pix) // %2
166 :
167 : "memory", "cc"
168#if defined(__SSE2__)
169 , "xmm0", "xmm1", "xmm5"
170#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000171 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000172}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173
// Same conversion as I400ToARGBRow_SSE2 (each source byte replicated into
// bytes 0..2 of a 4-byte pixel, byte 3 set to 0xff), but the stores use
// movdqu, so dst_argb may have any alignment.
//   pix - pixel count; the bottom-tested loop handles 8 pixels per pass and
//         always runs at least once.
fbarchard@google.com00b69a22012-11-02 06:03:28 +0000174void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
175 int pix) {
176 asm volatile (
// xmm5 = 0xff000000 in every dword (mask for byte 3 of each pixel).
177 "pcmpeqb %%xmm5,%%xmm5 \n"
178 "pslld $0x18,%%xmm5 \n"
179 ".p2align 4 \n"
180 "1: \n"
181 "movq (%0),%%xmm0 \n"
182 "lea 0x8(%0),%0 \n"
// Replicate each of the 8 source bytes 4 times (xmm0: pixels 0..3,
// xmm1: pixels 4..7).
183 "punpcklbw %%xmm0,%%xmm0 \n"
184 "movdqa %%xmm0,%%xmm1 \n"
185 "punpcklwd %%xmm0,%%xmm0 \n"
186 "punpckhwd %%xmm1,%%xmm1 \n"
187 "por %%xmm5,%%xmm0 \n"
188 "por %%xmm5,%%xmm1 \n"
// Unaligned stores: the only difference from the aligned variant.
189 "movdqu %%xmm0,(%1) \n"
190 "movdqu %%xmm1,0x10(%1) \n"
191 "lea 0x20(%1),%1 \n"
192 "sub $0x8,%2 \n"
193 "jg 1b \n"
194 : "+r"(src_y), // %0
195 "+r"(dst_argb), // %1
196 "+r"(pix) // %2
197 :
198 : "memory", "cc"
199#if defined(__SSE2__)
200 , "xmm0", "xmm1", "xmm5"
201#endif
202 );
203}
204
// Reorders the bytes of each 4-byte pixel with pshufb using
// kShuffleMaskABGRToARGB (bytes 0 and 2 swapped), 4 pixels per loop pass.
//   src_abgr, dst_argb - both accessed with movdqa, so both must be 16-byte
//                        aligned.
//   pix                - pixel count; the bottom-tested loop steps by 4 and
//                        always runs at least once.
// NOTE(review): the movdqa load of %3 assumes the table is 16-byte aligned;
// the uvec8 declaration is not visible here - confirm.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000205void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000206 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000207 "movdqa %3,%%xmm5 \n"
// %1 becomes (dst - src) so a single advancing pointer (%0) addresses both
// buffers: the store below goes to (%0,%1,1) == dst.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000208 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000209 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000210 "1: \n"
211 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000212 "pshufb %%xmm5,%%xmm0 \n"
// The flags set by this sub are consumed by jg; the movdqa and lea in
// between do not modify them.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000213 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000214 "movdqa %%xmm0,(%0,%1,1) \n"
215 "lea 0x10(%0),%0 \n"
216 "jg 1b \n"
217
fbarchard@google.comb6149762011-11-07 21:58:52 +0000218 : "+r"(src_abgr), // %0
219 "+r"(dst_argb), // %1
220 "+r"(pix) // %2
221 : "m"(kShuffleMaskABGRToARGB) // %3
222 : "memory", "cc"
223#if defined(__SSE2__)
224 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000225#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000226 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000227}
228
// Reorders the bytes of each 4-byte pixel with pshufb using
// kShuffleMaskBGRAToARGB (all 4 bytes reversed), 4 pixels per loop pass.
//   src_bgra, dst_argb - both accessed with movdqa, so both must be 16-byte
//                        aligned.
//   pix                - pixel count; the bottom-tested loop steps by 4 and
//                        always runs at least once.
229void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000230 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000231 "movdqa %3,%%xmm5 \n"
// %1 becomes (dst - src); the store to (%0,%1,1) therefore lands at dst.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000232 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000233 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000234 "1: \n"
235 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000236 "pshufb %%xmm5,%%xmm0 \n"
// Flags from this sub feed jg; movdqa and lea leave them untouched.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000237 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000238 "movdqa %%xmm0,(%0,%1,1) \n"
239 "lea 0x10(%0),%0 \n"
240 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000241 : "+r"(src_bgra), // %0
242 "+r"(dst_argb), // %1
243 "+r"(pix) // %2
244 : "m"(kShuffleMaskBGRAToARGB) // %3
245 : "memory", "cc"
246#if defined(__SSE2__)
247 , "xmm0", "xmm5"
248#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000249 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000250}
251
// Reorders the bytes of each 4-byte pixel with pshufb using
// kShuffleMaskRGBAToARGB (source bytes 1,2,3,0 -> 0,1,2,3), 4 pixels per pass.
//   src_rgba, dst_argb - both accessed with movdqa, so both must be 16-byte
//                        aligned.
//   pix                - pixel count; the bottom-tested loop steps by 4 and
//                        always runs at least once.
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000252void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
253 asm volatile (
254 "movdqa %3,%%xmm5 \n"
// %1 becomes (dst - src); the store to (%0,%1,1) therefore lands at dst.
255 "sub %0,%1 \n"
256 ".p2align 4 \n"
257 "1: \n"
258 "movdqa (%0),%%xmm0 \n"
259 "pshufb %%xmm5,%%xmm0 \n"
// Flags from this sub feed jg; movdqa and lea leave them untouched.
260 "sub $0x4,%2 \n"
261 "movdqa %%xmm0,(%0,%1,1) \n"
262 "lea 0x10(%0),%0 \n"
263 "jg 1b \n"
264
265 : "+r"(src_rgba), // %0
266 "+r"(dst_argb), // %1
267 "+r"(pix) // %2
268 : "m"(kShuffleMaskRGBAToARGB) // %3
269 : "memory", "cc"
270#if defined(__SSE2__)
271 , "xmm0", "xmm5"
272#endif
273 );
274}
275
// Reorders the bytes of each 4-byte pixel with pshufb using
// kShuffleMaskARGBToRGBA (source bytes 3,0,1,2 -> 0,1,2,3), 4 pixels per pass.
//   src_argb, dst_rgba - both accessed with movdqa, so both must be 16-byte
//                        aligned.
//   pix                - pixel count; the bottom-tested loop steps by 4 and
//                        always runs at least once.
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000276void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
277 asm volatile (
278 "movdqa %3,%%xmm5 \n"
// %1 becomes (dst - src); the store to (%0,%1,1) therefore lands at dst.
279 "sub %0,%1 \n"
280 ".p2align 4 \n"
281 "1: \n"
282 "movdqa (%0),%%xmm0 \n"
283 "pshufb %%xmm5,%%xmm0 \n"
// Flags from this sub feed jg; movdqa and lea leave them untouched.
284 "sub $0x4,%2 \n"
285 "movdqa %%xmm0,(%0,%1,1) \n"
286 "lea 0x10(%0),%0 \n"
287 "jg 1b \n"
288
289 : "+r"(src_argb), // %0
290 "+r"(dst_rgba), // %1
291 "+r"(pix) // %2
292 : "m"(kShuffleMaskARGBToRGBA) // %3
293 : "memory", "cc"
294#if defined(__SSE2__)
295 , "xmm0", "xmm5"
296#endif
297 );
298}
299
// Expands 3-byte pixels to 4-byte pixels, 16 pixels (48 bytes in, 64 bytes
// out) per loop pass. The three source bytes keep their order and byte 3 of
// every output pixel is set to 0xff.
//   src_rgb24 - read with movdqu (any alignment).
//   dst_argb  - written with movdqa, so it must be 16-byte aligned.
//   pix       - pixel count; the bottom-tested loop steps by 16 and always
//               runs at least once.
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000300void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000301 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000302 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
303 "pslld $0x18,%%xmm5 \n"
304 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000305 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000306 "1: \n"
307 "movdqu (%0),%%xmm0 \n"
308 "movdqu 0x10(%0),%%xmm1 \n"
309 "movdqu 0x20(%0),%%xmm3 \n"
310 "lea 0x30(%0),%0 \n"
// Each palignr moves one 12-byte group of 4 pixels to the bottom of a
// register so the shared shuffle mask in xmm4 can expand it.
// xmm2 = source bytes 24..39 (third group of 4 pixels).
311 "movdqa %%xmm3,%%xmm2 \n"
312 "palignr $0x8,%%xmm1,%%xmm2 \n"
313 "pshufb %%xmm4,%%xmm2 \n"
314 "por %%xmm5,%%xmm2 \n"
// xmm1 = source bytes 12..27 (second group); xmm0 already starts at byte 0.
315 "palignr $0xc,%%xmm0,%%xmm1 \n"
316 "pshufb %%xmm4,%%xmm0 \n"
317 "movdqa %%xmm2,0x20(%1) \n"
318 "por %%xmm5,%%xmm0 \n"
319 "pshufb %%xmm4,%%xmm1 \n"
320 "movdqa %%xmm0,(%1) \n"
321 "por %%xmm5,%%xmm1 \n"
// Rotate xmm3 by 4 bytes so source bytes 36..47 (fourth group) come first.
322 "palignr $0x4,%%xmm3,%%xmm3 \n"
323 "pshufb %%xmm4,%%xmm3 \n"
324 "movdqa %%xmm1,0x10(%1) \n"
325 "por %%xmm5,%%xmm3 \n"
// Flags from this sub feed jg; the movdqa and lea after it do not change them.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000326 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000327 "movdqa %%xmm3,0x30(%1) \n"
328 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000329 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000330 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000331 "+r"(dst_argb), // %1
332 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000333 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000334 : "memory", "cc"
335#if defined(__SSE2__)
336 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
337#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000338 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000339}
340
// Expands 3-byte pixels to 4-byte pixels, 16 pixels (48 bytes in, 64 bytes
// out) per loop pass. Identical instruction sequence to RGB24ToARGBRow_SSSE3,
// but kShuffleMaskRAWToARGB also reverses the three source bytes of each
// pixel. Byte 3 of every output pixel is set to 0xff.
//   src_raw  - read with movdqu (any alignment).
//   dst_argb - written with movdqa, so it must be 16-byte aligned.
//   pix      - pixel count; the bottom-tested loop steps by 16 and always
//              runs at least once.
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000341void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000342 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000343 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
344 "pslld $0x18,%%xmm5 \n"
345 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000346 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000347 "1: \n"
348 "movdqu (%0),%%xmm0 \n"
349 "movdqu 0x10(%0),%%xmm1 \n"
350 "movdqu 0x20(%0),%%xmm3 \n"
351 "lea 0x30(%0),%0 \n"
// xmm2 = source bytes 24..39 (third group of 4 pixels).
352 "movdqa %%xmm3,%%xmm2 \n"
353 "palignr $0x8,%%xmm1,%%xmm2 \n"
354 "pshufb %%xmm4,%%xmm2 \n"
355 "por %%xmm5,%%xmm2 \n"
// xmm1 = source bytes 12..27 (second group); xmm0 already starts at byte 0.
356 "palignr $0xc,%%xmm0,%%xmm1 \n"
357 "pshufb %%xmm4,%%xmm0 \n"
358 "movdqa %%xmm2,0x20(%1) \n"
359 "por %%xmm5,%%xmm0 \n"
360 "pshufb %%xmm4,%%xmm1 \n"
361 "movdqa %%xmm0,(%1) \n"
362 "por %%xmm5,%%xmm1 \n"
// Rotate xmm3 by 4 bytes so source bytes 36..47 (fourth group) come first.
363 "palignr $0x4,%%xmm3,%%xmm3 \n"
364 "pshufb %%xmm4,%%xmm3 \n"
365 "movdqa %%xmm1,0x10(%1) \n"
366 "por %%xmm5,%%xmm3 \n"
// Flags from this sub feed jg; the movdqa and lea after it do not change them.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000367 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000368 "movdqa %%xmm3,0x30(%1) \n"
369 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000370 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000371 : "+r"(src_raw), // %0
372 "+r"(dst_argb), // %1
373 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000374 : "m"(kShuffleMaskRAWToARGB) // %3
375 : "memory", "cc"
376#if defined(__SSE2__)
377 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
378#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000379 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000380}
381
// Expands 16-bit 5:6:5 pixels to 4-byte pixels, 8 pixels (16 bytes in,
// 32 bytes out) per loop pass. For each 16-bit source word:
//   output byte 0 = bits 0..4   widened to 8 bits,
//   output byte 1 = bits 5..10  widened to 8 bits,
//   output byte 2 = bits 11..15 widened to 8 bits,
//   output byte 3 = 0xff.
// Widening replicates the top bits of the field into the low bits (done with
// pmulhuw by 0x0108 for 5-bit fields and 0x2080 for the 6-bit field).
//   src - read with movdqu (any alignment).
//   dst - written with movdqa, so it must be 16-byte aligned.
//   pix - pixel count; the bottom-tested loop steps by 8 and always runs at
//         least once.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000382void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000383 asm volatile (
// xmm5 = 0x0108 in every word: 5-bit -> 8-bit multiplier.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000384 "mov $0x1080108,%%eax \n"
385 "movd %%eax,%%xmm5 \n"
386 "pshufd $0x0,%%xmm5,%%xmm5 \n"
// xmm6 = 0x2080 in every word: 6-bit -> 8-bit multiplier.
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000387 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000388 "movd %%eax,%%xmm6 \n"
389 "pshufd $0x0,%%xmm6,%%xmm6 \n"
// xmm3 = 0xf800 (bits 11..15), xmm4 = 0x07e0 (bits 5..10), xmm7 = 0xff00.
390 "pcmpeqb %%xmm3,%%xmm3 \n"
391 "psllw $0xb,%%xmm3 \n"
392 "pcmpeqb %%xmm4,%%xmm4 \n"
393 "psllw $0xa,%%xmm4 \n"
394 "psrlw $0x5,%%xmm4 \n"
395 "pcmpeqb %%xmm7,%%xmm7 \n"
396 "psllw $0x8,%%xmm7 \n"
// Subtracting src twice makes %1 = dst - 2*src, so (%1,%0,2) addresses dst
// while only %0 advances (output grows twice as fast as input).
397 "sub %0,%1 \n"
398 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000399 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000400 "1: \n"
401 "movdqu (%0),%%xmm0 \n"
402 "movdqa %%xmm0,%%xmm1 \n"
403 "movdqa %%xmm0,%%xmm2 \n"
// xmm1: bits 11..15 -> high byte of word; xmm2: bits 0..4 -> low byte.
404 "pand %%xmm3,%%xmm1 \n"
405 "psllw $0xb,%%xmm2 \n"
406 "pmulhuw %%xmm5,%%xmm1 \n"
407 "pmulhuw %%xmm5,%%xmm2 \n"
408 "psllw $0x8,%%xmm1 \n"
409 "por %%xmm2,%%xmm1 \n"
// xmm0: bits 5..10 -> low byte of word, 0xff -> high byte.
410 "pand %%xmm4,%%xmm0 \n"
411 "pmulhuw %%xmm6,%%xmm0 \n"
412 "por %%xmm7,%%xmm0 \n"
// Byte-interleave the two word streams into 4-byte pixels.
413 "movdqa %%xmm1,%%xmm2 \n"
414 "punpcklbw %%xmm0,%%xmm1 \n"
415 "punpckhbw %%xmm0,%%xmm2 \n"
416 "movdqa %%xmm1,(%1,%0,2) \n"
417 "movdqa %%xmm2,0x10(%1,%0,2) \n"
418 "lea 0x10(%0),%0 \n"
419 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000420 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000421 : "+r"(src), // %0
422 "+r"(dst), // %1
423 "+r"(pix) // %2
424 :
// eax is listed because it is used as scratch to build the constants above.
425 : "memory", "cc", "eax"
426#if defined(__SSE2__)
427 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
428#endif
429 );
430}
431
// Expands 16-bit 1:5:5:5 pixels to 4-byte pixels, 8 pixels (16 bytes in,
// 32 bytes out) per loop pass. For each 16-bit source word:
//   output byte 0 = bits 0..4   widened to 8 bits,
//   output byte 1 = bits 5..9   widened to 8 bits,
//   output byte 2 = bits 10..14 widened to 8 bits,
//   output byte 3 = 0xff if bit 15 is set, otherwise 0x00.
//   src - read with movdqu (any alignment).
//   dst - written with movdqa, so it must be 16-byte aligned.
//   pix - pixel count; the bottom-tested loop steps by 8 and always runs at
//         least once.
432void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000433 asm volatile (
// xmm5 = 0x0108 per word: 5-bit -> 8-bit multiplier for fields placed at the
// top of the word.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000434 "mov $0x1080108,%%eax \n"
435 "movd %%eax,%%xmm5 \n"
436 "pshufd $0x0,%%xmm5,%%xmm5 \n"
// xmm6 = 0x4200 per word: 5-bit -> 8-bit multiplier for the field at bits 5..9.
437 "mov $0x42004200,%%eax \n"
438 "movd %%eax,%%xmm6 \n"
439 "pshufd $0x0,%%xmm6,%%xmm6 \n"
// xmm3 = 0xf800, xmm4 = 0x03e0 (bits 5..9), xmm7 = 0xff00.
440 "pcmpeqb %%xmm3,%%xmm3 \n"
441 "psllw $0xb,%%xmm3 \n"
442 "movdqa %%xmm3,%%xmm4 \n"
443 "psrlw $0x6,%%xmm4 \n"
444 "pcmpeqb %%xmm7,%%xmm7 \n"
445 "psllw $0x8,%%xmm7 \n"
// %1 = dst - 2*src, so (%1,%0,2) addresses dst while only %0 advances.
446 "sub %0,%1 \n"
447 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000448 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000449 "1: \n"
450 "movdqu (%0),%%xmm0 \n"
451 "movdqa %%xmm0,%%xmm1 \n"
452 "movdqa %%xmm0,%%xmm2 \n"
// xmm1: bits 10..14 moved to the top of the word; xmm2: bits 0..4 likewise.
453 "psllw $0x1,%%xmm1 \n"
454 "psllw $0xb,%%xmm2 \n"
455 "pand %%xmm3,%%xmm1 \n"
456 "pmulhuw %%xmm5,%%xmm2 \n"
457 "pmulhuw %%xmm5,%%xmm1 \n"
458 "psllw $0x8,%%xmm1 \n"
459 "por %%xmm2,%%xmm1 \n"
460 "movdqa %%xmm0,%%xmm2 \n"
461 "pand %%xmm4,%%xmm0 \n"
// Arithmetic shift smears bit 15 across the high byte: 0xff or 0x00 after the
// pand with xmm7.
462 "psraw $0x8,%%xmm2 \n"
463 "pmulhuw %%xmm6,%%xmm0 \n"
464 "pand %%xmm7,%%xmm2 \n"
465 "por %%xmm2,%%xmm0 \n"
// Byte-interleave the two word streams into 4-byte pixels.
466 "movdqa %%xmm1,%%xmm2 \n"
467 "punpcklbw %%xmm0,%%xmm1 \n"
468 "punpckhbw %%xmm0,%%xmm2 \n"
469 "movdqa %%xmm1,(%1,%0,2) \n"
470 "movdqa %%xmm2,0x10(%1,%0,2) \n"
471 "lea 0x10(%0),%0 \n"
472 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000473 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000474 : "+r"(src), // %0
475 "+r"(dst), // %1
476 "+r"(pix) // %2
477 :
478 : "memory", "cc", "eax"
479#if defined(__SSE2__)
480 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
481#endif
482 );
483}
484
// Expands 16-bit 4:4:4:4 pixels to 4-byte pixels, 8 pixels (16 bytes in,
// 32 bytes out) per loop pass. Every 4-bit nibble n becomes the byte
// (n << 4) | n; nibbles are emitted lowest first, so source byte k yields
// output bytes 2k (low nibble) and 2k+1 (high nibble).
//   src - read with movdqu (any alignment).
//   dst - written with movdqa, so it must be 16-byte aligned.
//   pix - pixel count; the bottom-tested loop steps by 8 and always runs at
//         least once.
485void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000486 asm volatile (
// xmm4 = 0x0f in every byte (low-nibble mask); xmm5 = 0xf0 in every byte.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000487 "mov $0xf0f0f0f,%%eax \n"
488 "movd %%eax,%%xmm4 \n"
489 "pshufd $0x0,%%xmm4,%%xmm4 \n"
490 "movdqa %%xmm4,%%xmm5 \n"
491 "pslld $0x4,%%xmm5 \n"
// %1 = dst - 2*src, so (%1,%0,2) addresses dst while only %0 advances.
492 "sub %0,%1 \n"
493 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000494 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000495 "1: \n"
496 "movdqu (%0),%%xmm0 \n"
497 "movdqa %%xmm0,%%xmm2 \n"
498 "pand %%xmm4,%%xmm0 \n"
499 "pand %%xmm5,%%xmm2 \n"
500 "movdqa %%xmm0,%%xmm1 \n"
501 "movdqa %%xmm2,%%xmm3 \n"
// Duplicate each nibble into both halves of its byte. The word shifts cannot
// leak across bytes because the operands were masked per byte first.
502 "psllw $0x4,%%xmm1 \n"
503 "psrlw $0x4,%%xmm3 \n"
504 "por %%xmm1,%%xmm0 \n"
505 "por %%xmm3,%%xmm2 \n"
// Interleave low-nibble bytes (xmm0) with high-nibble bytes (xmm2).
506 "movdqa %%xmm0,%%xmm1 \n"
507 "punpcklbw %%xmm2,%%xmm0 \n"
508 "punpckhbw %%xmm2,%%xmm1 \n"
509 "movdqa %%xmm0,(%1,%0,2) \n"
510 "movdqa %%xmm1,0x10(%1,%0,2) \n"
511 "lea 0x10(%0),%0 \n"
512 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000513 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000514 : "+r"(src), // %0
515 "+r"(dst), // %1
516 "+r"(pix) // %2
517 :
518 : "memory", "cc", "eax"
519#if defined(__SSE2__)
520 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
521#endif
522 );
523}
524
// Packs 4-byte pixels into 3-byte pixels by dropping byte 3 of each pixel,
// 16 pixels (64 bytes in, 48 bytes out) per loop pass.
//   src, dst - both accessed with movdqa, so both must be 16-byte aligned.
//   pix      - pixel count; the bottom-tested loop steps by 16 and always
//              runs at least once.
525void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000526 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000527 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000528 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000529 "1: \n"
530 "movdqa (%0),%%xmm0 \n"
531 "movdqa 0x10(%0),%%xmm1 \n"
532 "movdqa 0x20(%0),%%xmm2 \n"
533 "movdqa 0x30(%0),%%xmm3 \n"
534 "lea 0x40(%0),%0 \n"
// After pshufb each register holds 12 packed bytes in its low part and
// 4 zero bytes on top.
535 "pshufb %%xmm6,%%xmm0 \n"
536 "pshufb %%xmm6,%%xmm1 \n"
537 "pshufb %%xmm6,%%xmm2 \n"
538 "pshufb %%xmm6,%%xmm3 \n"
// Stitch the four 12-byte groups into three full 16-byte registers:
// out0 = g0[0..11] + g1[0..3], out1 = g1[4..11] + g2[0..7],
// out2 = g2[8..11] + g3[0..11].
539 "movdqa %%xmm1,%%xmm4 \n"
540 "psrldq $0x4,%%xmm1 \n"
541 "pslldq $0xc,%%xmm4 \n"
542 "movdqa %%xmm2,%%xmm5 \n"
543 "por %%xmm4,%%xmm0 \n"
544 "pslldq $0x8,%%xmm5 \n"
545 "movdqa %%xmm0,(%1) \n"
546 "por %%xmm5,%%xmm1 \n"
547 "psrldq $0x8,%%xmm2 \n"
548 "pslldq $0x4,%%xmm3 \n"
549 "por %%xmm3,%%xmm2 \n"
550 "movdqa %%xmm1,0x10(%1) \n"
551 "movdqa %%xmm2,0x20(%1) \n"
552 "lea 0x30(%1),%1 \n"
553 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000554 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000555 : "+r"(src), // %0
556 "+r"(dst), // %1
557 "+r"(pix) // %2
558 : "m"(kShuffleMaskARGBToRGB24) // %3
559 : "memory", "cc"
560#if defined(__SSE2__)
561 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
562#endif
563 );
564}
565
// Packs 4-byte pixels into 3-byte pixels, 16 pixels (64 bytes in, 48 bytes
// out) per loop pass. Identical instruction sequence to ARGBToRGB24Row_SSSE3,
// but kShuffleMaskARGBToRAW also reverses the three kept bytes of each pixel.
//   src, dst - both accessed with movdqa, so both must be 16-byte aligned.
//   pix      - pixel count; the bottom-tested loop steps by 16 and always
//              runs at least once.
566void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000567 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000568 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000569 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000570 "1: \n"
571 "movdqa (%0),%%xmm0 \n"
572 "movdqa 0x10(%0),%%xmm1 \n"
573 "movdqa 0x20(%0),%%xmm2 \n"
574 "movdqa 0x30(%0),%%xmm3 \n"
575 "lea 0x40(%0),%0 \n"
// After pshufb each register holds 12 packed bytes in its low part and
// 4 zero bytes on top.
576 "pshufb %%xmm6,%%xmm0 \n"
577 "pshufb %%xmm6,%%xmm1 \n"
578 "pshufb %%xmm6,%%xmm2 \n"
579 "pshufb %%xmm6,%%xmm3 \n"
// Stitch the four 12-byte groups into three full 16-byte registers.
580 "movdqa %%xmm1,%%xmm4 \n"
581 "psrldq $0x4,%%xmm1 \n"
582 "pslldq $0xc,%%xmm4 \n"
583 "movdqa %%xmm2,%%xmm5 \n"
584 "por %%xmm4,%%xmm0 \n"
585 "pslldq $0x8,%%xmm5 \n"
586 "movdqa %%xmm0,(%1) \n"
587 "por %%xmm5,%%xmm1 \n"
588 "psrldq $0x8,%%xmm2 \n"
589 "pslldq $0x4,%%xmm3 \n"
590 "por %%xmm3,%%xmm2 \n"
591 "movdqa %%xmm1,0x10(%1) \n"
592 "movdqa %%xmm2,0x20(%1) \n"
593 "lea 0x30(%1),%1 \n"
594 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000595 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000596 : "+r"(src), // %0
597 "+r"(dst), // %1
598 "+r"(pix) // %2
599 : "m"(kShuffleMaskARGBToRAW) // %3
600 : "memory", "cc"
601#if defined(__SSE2__)
602 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
603#endif
604 );
605}
606
// Packs 4-byte pixels into 16-bit 5:6:5 pixels, 4 pixels (16 bytes in,
// 8 bytes out) per loop pass. For each source pixel:
//   bits 0..4   = top 5 bits of byte 0,
//   bits 5..10  = top 6 bits of byte 1,
//   bits 11..15 = top 5 bits of byte 2;
// byte 3 is discarded.
//   src - read with movdqa, so it must be 16-byte aligned.
//   dst - written 8 bytes at a time with movq (no alignment requirement).
//   pix - pixel count; the bottom-tested loop steps by 4 and always runs at
//         least once.
fbarchard@google.comdbcabea2012-10-29 21:20:25 +0000607void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000608 asm volatile (
// Per-dword masks: xmm3 = 0x0000001f, xmm4 = 0x000007e0, xmm5 = 0xfffff800.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000609 "pcmpeqb %%xmm3,%%xmm3 \n"
610 "psrld $0x1b,%%xmm3 \n"
611 "pcmpeqb %%xmm4,%%xmm4 \n"
612 "psrld $0x1a,%%xmm4 \n"
613 "pslld $0x5,%%xmm4 \n"
614 "pcmpeqb %%xmm5,%%xmm5 \n"
615 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000616 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000617 "1: \n"
618 "movdqa (%0),%%xmm0 \n"
619 "movdqa %%xmm0,%%xmm1 \n"
620 "movdqa %%xmm0,%%xmm2 \n"
// Shift byte 3 out, then arithmetic-shift back so the 16-bit result is
// sign-extended to 32 bits; packssdw below then packs it without saturating.
621 "pslld $0x8,%%xmm0 \n"
622 "psrld $0x3,%%xmm1 \n"
623 "psrld $0x5,%%xmm2 \n"
624 "psrad $0x10,%%xmm0 \n"
625 "pand %%xmm3,%%xmm1 \n"
626 "pand %%xmm4,%%xmm2 \n"
627 "pand %%xmm5,%%xmm0 \n"
628 "por %%xmm2,%%xmm1 \n"
629 "por %%xmm1,%%xmm0 \n"
630 "packssdw %%xmm0,%%xmm0 \n"
631 "lea 0x10(%0),%0 \n"
632 "movq %%xmm0,(%1) \n"
633 "lea 0x8(%1),%1 \n"
634 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000635 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000636 : "+r"(src), // %0
637 "+r"(dst), // %1
638 "+r"(pix) // %2
639 :
640 : "memory", "cc"
641#if defined(__SSE2__)
642 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
643#endif
644 );
645}
646
// Packs 4-byte pixels into 16-bit 1:5:5:5 pixels, 4 pixels (16 bytes in,
// 8 bytes out) per loop pass. For each source pixel:
//   bits 0..4   = top 5 bits of byte 0,
//   bits 5..9   = top 5 bits of byte 1,
//   bits 10..14 = top 5 bits of byte 2,
//   bit 15      = top bit of byte 3.
//   src - read with movdqa, so it must be 16-byte aligned.
//   dst - written 8 bytes at a time with movq (no alignment requirement).
//   pix - pixel count; the bottom-tested loop steps by 4 and always runs at
//         least once.
647void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000648 asm volatile (
// Per-dword masks: xmm4 = 0x0000001f, xmm5 = 0x000003e0, xmm6 = 0x00007c00,
// xmm7 = 0xffff8000.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000649 "pcmpeqb %%xmm4,%%xmm4 \n"
650 "psrld $0x1b,%%xmm4 \n"
651 "movdqa %%xmm4,%%xmm5 \n"
652 "pslld $0x5,%%xmm5 \n"
653 "movdqa %%xmm4,%%xmm6 \n"
654 "pslld $0xa,%%xmm6 \n"
655 "pcmpeqb %%xmm7,%%xmm7 \n"
656 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000657 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000658 "1: \n"
659 "movdqa (%0),%%xmm0 \n"
660 "movdqa %%xmm0,%%xmm1 \n"
661 "movdqa %%xmm0,%%xmm2 \n"
662 "movdqa %%xmm0,%%xmm3 \n"
// Arithmetic shift keeps the top bit of byte 3 sign-extended, so packssdw
// below packs the combined value without saturating.
663 "psrad $0x10,%%xmm0 \n"
664 "psrld $0x3,%%xmm1 \n"
665 "psrld $0x6,%%xmm2 \n"
666 "psrld $0x9,%%xmm3 \n"
667 "pand %%xmm7,%%xmm0 \n"
668 "pand %%xmm4,%%xmm1 \n"
669 "pand %%xmm5,%%xmm2 \n"
670 "pand %%xmm6,%%xmm3 \n"
671 "por %%xmm1,%%xmm0 \n"
672 "por %%xmm3,%%xmm2 \n"
673 "por %%xmm2,%%xmm0 \n"
674 "packssdw %%xmm0,%%xmm0 \n"
675 "lea 0x10(%0),%0 \n"
676 "movq %%xmm0,(%1) \n"
677 "lea 0x8(%1),%1 \n"
678 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000679 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000680 : "+r"(src), // %0
681 "+r"(dst), // %1
682 "+r"(pix) // %2
683 :
684 : "memory", "cc"
685#if defined(__SSE2__)
686 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
687#endif
688 );
689}
690
// Packs 4-byte pixels into 16-bit 4:4:4:4 pixels, 4 pixels (16 bytes in,
// 8 bytes out) per loop pass. Each output byte combines the high nibbles of
// two adjacent source bytes: the even byte's nibble in bits 0..3 and the odd
// byte's nibble in bits 4..7.
//   src - read with movdqa, so it must be 16-byte aligned.
//   dst - written 8 bytes at a time with movq (no alignment requirement).
//   pix - pixel count; the bottom-tested loop steps by 4 and always runs at
//         least once.
691void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000692 asm volatile (
// Per-word masks: xmm4 = 0xf000 (high nibble of odd bytes),
// xmm3 = 0x00f0 (high nibble of even bytes).
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000693 "pcmpeqb %%xmm4,%%xmm4 \n"
694 "psllw $0xc,%%xmm4 \n"
695 "movdqa %%xmm4,%%xmm3 \n"
696 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000697 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000698 "1: \n"
699 "movdqa (%0),%%xmm0 \n"
700 "movdqa %%xmm0,%%xmm1 \n"
701 "pand %%xmm3,%%xmm0 \n"
702 "pand %%xmm4,%%xmm1 \n"
// Move both nibbles into the low byte of each word (0x000f and 0x00f0).
703 "psrlq $0x4,%%xmm0 \n"
704 "psrlq $0x8,%%xmm1 \n"
705 "por %%xmm1,%%xmm0 \n"
// Every word is now <= 0xff, so the unsigned-saturating pack is exact.
706 "packuswb %%xmm0,%%xmm0 \n"
707 "lea 0x10(%0),%0 \n"
708 "movq %%xmm0,(%1) \n"
709 "lea 0x8(%1),%1 \n"
710 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000711 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000712 : "+r"(src), // %0
713 "+r"(dst), // %1
714 "+r"(pix) // %2
715 :
716 : "memory", "cc"
717#if defined(__SSE2__)
// NOTE(review): xmm2 is listed but not used by the code above; harmless.
718 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
719#endif
720 );
721}
722
// Computes one output byte per 4-byte source pixel, 16 pixels (64 bytes in,
// 16 bytes out) per loop pass:
//   y = ((w0*b0 + w1*b1 + w2*b2 + w3*b3) >> 7) + 16
// where b0..b3 are the pixel's bytes, w0..w3 are the kARGBToY weights and 16
// comes from kAddY16.
//   src_argb, dst_y - both accessed with movdqa, so both must be 16-byte
//                     aligned.
//   pix             - pixel count; the bottom-tested loop steps by 16 and
//                     always runs at least once.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000723void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000724 asm volatile (
// xmm5 = bias (kAddY16), xmm4 = weights (kARGBToY).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000725 "movdqa %4,%%xmm5 \n"
726 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000727 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000728 "1: \n"
729 "movdqa (%0),%%xmm0 \n"
730 "movdqa 0x10(%0),%%xmm1 \n"
731 "movdqa 0x20(%0),%%xmm2 \n"
732 "movdqa 0x30(%0),%%xmm3 \n"
// pmaddubsw: unsigned pixel byte * signed weight, adjacent pairs summed into
// words; phaddw then adds the two words of each pixel.
733 "pmaddubsw %%xmm4,%%xmm0 \n"
734 "pmaddubsw %%xmm4,%%xmm1 \n"
735 "pmaddubsw %%xmm4,%%xmm2 \n"
736 "pmaddubsw %%xmm4,%%xmm3 \n"
737 "lea 0x40(%0),%0 \n"
738 "phaddw %%xmm1,%%xmm0 \n"
739 "phaddw %%xmm3,%%xmm2 \n"
740 "psrlw $0x7,%%xmm0 \n"
741 "psrlw $0x7,%%xmm2 \n"
742 "packuswb %%xmm2,%%xmm0 \n"
743 "paddb %%xmm5,%%xmm0 \n"
// Flags from this sub feed jg; the movdqa and lea after it do not change them.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000744 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000745 "movdqa %%xmm0,(%1) \n"
746 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000747 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000748 : "+r"(src_argb), // %0
749 "+r"(dst_y), // %1
750 "+r"(pix) // %2
751 : "m"(kARGBToY), // %3
752 "m"(kAddY16) // %4
753 : "memory", "cc"
754#if defined(__SSE2__)
755 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
756#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000757 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000758}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000759
// Same computation as ARGBToYRow_SSSE3:
//   y = ((w0*b0 + w1*b1 + w2*b2 + w3*b3) >> 7) + 16
// with weights from kARGBToY and the bias from kAddY16, 16 pixels per pass.
// Pixel loads and the result store use movdqu, so src_argb and dst_y may have
// any alignment. The two constant tables are still loaded with movdqa.
//   pix - pixel count; the bottom-tested loop steps by 16 and always runs at
//         least once.
760void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000761 asm volatile (
// xmm5 = bias (kAddY16), xmm4 = weights (kARGBToY).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000762 "movdqa %4,%%xmm5 \n"
763 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000764 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000765 "1: \n"
766 "movdqu (%0),%%xmm0 \n"
767 "movdqu 0x10(%0),%%xmm1 \n"
768 "movdqu 0x20(%0),%%xmm2 \n"
769 "movdqu 0x30(%0),%%xmm3 \n"
// Weighted sum per pixel: pmaddubsw forms pair sums, phaddw joins the pairs.
770 "pmaddubsw %%xmm4,%%xmm0 \n"
771 "pmaddubsw %%xmm4,%%xmm1 \n"
772 "pmaddubsw %%xmm4,%%xmm2 \n"
773 "pmaddubsw %%xmm4,%%xmm3 \n"
774 "lea 0x40(%0),%0 \n"
775 "phaddw %%xmm1,%%xmm0 \n"
776 "phaddw %%xmm3,%%xmm2 \n"
777 "psrlw $0x7,%%xmm0 \n"
778 "psrlw $0x7,%%xmm2 \n"
779 "packuswb %%xmm2,%%xmm0 \n"
780 "paddb %%xmm5,%%xmm0 \n"
// Flags from this sub feed jg; the movdqu and lea after it do not change them.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000781 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000782 "movdqu %%xmm0,(%1) \n"
783 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000784 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000785 : "+r"(src_argb), // %0
786 "+r"(dst_y), // %1
787 "+r"(pix) // %2
788 : "m"(kARGBToY), // %3
789 "m"(kAddY16) // %4
790 : "memory", "cc"
791#if defined(__SSE2__)
792 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
793#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000794 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000795}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000796
fbarchard@google.com714050a2012-02-17 22:59:56 +0000797// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000798// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
799// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
800// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000801// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000802void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
803 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000804 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000805 "movdqa %0,%%xmm4 \n"
806 "movdqa %1,%%xmm3 \n"
807 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000808 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000809 : "m"(kARGBToU), // %0
810 "m"(kARGBToV), // %1
811 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000812 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000813 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000814 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000815 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000816 "1: \n"
817 "movdqa (%0),%%xmm0 \n"
818 "movdqa 0x10(%0),%%xmm1 \n"
819 "movdqa 0x20(%0),%%xmm2 \n"
820 "movdqa 0x30(%0),%%xmm6 \n"
821 "pavgb (%0,%4,1),%%xmm0 \n"
822 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
823 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
824 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
825 "lea 0x40(%0),%0 \n"
826 "movdqa %%xmm0,%%xmm7 \n"
827 "shufps $0x88,%%xmm1,%%xmm0 \n"
828 "shufps $0xdd,%%xmm1,%%xmm7 \n"
829 "pavgb %%xmm7,%%xmm0 \n"
830 "movdqa %%xmm2,%%xmm7 \n"
831 "shufps $0x88,%%xmm6,%%xmm2 \n"
832 "shufps $0xdd,%%xmm6,%%xmm7 \n"
833 "pavgb %%xmm7,%%xmm2 \n"
834 "movdqa %%xmm0,%%xmm1 \n"
835 "movdqa %%xmm2,%%xmm6 \n"
836 "pmaddubsw %%xmm4,%%xmm0 \n"
837 "pmaddubsw %%xmm4,%%xmm2 \n"
838 "pmaddubsw %%xmm3,%%xmm1 \n"
839 "pmaddubsw %%xmm3,%%xmm6 \n"
840 "phaddw %%xmm2,%%xmm0 \n"
841 "phaddw %%xmm6,%%xmm1 \n"
842 "psraw $0x8,%%xmm0 \n"
843 "psraw $0x8,%%xmm1 \n"
844 "packsswb %%xmm1,%%xmm0 \n"
845 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000846 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000847 "movlps %%xmm0,(%1) \n"
848 "movhps %%xmm0,(%1,%2,1) \n"
849 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000850 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000851 : "+r"(src_argb0), // %0
852 "+r"(dst_u), // %1
853 "+r"(dst_v), // %2
854 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000855 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000856 : "memory", "cc"
857#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000858 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000859#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000860 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000861}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000862
863void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
864 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000865 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000866 "movdqa %0,%%xmm4 \n"
867 "movdqa %1,%%xmm3 \n"
868 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000869 :
870 : "m"(kARGBToU), // %0
871 "m"(kARGBToV), // %1
872 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000873 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000874 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000875 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000876 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000877 "1: \n"
878 "movdqu (%0),%%xmm0 \n"
879 "movdqu 0x10(%0),%%xmm1 \n"
880 "movdqu 0x20(%0),%%xmm2 \n"
881 "movdqu 0x30(%0),%%xmm6 \n"
882 "movdqu (%0,%4,1),%%xmm7 \n"
883 "pavgb %%xmm7,%%xmm0 \n"
884 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
885 "pavgb %%xmm7,%%xmm1 \n"
886 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
887 "pavgb %%xmm7,%%xmm2 \n"
888 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
889 "pavgb %%xmm7,%%xmm6 \n"
890 "lea 0x40(%0),%0 \n"
891 "movdqa %%xmm0,%%xmm7 \n"
892 "shufps $0x88,%%xmm1,%%xmm0 \n"
893 "shufps $0xdd,%%xmm1,%%xmm7 \n"
894 "pavgb %%xmm7,%%xmm0 \n"
895 "movdqa %%xmm2,%%xmm7 \n"
896 "shufps $0x88,%%xmm6,%%xmm2 \n"
897 "shufps $0xdd,%%xmm6,%%xmm7 \n"
898 "pavgb %%xmm7,%%xmm2 \n"
899 "movdqa %%xmm0,%%xmm1 \n"
900 "movdqa %%xmm2,%%xmm6 \n"
901 "pmaddubsw %%xmm4,%%xmm0 \n"
902 "pmaddubsw %%xmm4,%%xmm2 \n"
903 "pmaddubsw %%xmm3,%%xmm1 \n"
904 "pmaddubsw %%xmm3,%%xmm6 \n"
905 "phaddw %%xmm2,%%xmm0 \n"
906 "phaddw %%xmm6,%%xmm1 \n"
907 "psraw $0x8,%%xmm0 \n"
908 "psraw $0x8,%%xmm1 \n"
909 "packsswb %%xmm1,%%xmm0 \n"
910 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000911 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000912 "movlps %%xmm0,(%1) \n"
913 "movhps %%xmm0,(%1,%2,1) \n"
914 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000915 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000916 : "+r"(src_argb0), // %0
917 "+r"(dst_u), // %1
918 "+r"(dst_v), // %2
919 "+rm"(width) // %3
920 : "r"(static_cast<intptr_t>(src_stride_argb))
921 : "memory", "cc"
922#if defined(__SSE2__)
923 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
924#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000925 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000926}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000927
fbarchard@google.com762c0502013-02-04 18:47:21 +0000928void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
929 int width) {
930 asm volatile (
931 "movdqa %0,%%xmm4 \n"
932 "movdqa %1,%%xmm3 \n"
933 "movdqa %2,%%xmm5 \n"
934 :
935 : "m"(kARGBToU), // %0
936 "m"(kARGBToV), // %1
937 "m"(kAddUV128) // %2
938 );
939 asm volatile (
940 "sub %1,%2 \n"
941 ".p2align 4 \n"
942 "1: \n"
943 "movdqa (%0),%%xmm0 \n"
944 "movdqa 0x10(%0),%%xmm1 \n"
945 "movdqa 0x20(%0),%%xmm2 \n"
946 "movdqa 0x30(%0),%%xmm6 \n"
947 "pmaddubsw %%xmm4,%%xmm0 \n"
948 "pmaddubsw %%xmm4,%%xmm1 \n"
949 "pmaddubsw %%xmm4,%%xmm2 \n"
950 "pmaddubsw %%xmm4,%%xmm6 \n"
951 "phaddw %%xmm1,%%xmm0 \n"
952 "phaddw %%xmm6,%%xmm2 \n"
953 "psrlw $0x8,%%xmm0 \n"
954 "psrlw $0x8,%%xmm2 \n"
955 "packuswb %%xmm2,%%xmm0 \n"
956 "paddb %%xmm5,%%xmm0 \n"
957 "sub $0x10,%3 \n"
958 "movdqa %%xmm0,(%1) \n"
959 "movdqa (%0),%%xmm0 \n"
960 "movdqa 0x10(%0),%%xmm1 \n"
961 "movdqa 0x20(%0),%%xmm2 \n"
962 "movdqa 0x30(%0),%%xmm6 \n"
963 "pmaddubsw %%xmm3,%%xmm0 \n"
964 "pmaddubsw %%xmm3,%%xmm1 \n"
965 "pmaddubsw %%xmm3,%%xmm2 \n"
966 "pmaddubsw %%xmm3,%%xmm6 \n"
967 "phaddw %%xmm1,%%xmm0 \n"
968 "phaddw %%xmm6,%%xmm2 \n"
969 "psrlw $0x8,%%xmm0 \n"
970 "psrlw $0x8,%%xmm2 \n"
971 "packuswb %%xmm2,%%xmm0 \n"
972 "paddb %%xmm5,%%xmm0 \n"
973 "lea 0x40(%0),%0 \n"
974 "movdqa %%xmm0,(%1,%2,1) \n"
975 "lea 0x10(%1),%1 \n"
976 "jg 1b \n"
977 : "+r"(src_argb), // %0
978 "+r"(dst_u), // %1
979 "+r"(dst_v), // %2
980 "+rm"(width) // %3
981 :
982 : "memory", "cc"
983#if defined(__SSE2__)
984 , "xmm0", "xmm1", "xmm2", "xmm6"
985#endif
986 );
987}
988
989void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
990 uint8* dst_v, int width) {
991 asm volatile (
992 "movdqa %0,%%xmm4 \n"
993 "movdqa %1,%%xmm3 \n"
994 "movdqa %2,%%xmm5 \n"
995 :
996 : "m"(kARGBToU), // %0
997 "m"(kARGBToV), // %1
998 "m"(kAddUV128) // %2
999 );
1000 asm volatile (
1001 "sub %1,%2 \n"
1002 ".p2align 4 \n"
1003 "1: \n"
1004 "movdqu (%0),%%xmm0 \n"
1005 "movdqu 0x10(%0),%%xmm1 \n"
1006 "movdqu 0x20(%0),%%xmm2 \n"
1007 "movdqu 0x30(%0),%%xmm6 \n"
1008 "pmaddubsw %%xmm4,%%xmm0 \n"
1009 "pmaddubsw %%xmm4,%%xmm1 \n"
1010 "pmaddubsw %%xmm4,%%xmm2 \n"
1011 "pmaddubsw %%xmm4,%%xmm6 \n"
1012 "phaddw %%xmm1,%%xmm0 \n"
1013 "phaddw %%xmm6,%%xmm2 \n"
1014 "psrlw $0x8,%%xmm0 \n"
1015 "psrlw $0x8,%%xmm2 \n"
1016 "packuswb %%xmm2,%%xmm0 \n"
1017 "paddb %%xmm5,%%xmm0 \n"
1018 "sub $0x10,%3 \n"
1019 "movdqu %%xmm0,(%1) \n"
1020 "movdqu (%0),%%xmm0 \n"
1021 "movdqu 0x10(%0),%%xmm1 \n"
1022 "movdqu 0x20(%0),%%xmm2 \n"
1023 "movdqu 0x30(%0),%%xmm6 \n"
1024 "pmaddubsw %%xmm3,%%xmm0 \n"
1025 "pmaddubsw %%xmm3,%%xmm1 \n"
1026 "pmaddubsw %%xmm3,%%xmm2 \n"
1027 "pmaddubsw %%xmm3,%%xmm6 \n"
1028 "phaddw %%xmm1,%%xmm0 \n"
1029 "phaddw %%xmm6,%%xmm2 \n"
1030 "psrlw $0x8,%%xmm0 \n"
1031 "psrlw $0x8,%%xmm2 \n"
1032 "packuswb %%xmm2,%%xmm0 \n"
1033 "paddb %%xmm5,%%xmm0 \n"
1034 "lea 0x40(%0),%0 \n"
1035 "movdqu %%xmm0,(%1,%2,1) \n"
1036 "lea 0x10(%1),%1 \n"
1037 "jg 1b \n"
1038 : "+r"(src_argb), // %0
1039 "+r"(dst_u), // %1
1040 "+r"(dst_v), // %2
1041 "+rm"(width) // %3
1042 :
1043 : "memory", "cc"
1044#if defined(__SSE2__)
1045 , "xmm0", "xmm1", "xmm2", "xmm6"
1046#endif
1047 );
1048}
1049
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001050void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1051 uint8* dst_u, uint8* dst_v, int width) {
1052 asm volatile (
1053 "movdqa %0,%%xmm4 \n"
1054 "movdqa %1,%%xmm3 \n"
1055 "movdqa %2,%%xmm5 \n"
1056 :
1057 : "m"(kARGBToU), // %0
1058 "m"(kARGBToV), // %1
1059 "m"(kAddUV128) // %2
1060 );
1061 asm volatile (
1062 "sub %1,%2 \n"
1063 ".p2align 4 \n"
1064 "1: \n"
1065 "movdqa (%0),%%xmm0 \n"
1066 "movdqa 0x10(%0),%%xmm1 \n"
1067 "movdqa 0x20(%0),%%xmm2 \n"
1068 "movdqa 0x30(%0),%%xmm6 \n"
1069 "lea 0x40(%0),%0 \n"
1070 "movdqa %%xmm0,%%xmm7 \n"
1071 "shufps $0x88,%%xmm1,%%xmm0 \n"
1072 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1073 "pavgb %%xmm7,%%xmm0 \n"
1074 "movdqa %%xmm2,%%xmm7 \n"
1075 "shufps $0x88,%%xmm6,%%xmm2 \n"
1076 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1077 "pavgb %%xmm7,%%xmm2 \n"
1078 "movdqa %%xmm0,%%xmm1 \n"
1079 "movdqa %%xmm2,%%xmm6 \n"
1080 "pmaddubsw %%xmm4,%%xmm0 \n"
1081 "pmaddubsw %%xmm4,%%xmm2 \n"
1082 "pmaddubsw %%xmm3,%%xmm1 \n"
1083 "pmaddubsw %%xmm3,%%xmm6 \n"
1084 "phaddw %%xmm2,%%xmm0 \n"
1085 "phaddw %%xmm6,%%xmm1 \n"
1086 "psraw $0x8,%%xmm0 \n"
1087 "psraw $0x8,%%xmm1 \n"
1088 "packsswb %%xmm1,%%xmm0 \n"
1089 "paddb %%xmm5,%%xmm0 \n"
1090 "sub $0x10,%3 \n"
1091 "movlps %%xmm0,(%1) \n"
1092 "movhps %%xmm0,(%1,%2,1) \n"
1093 "lea 0x8(%1),%1 \n"
1094 "jg 1b \n"
1095 : "+r"(src_argb0), // %0
1096 "+r"(dst_u), // %1
1097 "+r"(dst_v), // %2
1098 "+rm"(width) // %3
1099 :
1100 : "memory", "cc"
1101#if defined(__SSE2__)
1102 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1103#endif
1104 );
1105}
1106
1107void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
1108 uint8* dst_u, uint8* dst_v, int width) {
1109 asm volatile (
1110 "movdqa %0,%%xmm4 \n"
1111 "movdqa %1,%%xmm3 \n"
1112 "movdqa %2,%%xmm5 \n"
1113 :
1114 : "m"(kARGBToU), // %0
1115 "m"(kARGBToV), // %1
1116 "m"(kAddUV128) // %2
1117 );
1118 asm volatile (
1119 "sub %1,%2 \n"
1120 ".p2align 4 \n"
1121 "1: \n"
1122 "movdqu (%0),%%xmm0 \n"
1123 "movdqu 0x10(%0),%%xmm1 \n"
1124 "movdqu 0x20(%0),%%xmm2 \n"
1125 "movdqu 0x30(%0),%%xmm6 \n"
1126 "lea 0x40(%0),%0 \n"
1127 "movdqa %%xmm0,%%xmm7 \n"
1128 "shufps $0x88,%%xmm1,%%xmm0 \n"
1129 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1130 "pavgb %%xmm7,%%xmm0 \n"
1131 "movdqa %%xmm2,%%xmm7 \n"
1132 "shufps $0x88,%%xmm6,%%xmm2 \n"
1133 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1134 "pavgb %%xmm7,%%xmm2 \n"
1135 "movdqa %%xmm0,%%xmm1 \n"
1136 "movdqa %%xmm2,%%xmm6 \n"
1137 "pmaddubsw %%xmm4,%%xmm0 \n"
1138 "pmaddubsw %%xmm4,%%xmm2 \n"
1139 "pmaddubsw %%xmm3,%%xmm1 \n"
1140 "pmaddubsw %%xmm3,%%xmm6 \n"
1141 "phaddw %%xmm2,%%xmm0 \n"
1142 "phaddw %%xmm6,%%xmm1 \n"
1143 "psraw $0x8,%%xmm0 \n"
1144 "psraw $0x8,%%xmm1 \n"
1145 "packsswb %%xmm1,%%xmm0 \n"
1146 "paddb %%xmm5,%%xmm0 \n"
1147 "sub $0x10,%3 \n"
1148 "movlps %%xmm0,(%1) \n"
1149 "movhps %%xmm0,(%1,%2,1) \n"
1150 "lea 0x8(%1),%1 \n"
1151 "jg 1b \n"
1152 : "+r"(src_argb0), // %0
1153 "+r"(dst_u), // %1
1154 "+r"(dst_v), // %2
1155 "+rm"(width) // %3
1156 :
1157 : "memory", "cc"
1158#if defined(__SSE2__)
1159 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1160#endif
1161 );
1162}
1163
fbarchard@google.com714050a2012-02-17 22:59:56 +00001164void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001165 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001166 "movdqa %4,%%xmm5 \n"
1167 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001168 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001169 "1: \n"
1170 "movdqa (%0),%%xmm0 \n"
1171 "movdqa 0x10(%0),%%xmm1 \n"
1172 "movdqa 0x20(%0),%%xmm2 \n"
1173 "movdqa 0x30(%0),%%xmm3 \n"
1174 "pmaddubsw %%xmm4,%%xmm0 \n"
1175 "pmaddubsw %%xmm4,%%xmm1 \n"
1176 "pmaddubsw %%xmm4,%%xmm2 \n"
1177 "pmaddubsw %%xmm4,%%xmm3 \n"
1178 "lea 0x40(%0),%0 \n"
1179 "phaddw %%xmm1,%%xmm0 \n"
1180 "phaddw %%xmm3,%%xmm2 \n"
1181 "psrlw $0x7,%%xmm0 \n"
1182 "psrlw $0x7,%%xmm2 \n"
1183 "packuswb %%xmm2,%%xmm0 \n"
1184 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001185 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001186 "movdqa %%xmm0,(%1) \n"
1187 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001188 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001189 : "+r"(src_bgra), // %0
1190 "+r"(dst_y), // %1
1191 "+r"(pix) // %2
1192 : "m"(kBGRAToY), // %3
1193 "m"(kAddY16) // %4
1194 : "memory", "cc"
1195#if defined(__SSE2__)
1196 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001197#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +00001198 );
1199}
1200
1201void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001202 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001203 "movdqa %4,%%xmm5 \n"
1204 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001205 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001206 "1: \n"
1207 "movdqu (%0),%%xmm0 \n"
1208 "movdqu 0x10(%0),%%xmm1 \n"
1209 "movdqu 0x20(%0),%%xmm2 \n"
1210 "movdqu 0x30(%0),%%xmm3 \n"
1211 "pmaddubsw %%xmm4,%%xmm0 \n"
1212 "pmaddubsw %%xmm4,%%xmm1 \n"
1213 "pmaddubsw %%xmm4,%%xmm2 \n"
1214 "pmaddubsw %%xmm4,%%xmm3 \n"
1215 "lea 0x40(%0),%0 \n"
1216 "phaddw %%xmm1,%%xmm0 \n"
1217 "phaddw %%xmm3,%%xmm2 \n"
1218 "psrlw $0x7,%%xmm0 \n"
1219 "psrlw $0x7,%%xmm2 \n"
1220 "packuswb %%xmm2,%%xmm0 \n"
1221 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001222 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001223 "movdqu %%xmm0,(%1) \n"
1224 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001225 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001226 : "+r"(src_bgra), // %0
1227 "+r"(dst_y), // %1
1228 "+r"(pix) // %2
1229 : "m"(kBGRAToY), // %3
1230 "m"(kAddY16) // %4
1231 : "memory", "cc"
1232#if defined(__SSE2__)
1233 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1234#endif
1235 );
1236}
1237
1238void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1239 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001240 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001241 "movdqa %0,%%xmm4 \n"
1242 "movdqa %1,%%xmm3 \n"
1243 "movdqa %2,%%xmm5 \n"
1244 :
1245 : "m"(kBGRAToU), // %0
1246 "m"(kBGRAToV), // %1
1247 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001248 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001249 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001250 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001251 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001252 "1: \n"
1253 "movdqa (%0),%%xmm0 \n"
1254 "movdqa 0x10(%0),%%xmm1 \n"
1255 "movdqa 0x20(%0),%%xmm2 \n"
1256 "movdqa 0x30(%0),%%xmm6 \n"
1257 "pavgb (%0,%4,1),%%xmm0 \n"
1258 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1259 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1260 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1261 "lea 0x40(%0),%0 \n"
1262 "movdqa %%xmm0,%%xmm7 \n"
1263 "shufps $0x88,%%xmm1,%%xmm0 \n"
1264 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1265 "pavgb %%xmm7,%%xmm0 \n"
1266 "movdqa %%xmm2,%%xmm7 \n"
1267 "shufps $0x88,%%xmm6,%%xmm2 \n"
1268 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1269 "pavgb %%xmm7,%%xmm2 \n"
1270 "movdqa %%xmm0,%%xmm1 \n"
1271 "movdqa %%xmm2,%%xmm6 \n"
1272 "pmaddubsw %%xmm4,%%xmm0 \n"
1273 "pmaddubsw %%xmm4,%%xmm2 \n"
1274 "pmaddubsw %%xmm3,%%xmm1 \n"
1275 "pmaddubsw %%xmm3,%%xmm6 \n"
1276 "phaddw %%xmm2,%%xmm0 \n"
1277 "phaddw %%xmm6,%%xmm1 \n"
1278 "psraw $0x8,%%xmm0 \n"
1279 "psraw $0x8,%%xmm1 \n"
1280 "packsswb %%xmm1,%%xmm0 \n"
1281 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001282 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001283 "movlps %%xmm0,(%1) \n"
1284 "movhps %%xmm0,(%1,%2,1) \n"
1285 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001286 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001287 : "+r"(src_bgra0), // %0
1288 "+r"(dst_u), // %1
1289 "+r"(dst_v), // %2
1290 "+rm"(width) // %3
1291 : "r"(static_cast<intptr_t>(src_stride_bgra))
1292 : "memory", "cc"
1293#if defined(__SSE2__)
1294 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1295#endif
1296 );
1297}
1298
1299void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1300 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001301 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001302 "movdqa %0,%%xmm4 \n"
1303 "movdqa %1,%%xmm3 \n"
1304 "movdqa %2,%%xmm5 \n"
1305 :
1306 : "m"(kBGRAToU), // %0
1307 "m"(kBGRAToV), // %1
1308 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001309 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001310 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001311 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001312 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001313 "1: \n"
1314 "movdqu (%0),%%xmm0 \n"
1315 "movdqu 0x10(%0),%%xmm1 \n"
1316 "movdqu 0x20(%0),%%xmm2 \n"
1317 "movdqu 0x30(%0),%%xmm6 \n"
1318 "movdqu (%0,%4,1),%%xmm7 \n"
1319 "pavgb %%xmm7,%%xmm0 \n"
1320 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1321 "pavgb %%xmm7,%%xmm1 \n"
1322 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1323 "pavgb %%xmm7,%%xmm2 \n"
1324 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1325 "pavgb %%xmm7,%%xmm6 \n"
1326 "lea 0x40(%0),%0 \n"
1327 "movdqa %%xmm0,%%xmm7 \n"
1328 "shufps $0x88,%%xmm1,%%xmm0 \n"
1329 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1330 "pavgb %%xmm7,%%xmm0 \n"
1331 "movdqa %%xmm2,%%xmm7 \n"
1332 "shufps $0x88,%%xmm6,%%xmm2 \n"
1333 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1334 "pavgb %%xmm7,%%xmm2 \n"
1335 "movdqa %%xmm0,%%xmm1 \n"
1336 "movdqa %%xmm2,%%xmm6 \n"
1337 "pmaddubsw %%xmm4,%%xmm0 \n"
1338 "pmaddubsw %%xmm4,%%xmm2 \n"
1339 "pmaddubsw %%xmm3,%%xmm1 \n"
1340 "pmaddubsw %%xmm3,%%xmm6 \n"
1341 "phaddw %%xmm2,%%xmm0 \n"
1342 "phaddw %%xmm6,%%xmm1 \n"
1343 "psraw $0x8,%%xmm0 \n"
1344 "psraw $0x8,%%xmm1 \n"
1345 "packsswb %%xmm1,%%xmm0 \n"
1346 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001347 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001348 "movlps %%xmm0,(%1) \n"
1349 "movhps %%xmm0,(%1,%2,1) \n"
1350 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001351 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001352 : "+r"(src_bgra0), // %0
1353 "+r"(dst_u), // %1
1354 "+r"(dst_v), // %2
1355 "+rm"(width) // %3
1356 : "r"(static_cast<intptr_t>(src_stride_bgra))
1357 : "memory", "cc"
1358#if defined(__SSE2__)
1359 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1360#endif
1361 );
1362}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001363
1364void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001365 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001366 "movdqa %4,%%xmm5 \n"
1367 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001368 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001369 "1: \n"
1370 "movdqa (%0),%%xmm0 \n"
1371 "movdqa 0x10(%0),%%xmm1 \n"
1372 "movdqa 0x20(%0),%%xmm2 \n"
1373 "movdqa 0x30(%0),%%xmm3 \n"
1374 "pmaddubsw %%xmm4,%%xmm0 \n"
1375 "pmaddubsw %%xmm4,%%xmm1 \n"
1376 "pmaddubsw %%xmm4,%%xmm2 \n"
1377 "pmaddubsw %%xmm4,%%xmm3 \n"
1378 "lea 0x40(%0),%0 \n"
1379 "phaddw %%xmm1,%%xmm0 \n"
1380 "phaddw %%xmm3,%%xmm2 \n"
1381 "psrlw $0x7,%%xmm0 \n"
1382 "psrlw $0x7,%%xmm2 \n"
1383 "packuswb %%xmm2,%%xmm0 \n"
1384 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001385 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001386 "movdqa %%xmm0,(%1) \n"
1387 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001388 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001389 : "+r"(src_abgr), // %0
1390 "+r"(dst_y), // %1
1391 "+r"(pix) // %2
1392 : "m"(kABGRToY), // %3
1393 "m"(kAddY16) // %4
1394 : "memory", "cc"
1395#if defined(__SSE2__)
1396 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1397#endif
1398 );
1399}
1400
1401void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001402 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001403 "movdqa %4,%%xmm5 \n"
1404 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001405 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001406 "1: \n"
1407 "movdqu (%0),%%xmm0 \n"
1408 "movdqu 0x10(%0),%%xmm1 \n"
1409 "movdqu 0x20(%0),%%xmm2 \n"
1410 "movdqu 0x30(%0),%%xmm3 \n"
1411 "pmaddubsw %%xmm4,%%xmm0 \n"
1412 "pmaddubsw %%xmm4,%%xmm1 \n"
1413 "pmaddubsw %%xmm4,%%xmm2 \n"
1414 "pmaddubsw %%xmm4,%%xmm3 \n"
1415 "lea 0x40(%0),%0 \n"
1416 "phaddw %%xmm1,%%xmm0 \n"
1417 "phaddw %%xmm3,%%xmm2 \n"
1418 "psrlw $0x7,%%xmm0 \n"
1419 "psrlw $0x7,%%xmm2 \n"
1420 "packuswb %%xmm2,%%xmm0 \n"
1421 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001422 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001423 "movdqu %%xmm0,(%1) \n"
1424 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001425 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001426 : "+r"(src_abgr), // %0
1427 "+r"(dst_y), // %1
1428 "+r"(pix) // %2
1429 : "m"(kABGRToY), // %3
1430 "m"(kAddY16) // %4
1431 : "memory", "cc"
1432#if defined(__SSE2__)
1433 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1434#endif
1435 );
1436}
1437
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001438void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1439 asm volatile (
1440 "movdqa %4,%%xmm5 \n"
1441 "movdqa %3,%%xmm4 \n"
1442 ".p2align 4 \n"
1443 "1: \n"
1444 "movdqa (%0),%%xmm0 \n"
1445 "movdqa 0x10(%0),%%xmm1 \n"
1446 "movdqa 0x20(%0),%%xmm2 \n"
1447 "movdqa 0x30(%0),%%xmm3 \n"
1448 "pmaddubsw %%xmm4,%%xmm0 \n"
1449 "pmaddubsw %%xmm4,%%xmm1 \n"
1450 "pmaddubsw %%xmm4,%%xmm2 \n"
1451 "pmaddubsw %%xmm4,%%xmm3 \n"
1452 "lea 0x40(%0),%0 \n"
1453 "phaddw %%xmm1,%%xmm0 \n"
1454 "phaddw %%xmm3,%%xmm2 \n"
1455 "psrlw $0x7,%%xmm0 \n"
1456 "psrlw $0x7,%%xmm2 \n"
1457 "packuswb %%xmm2,%%xmm0 \n"
1458 "paddb %%xmm5,%%xmm0 \n"
1459 "sub $0x10,%2 \n"
1460 "movdqa %%xmm0,(%1) \n"
1461 "lea 0x10(%1),%1 \n"
1462 "jg 1b \n"
1463 : "+r"(src_rgba), // %0
1464 "+r"(dst_y), // %1
1465 "+r"(pix) // %2
1466 : "m"(kRGBAToY), // %3
1467 "m"(kAddY16) // %4
1468 : "memory", "cc"
1469#if defined(__SSE2__)
1470 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1471#endif
1472 );
1473}
1474
1475void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1476 asm volatile (
1477 "movdqa %4,%%xmm5 \n"
1478 "movdqa %3,%%xmm4 \n"
1479 ".p2align 4 \n"
1480 "1: \n"
1481 "movdqu (%0),%%xmm0 \n"
1482 "movdqu 0x10(%0),%%xmm1 \n"
1483 "movdqu 0x20(%0),%%xmm2 \n"
1484 "movdqu 0x30(%0),%%xmm3 \n"
1485 "pmaddubsw %%xmm4,%%xmm0 \n"
1486 "pmaddubsw %%xmm4,%%xmm1 \n"
1487 "pmaddubsw %%xmm4,%%xmm2 \n"
1488 "pmaddubsw %%xmm4,%%xmm3 \n"
1489 "lea 0x40(%0),%0 \n"
1490 "phaddw %%xmm1,%%xmm0 \n"
1491 "phaddw %%xmm3,%%xmm2 \n"
1492 "psrlw $0x7,%%xmm0 \n"
1493 "psrlw $0x7,%%xmm2 \n"
1494 "packuswb %%xmm2,%%xmm0 \n"
1495 "paddb %%xmm5,%%xmm0 \n"
1496 "sub $0x10,%2 \n"
1497 "movdqu %%xmm0,(%1) \n"
1498 "lea 0x10(%1),%1 \n"
1499 "jg 1b \n"
1500 : "+r"(src_rgba), // %0
1501 "+r"(dst_y), // %1
1502 "+r"(pix) // %2
1503 : "m"(kRGBAToY), // %3
1504 "m"(kAddY16) // %4
1505 : "memory", "cc"
1506#if defined(__SSE2__)
1507 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1508#endif
1509 );
1510}
1511
fbarchard@google.com714050a2012-02-17 22:59:56 +00001512void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1513 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001514 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001515 "movdqa %0,%%xmm4 \n"
1516 "movdqa %1,%%xmm3 \n"
1517 "movdqa %2,%%xmm5 \n"
1518 :
1519 : "m"(kABGRToU), // %0
1520 "m"(kABGRToV), // %1
1521 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001522 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001523 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001524 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001525 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001526 "1: \n"
1527 "movdqa (%0),%%xmm0 \n"
1528 "movdqa 0x10(%0),%%xmm1 \n"
1529 "movdqa 0x20(%0),%%xmm2 \n"
1530 "movdqa 0x30(%0),%%xmm6 \n"
1531 "pavgb (%0,%4,1),%%xmm0 \n"
1532 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1533 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1534 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1535 "lea 0x40(%0),%0 \n"
1536 "movdqa %%xmm0,%%xmm7 \n"
1537 "shufps $0x88,%%xmm1,%%xmm0 \n"
1538 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1539 "pavgb %%xmm7,%%xmm0 \n"
1540 "movdqa %%xmm2,%%xmm7 \n"
1541 "shufps $0x88,%%xmm6,%%xmm2 \n"
1542 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1543 "pavgb %%xmm7,%%xmm2 \n"
1544 "movdqa %%xmm0,%%xmm1 \n"
1545 "movdqa %%xmm2,%%xmm6 \n"
1546 "pmaddubsw %%xmm4,%%xmm0 \n"
1547 "pmaddubsw %%xmm4,%%xmm2 \n"
1548 "pmaddubsw %%xmm3,%%xmm1 \n"
1549 "pmaddubsw %%xmm3,%%xmm6 \n"
1550 "phaddw %%xmm2,%%xmm0 \n"
1551 "phaddw %%xmm6,%%xmm1 \n"
1552 "psraw $0x8,%%xmm0 \n"
1553 "psraw $0x8,%%xmm1 \n"
1554 "packsswb %%xmm1,%%xmm0 \n"
1555 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001556 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001557 "movlps %%xmm0,(%1) \n"
1558 "movhps %%xmm0,(%1,%2,1) \n"
1559 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001560 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001561 : "+r"(src_abgr0), // %0
1562 "+r"(dst_u), // %1
1563 "+r"(dst_v), // %2
1564 "+rm"(width) // %3
1565 : "r"(static_cast<intptr_t>(src_stride_abgr))
1566 : "memory", "cc"
1567#if defined(__SSE2__)
1568 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1569#endif
1570 );
1571}
1572
1573void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1574 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001575 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001576 "movdqa %0,%%xmm4 \n"
1577 "movdqa %1,%%xmm3 \n"
1578 "movdqa %2,%%xmm5 \n"
1579 :
1580 : "m"(kABGRToU), // %0
1581 "m"(kABGRToV), // %1
1582 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001583 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001584 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001585 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001586 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001587 "1: \n"
1588 "movdqu (%0),%%xmm0 \n"
1589 "movdqu 0x10(%0),%%xmm1 \n"
1590 "movdqu 0x20(%0),%%xmm2 \n"
1591 "movdqu 0x30(%0),%%xmm6 \n"
1592 "movdqu (%0,%4,1),%%xmm7 \n"
1593 "pavgb %%xmm7,%%xmm0 \n"
1594 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1595 "pavgb %%xmm7,%%xmm1 \n"
1596 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1597 "pavgb %%xmm7,%%xmm2 \n"
1598 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1599 "pavgb %%xmm7,%%xmm6 \n"
1600 "lea 0x40(%0),%0 \n"
1601 "movdqa %%xmm0,%%xmm7 \n"
1602 "shufps $0x88,%%xmm1,%%xmm0 \n"
1603 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1604 "pavgb %%xmm7,%%xmm0 \n"
1605 "movdqa %%xmm2,%%xmm7 \n"
1606 "shufps $0x88,%%xmm6,%%xmm2 \n"
1607 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1608 "pavgb %%xmm7,%%xmm2 \n"
1609 "movdqa %%xmm0,%%xmm1 \n"
1610 "movdqa %%xmm2,%%xmm6 \n"
1611 "pmaddubsw %%xmm4,%%xmm0 \n"
1612 "pmaddubsw %%xmm4,%%xmm2 \n"
1613 "pmaddubsw %%xmm3,%%xmm1 \n"
1614 "pmaddubsw %%xmm3,%%xmm6 \n"
1615 "phaddw %%xmm2,%%xmm0 \n"
1616 "phaddw %%xmm6,%%xmm1 \n"
1617 "psraw $0x8,%%xmm0 \n"
1618 "psraw $0x8,%%xmm1 \n"
1619 "packsswb %%xmm1,%%xmm0 \n"
1620 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001621 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001622 "movlps %%xmm0,(%1) \n"
1623 "movhps %%xmm0,(%1,%2,1) \n"
1624 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001625 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001626 : "+r"(src_abgr0), // %0
1627 "+r"(dst_u), // %1
1628 "+r"(dst_v), // %2
1629 "+rm"(width) // %3
1630 : "r"(static_cast<intptr_t>(src_stride_abgr))
1631 : "memory", "cc"
1632#if defined(__SSE2__)
1633 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1634#endif
1635 );
1636}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001637
1638void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1639 uint8* dst_u, uint8* dst_v, int width) {
1640 asm volatile (
1641 "movdqa %0,%%xmm4 \n"
1642 "movdqa %1,%%xmm3 \n"
1643 "movdqa %2,%%xmm5 \n"
1644 :
1645 : "m"(kRGBAToU), // %0
1646 "m"(kRGBAToV), // %1
1647 "m"(kAddUV128) // %2
1648 );
1649 asm volatile (
1650 "sub %1,%2 \n"
1651 ".p2align 4 \n"
1652 "1: \n"
1653 "movdqa (%0),%%xmm0 \n"
1654 "movdqa 0x10(%0),%%xmm1 \n"
1655 "movdqa 0x20(%0),%%xmm2 \n"
1656 "movdqa 0x30(%0),%%xmm6 \n"
1657 "pavgb (%0,%4,1),%%xmm0 \n"
1658 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1659 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1660 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1661 "lea 0x40(%0),%0 \n"
1662 "movdqa %%xmm0,%%xmm7 \n"
1663 "shufps $0x88,%%xmm1,%%xmm0 \n"
1664 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1665 "pavgb %%xmm7,%%xmm0 \n"
1666 "movdqa %%xmm2,%%xmm7 \n"
1667 "shufps $0x88,%%xmm6,%%xmm2 \n"
1668 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1669 "pavgb %%xmm7,%%xmm2 \n"
1670 "movdqa %%xmm0,%%xmm1 \n"
1671 "movdqa %%xmm2,%%xmm6 \n"
1672 "pmaddubsw %%xmm4,%%xmm0 \n"
1673 "pmaddubsw %%xmm4,%%xmm2 \n"
1674 "pmaddubsw %%xmm3,%%xmm1 \n"
1675 "pmaddubsw %%xmm3,%%xmm6 \n"
1676 "phaddw %%xmm2,%%xmm0 \n"
1677 "phaddw %%xmm6,%%xmm1 \n"
1678 "psraw $0x8,%%xmm0 \n"
1679 "psraw $0x8,%%xmm1 \n"
1680 "packsswb %%xmm1,%%xmm0 \n"
1681 "paddb %%xmm5,%%xmm0 \n"
1682 "sub $0x10,%3 \n"
1683 "movlps %%xmm0,(%1) \n"
1684 "movhps %%xmm0,(%1,%2,1) \n"
1685 "lea 0x8(%1),%1 \n"
1686 "jg 1b \n"
1687 : "+r"(src_rgba0), // %0
1688 "+r"(dst_u), // %1
1689 "+r"(dst_v), // %2
1690 "+rm"(width) // %3
1691 : "r"(static_cast<intptr_t>(src_stride_rgba))
1692 : "memory", "cc"
1693#if defined(__SSE2__)
1694 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1695#endif
1696 );
1697}
1698
1699void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1700 uint8* dst_u, uint8* dst_v, int width) {
1701 asm volatile (
1702 "movdqa %0,%%xmm4 \n"
1703 "movdqa %1,%%xmm3 \n"
1704 "movdqa %2,%%xmm5 \n"
1705 :
1706 : "m"(kRGBAToU), // %0
1707 "m"(kRGBAToV), // %1
1708 "m"(kAddUV128) // %2
1709 );
1710 asm volatile (
1711 "sub %1,%2 \n"
1712 ".p2align 4 \n"
1713 "1: \n"
1714 "movdqu (%0),%%xmm0 \n"
1715 "movdqu 0x10(%0),%%xmm1 \n"
1716 "movdqu 0x20(%0),%%xmm2 \n"
1717 "movdqu 0x30(%0),%%xmm6 \n"
1718 "movdqu (%0,%4,1),%%xmm7 \n"
1719 "pavgb %%xmm7,%%xmm0 \n"
1720 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1721 "pavgb %%xmm7,%%xmm1 \n"
1722 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1723 "pavgb %%xmm7,%%xmm2 \n"
1724 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1725 "pavgb %%xmm7,%%xmm6 \n"
1726 "lea 0x40(%0),%0 \n"
1727 "movdqa %%xmm0,%%xmm7 \n"
1728 "shufps $0x88,%%xmm1,%%xmm0 \n"
1729 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1730 "pavgb %%xmm7,%%xmm0 \n"
1731 "movdqa %%xmm2,%%xmm7 \n"
1732 "shufps $0x88,%%xmm6,%%xmm2 \n"
1733 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1734 "pavgb %%xmm7,%%xmm2 \n"
1735 "movdqa %%xmm0,%%xmm1 \n"
1736 "movdqa %%xmm2,%%xmm6 \n"
1737 "pmaddubsw %%xmm4,%%xmm0 \n"
1738 "pmaddubsw %%xmm4,%%xmm2 \n"
1739 "pmaddubsw %%xmm3,%%xmm1 \n"
1740 "pmaddubsw %%xmm3,%%xmm6 \n"
1741 "phaddw %%xmm2,%%xmm0 \n"
1742 "phaddw %%xmm6,%%xmm1 \n"
1743 "psraw $0x8,%%xmm0 \n"
1744 "psraw $0x8,%%xmm1 \n"
1745 "packsswb %%xmm1,%%xmm0 \n"
1746 "paddb %%xmm5,%%xmm0 \n"
1747 "sub $0x10,%3 \n"
1748 "movlps %%xmm0,(%1) \n"
1749 "movhps %%xmm0,(%1,%2,1) \n"
1750 "lea 0x8(%1),%1 \n"
1751 "jg 1b \n"
1752 : "+r"(src_rgba0), // %0
1753 "+r"(dst_u), // %1
1754 "+r"(dst_v), // %2
1755 "+rm"(width) // %3
1756 : "r"(static_cast<intptr_t>(src_stride_rgba))
1757 : "memory", "cc"
1758#if defined(__SSE2__)
1759 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1760#endif
1761 );
1762}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001763#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001764
fbarchard@google.come214fe32012-06-04 23:47:11 +00001765#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients in fixed point with 6 fractional bits (scaled by
// 64), sized to fit signed bytes. Every value is parenthesised so that it
// expands safely inside a larger expression.
#define UB (127) /* 2.018 * 64 = 129 does not fit int8; clamped to 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR (0)

#define VB (0)
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR (102) /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the coefficient sums for U = V = 128, subtracted after the multiply
// so that U and V are effectively centred on zero.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG (74) /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001780
// Constant table for the YUVTORGB / YVUTORGB macros. The macros receive the
// address of the first member and reach every other member by its byte
// offset, so the member order and sizes must match the offsets noted here.
struct {
  vec8 kUVToB;      // offset 0:   U,V byte pairs -> blue
  vec8 kUVToG;      // offset 16:  U,V byte pairs -> green
  vec8 kUVToR;      // offset 32:  U,V byte pairs -> red
  vec16 kUVBiasB;   // offset 48:  blue bias subtracted after the multiply
  vec16 kUVBiasG;   // offset 64:  green bias
  vec16 kUVBiasR;   // offset 80:  red bias
  vec16 kYSub16;    // offset 96:  16, subtracted from every Y sample
  vec16 kYToRgb;    // offset 112: Y gain
  vec8 kVUToB;      // offset 128: same as kUVToB with U and V swapped
  vec8 kVUToG;      // offset 144: same as kUVToG with U and V swapped
  vec8 kVUToR;      // offset 160: same as kUVToR with U and V swapped
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG },
  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
};
1806
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001807
// Read 8 UV from 444.
// Loads 8 U bytes from u_buf and 8 V bytes from u_buf + v_buf (the functions
// using this macro first turn v_buf into an offset from u_buf), advances
// u_buf by 8 and interleaves the bytes so xmm0 = U0 V0 U1 V1 ... U7 V7.
// Overwrites xmm1.
#define READYUV444 \
    "movq (%[u_buf]),%%xmm0 \n" \
    "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x8(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001814
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 U bytes from u_buf and 4 V bytes from u_buf + v_buf (v_buf holds
// the offset from u_buf), advances u_buf by 4, interleaves them into 4 UV
// pairs and duplicates each pair so xmm0 = U0 V0 U0 V0 U1 V1 U1 V1 ...
// Overwrites xmm1.
#define READYUV422 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x4(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
fbarchard@google.comb6149762011-11-07 21:58:52 +00001822
// Read 2 UV from 411, upsample to 8 UV.
// Interleaves U and V bytes, then duplicates each UV pair twice over so that
// xmm0 holds U0 V0 four times followed by U1 V1 four times. u_buf advances by
// 2 (v_buf holds the offset from u_buf). Overwrites xmm1.
// NOTE(review): movd loads 4 bytes from each plane although only the first 2
// are used per pass, so the last pass reads 2 bytes beyond the samples it
// consumes - confirm the chroma rows always have that much readable slack.
#define READYUV411 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x2(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n" \
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001831
// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 bytes (4 already-interleaved chroma pairs) from uv_buf, advances
// uv_buf by 8 and duplicates each 2-byte pair so xmm0 holds 8 pairs.
#define READNV12 \
    "movq (%[uv_buf]),%%xmm0 \n" \
    "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001837
// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 U,V byte pairs; xmm4 = zero (used to widen Y to 16 bits);
//      %[kYuvConstants] = address of the first member of kYuvConstants;
//      %[y_buf] = pointer to 8 Y bytes.
// Out: low 8 bytes of xmm0 = blue, xmm1 = green, xmm2 = red, each saturated
//      to unsigned bytes. xmm3 is overwritten and y_buf advances by 8.
// Steps: multiply the UV pairs by the per-channel coefficients (offsets 0,
// 16, 32), subtract the per-channel bias (48, 64, 80), add (Y - 16) * gain
// (96, 112), then shift right by 6 to drop the fixed-point fraction.
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n" \
1862
// Convert 8 pixels: 8 VU and 8 Y.
// Same as YUVTORGB except that the chroma multiply uses the swapped
// coefficient rows at offsets 128, 144 and 160 of kYuvConstants, for input
// whose chroma pairs are stored V first.
// In:  xmm0 = 8 V,U byte pairs; xmm4 = zero; %[kYuvConstants] and %[y_buf]
//      as for YUVTORGB.
// Out: low 8 bytes of xmm0 = blue, xmm1 = green, xmm2 = red. xmm3 is
//      overwritten and y_buf advances by 8.
#define YVUTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001887
// Converts one row of I444 (one U and one V sample per pixel) to ARGB.
// Each pass of the loop consumes 8 Y, 8 U and 8 V bytes and stores 8 pixels
// (32 bytes), each as the bytes B, G, R, 0xff. The stores use movdqa, so
// dst_argb must be 16-byte aligned. The loop body runs before width is
// tested; width drops by 8 per pass and the loop repeats while it is still
// positive.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // v_buf becomes the offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YUVTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV444
    YUVTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1923
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001924void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1925 const uint8* u_buf,
1926 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001927 uint8* dst_rgb24,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001928 int width) {
1929// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
1930#ifdef __APPLE__
1931 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001932 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1933 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1934 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1935 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001936#endif
1937
1938 asm volatile (
1939#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001940 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1941 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001942#endif
1943 "sub %[u_buf],%[v_buf] \n"
1944 "pxor %%xmm4,%%xmm4 \n"
1945 ".p2align 4 \n"
1946 "1: \n"
1947 READYUV422
1948 YUVTORGB
1949 "punpcklbw %%xmm1,%%xmm0 \n"
1950 "punpcklbw %%xmm2,%%xmm2 \n"
1951 "movdqa %%xmm0,%%xmm1 \n"
1952 "punpcklwd %%xmm2,%%xmm0 \n"
1953 "punpckhwd %%xmm2,%%xmm1 \n"
1954 "pshufb %%xmm5,%%xmm0 \n"
1955 "pshufb %%xmm6,%%xmm1 \n"
1956 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001957 "movq %%xmm0,(%[dst_rgb24]) \n"
1958 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
1959 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001960 "sub $0x8,%[width] \n"
1961 "jg 1b \n"
1962 : [y_buf]"+r"(y_buf), // %[y_buf]
1963 [u_buf]"+r"(u_buf), // %[u_buf]
1964 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001965 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001966 [width]"+rm"(width) // %[width]
1967 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1968#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001969 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1970 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001971#endif
1972 : "memory", "cc"
1973#if defined(__SSE2__)
1974 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1975#endif
1976 );
1977}
1978
1979void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1980 const uint8* u_buf,
1981 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001982 uint8* dst_raw,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001983 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001984// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001985#ifdef __APPLE__
1986 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001987 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1988 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1989 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1990 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001991#endif
1992
1993 asm volatile (
1994#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001995 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1996 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001997#endif
1998 "sub %[u_buf],%[v_buf] \n"
1999 "pxor %%xmm4,%%xmm4 \n"
2000 ".p2align 4 \n"
2001 "1: \n"
2002 READYUV422
2003 YUVTORGB
2004 "punpcklbw %%xmm1,%%xmm0 \n"
2005 "punpcklbw %%xmm2,%%xmm2 \n"
2006 "movdqa %%xmm0,%%xmm1 \n"
2007 "punpcklwd %%xmm2,%%xmm0 \n"
2008 "punpckhwd %%xmm2,%%xmm1 \n"
2009 "pshufb %%xmm5,%%xmm0 \n"
2010 "pshufb %%xmm6,%%xmm1 \n"
2011 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002012 "movq %%xmm0,(%[dst_raw]) \n"
2013 "movdqu %%xmm1,0x8(%[dst_raw]) \n"
2014 "lea 0x18(%[dst_raw]),%[dst_raw] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002015 "sub $0x8,%[width] \n"
2016 "jg 1b \n"
2017 : [y_buf]"+r"(y_buf), // %[y_buf]
2018 [u_buf]"+r"(u_buf), // %[u_buf]
2019 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002020 [dst_raw]"+r"(dst_raw), // %[dst_raw]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002021 [width]"+rm"(width) // %[width]
2022 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
2023#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002024 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
2025 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002026#endif
2027 : "memory", "cc"
2028#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002029 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002030#endif
2031 );
2032}
2033
// Converts one row of I422 (one U and one V sample per pair of pixels) to
// ARGB.
// Each pass of the loop consumes 8 Y, 4 U and 4 V bytes and stores 8 pixels
// (32 bytes), each as the bytes B, G, R, 0xff. The stores use movdqa, so
// dst_argb must be 16-byte aligned. The loop body runs before width is
// tested; width drops by 8 per pass and the loop repeats while it is still
// positive.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // v_buf becomes the offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YUVTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV422
    YUVTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2069
// Converts one row of I411 (one U and one V sample per run of 4 pixels) to
// ARGB.
// Each pass of the loop consumes 8 Y bytes, advances the chroma pointer by 2
// and stores 8 pixels (32 bytes), each as the bytes B, G, R, 0xff. The stores
// use movdqa, so dst_argb must be 16-byte aligned. The loop body runs before
// width is tested; width drops by 8 per pass and the loop repeats while it is
// still positive.
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // v_buf becomes the offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YUVTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV411
    YUVTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2105
// Converts one row of NV12 (a Y plane plus one plane of interleaved chroma
// pairs, one pair per two pixels) to ARGB.
// Each pass of the loop consumes 8 Y bytes and 8 chroma bytes and stores 8
// pixels (32 bytes), each as the bytes B, G, R, 0xff. The stores use movdqa,
// so dst_argb must be 16-byte aligned. The loop body runs before width is
// tested; width drops by 8 per pass and the loop repeats while it is still
// positive.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YUVTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READNV12
    YUVTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2138
// Converts one row of NV21 (a Y plane plus one plane of interleaved chroma
// pairs, one pair per two pixels) to ARGB.
// The chroma bytes are loaded with the same READNV12 macro as NV12 but
// converted with YVUTORGB, which applies the coefficient rows that have U
// and V swapped; the parameter is still named uv_buf although the data is
// presumably stored V first.
// Each pass of the loop consumes 8 Y bytes and 8 chroma bytes and stores 8
// pixels (32 bytes), each as the bytes B, G, R, 0xff. The stores use movdqa,
// so dst_argb must be 16-byte aligned. The loop body runs before width is
// tested; width drops by 8 per pass and the loop repeats while it is still
// positive.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YVUTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READNV12
    YVUTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2171
// Converts one row of I444 (one U and one V sample per pixel) to ARGB,
// storing with movdqu so dst_argb needs no alignment.
// Each pass of the loop consumes 8 Y, 8 U and 8 V bytes and stores 8 pixels
// (32 bytes), each as the bytes B, G, R, 0xff. The loop body runs before
// width is tested; width drops by 8 per pass and the loop repeats while it is
// still positive.
void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // v_buf becomes the offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YUVTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV444
    YUVTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2207
// Converts one row of I422 (one U and one V sample per pair of pixels) to
// ARGB, storing with movdqu so dst_argb needs no alignment.
// Each pass of the loop consumes 8 Y, 4 U and 4 V bytes and stores 8 pixels
// (32 bytes), each as the bytes B, G, R, 0xff. The loop body runs before
// width is tested; width drops by 8 per pass and the loop repeats while it is
// still positive.
void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // v_buf becomes the offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YUVTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV422
    YUVTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2243
// Converts one row of I411 (one U and one V sample per run of 4 pixels) to
// ARGB, storing with movdqu so dst_argb needs no alignment.
// Each pass of the loop consumes 8 Y bytes, advances the chroma pointer by 2
// and stores 8 pixels (32 bytes), each as the bytes B, G, R, 0xff. The loop
// body runs before width is tested; width drops by 8 per pass and the loop
// repeats while it is still positive.
void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // v_buf becomes the offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YUVTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV411
    YUVTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2279
// Converts one row of NV12 (a Y plane plus one plane of interleaved chroma
// pairs, one pair per two pixels) to ARGB, storing with movdqu so dst_argb
// needs no alignment.
// Each pass of the loop consumes 8 Y bytes and 8 chroma bytes and stores 8
// pixels (32 bytes), each as the bytes B, G, R, 0xff. The loop body runs
// before width is tested; width drops by 8 per pass and the loop repeats
// while it is still positive.
void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YUVTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READNV12
    YUVTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2312
// Converts one row of NV21 (a Y plane plus one plane of interleaved chroma
// pairs, one pair per two pixels) to ARGB, storing with movdqu so dst_argb
// needs no alignment.
// The chroma bytes are loaded with the same READNV12 macro as NV12 but
// converted with YVUTORGB, which applies the coefficient rows that have U
// and V swapped; the parameter is still named uv_buf although the data is
// presumably stored V first.
// Each pass of the loop consumes 8 Y bytes and 8 chroma bytes and stores 8
// pixels (32 bytes), each as the bytes B, G, R, 0xff. The loop body runs
// before width is tested; width drops by 8 per pass and the loop repeats
// while it is still positive.
void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YVUTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READNV12
    YVUTORGB
    // Weave blue (xmm0), green (xmm1), red (xmm2) and alpha (xmm5) into
    // B,G,R,A pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2345
// Converts one row of I422 (one U and one V sample per pair of pixels) to
// BGRA.
// Each pass of the loop consumes 8 Y, 4 U and 4 V bytes and stores 8 pixels
// (32 bytes), each as the bytes 0xff, R, G, B. The stores use movdqa, so
// dst_bgra must be 16-byte aligned. The loop body runs before width is
// tested; width drops by 8 per pass and the loop repeats while it is still
// positive.
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_bgra,
                                int width) {
  asm volatile (
    // v_buf becomes the offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm4 = zero, required by YUVTORGB. xmm5 is set again inside the loop.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV422
    YUVTORGB
    // xmm5 is used as a destination below, so reload it with all ones
    // (alpha bytes) on every pass.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    // Weave alpha (xmm5), red (xmm2), green (xmm1) and blue (xmm0) into
    // A,R,G,B pixels: xmm5 = pixels 0-3, xmm0 = pixels 4-7.
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5,(%[dst_bgra]) \n"
    "movdqa %%xmm0,0x10(%[dst_bgra]) \n"
    "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2382
// Converts one row of I422 (one U and one V sample per pair of pixels) to
// ABGR.
// Each pass of the loop consumes 8 Y, 4 U and 4 V bytes and stores 8 pixels
// (32 bytes), each as the bytes R, G, B, 0xff. The stores use movdqa, so
// dst_abgr must be 16-byte aligned. The loop body runs before width is
// tested; width drops by 8 per pass and the loop repeats while it is still
// positive.
void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_abgr,
                                int width) {
  asm volatile (
    // v_buf becomes the offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (alpha bytes); xmm4 = zero, required by YUVTORGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV422
    YUVTORGB
    // Weave red (xmm2), green (xmm1), blue (xmm0) and alpha (xmm5) into
    // R,G,B,A pixels: xmm2 = pixels 0-3, xmm1 = pixels 4-7.
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,(%[dst_abgr]) \n"
    "movdqa %%xmm1,0x10(%[dst_abgr]) \n"
    "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2418
// Converts one row of I422 (one U and one V sample per pair of pixels) to
// RGBA.
// Each pass of the loop consumes 8 Y, 4 U and 4 V bytes and stores 8 pixels
// (32 bytes), each as the bytes 0xff, B, G, R. The stores use movdqa, so
// dst_rgba must be 16-byte aligned. The loop body runs before width is
// tested; width drops by 8 per pass and the loop repeats while it is still
// positive.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                int width) {
  asm volatile (
    // v_buf becomes the offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm4 = zero, required by YUVTORGB. xmm5 is set again inside the loop.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV422
    YUVTORGB
    // xmm5 is used as a destination below, so reload it with all ones
    // (alpha bytes) on every pass.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    // Weave alpha (xmm5), blue (xmm0), green (xmm1) and red (xmm2) into
    // A,B,G,R pixels: xmm5 = pixels 0-3, xmm0 = pixels 4-7.
    "punpcklbw %%xmm2,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5,(%[dst_rgba]) \n"
    "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
    "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2455
fbarchard@google.come214fe32012-06-04 23:47:11 +00002456void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002457 const uint8* u_buf,
2458 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002459 uint8* dst_bgra,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002460 int width) {
2461 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002462 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002463 "pcmpeqb %%xmm5,%%xmm5 \n"
2464 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002465 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002466 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002467 READYUV422
2468 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002469 "pcmpeqb %%xmm5,%%xmm5 \n"
2470 "punpcklbw %%xmm0,%%xmm1 \n"
2471 "punpcklbw %%xmm2,%%xmm5 \n"
2472 "movdqa %%xmm5,%%xmm0 \n"
2473 "punpcklwd %%xmm1,%%xmm5 \n"
2474 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002475 "movdqu %%xmm5,(%[dst_bgra]) \n"
2476 "movdqu %%xmm0,0x10(%[dst_bgra]) \n"
2477 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002478 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002479 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002480 : [y_buf]"+r"(y_buf), // %[y_buf]
2481 [u_buf]"+r"(u_buf), // %[u_buf]
2482 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002483 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002484 [width]"+rm"(width) // %[width]
2485 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002486 : "memory", "cc"
2487#if defined(__SSE2__)
2488 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2489#endif
2490 );
2491}
2492
fbarchard@google.come214fe32012-06-04 23:47:11 +00002493void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002494 const uint8* u_buf,
2495 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002496 uint8* dst_abgr,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002497 int width) {
2498 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002499 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002500 "pcmpeqb %%xmm5,%%xmm5 \n"
2501 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002502 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002503 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002504 READYUV422
2505 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002506 "punpcklbw %%xmm1,%%xmm2 \n"
2507 "punpcklbw %%xmm5,%%xmm0 \n"
2508 "movdqa %%xmm2,%%xmm1 \n"
2509 "punpcklwd %%xmm0,%%xmm2 \n"
2510 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002511 "movdqu %%xmm2,(%[dst_abgr]) \n"
2512 "movdqu %%xmm1,0x10(%[dst_abgr]) \n"
2513 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002514 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002515 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002516 : [y_buf]"+r"(y_buf), // %[y_buf]
2517 [u_buf]"+r"(u_buf), // %[u_buf]
2518 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002519 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002520 [width]"+rm"(width) // %[width]
2521 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002522 : "memory", "cc"
2523#if defined(__SSE2__)
2524 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2525#endif
2526 );
2527}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002528
2529void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2530 const uint8* u_buf,
2531 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002532 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002533 int width) {
2534 asm volatile (
2535 "sub %[u_buf],%[v_buf] \n"
2536 "pcmpeqb %%xmm5,%%xmm5 \n"
2537 "pxor %%xmm4,%%xmm4 \n"
2538 ".p2align 4 \n"
2539 "1: \n"
2540 READYUV422
2541 YUVTORGB
2542 "pcmpeqb %%xmm5,%%xmm5 \n"
2543 "punpcklbw %%xmm2,%%xmm1 \n"
2544 "punpcklbw %%xmm0,%%xmm5 \n"
2545 "movdqa %%xmm5,%%xmm0 \n"
2546 "punpcklwd %%xmm1,%%xmm5 \n"
2547 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002548 "movdqa %%xmm5,(%[dst_rgba]) \n"
2549 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2550 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002551 "sub $0x8,%[width] \n"
2552 "jg 1b \n"
2553 : [y_buf]"+r"(y_buf), // %[y_buf]
2554 [u_buf]"+r"(u_buf), // %[u_buf]
2555 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002556 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002557 [width]"+rm"(width) // %[width]
2558 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2559 : "memory", "cc"
2560#if defined(__SSE2__)
2561 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2562#endif
2563 );
2564}
2565
fbarchard@google.come214fe32012-06-04 23:47:11 +00002566#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002567
2568#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002569void YToARGBRow_SSE2(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002570 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002571 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002572 asm volatile (
fbarchard@google.com30859f72012-11-02 09:51:29 +00002573 "pxor %%xmm5,%%xmm5 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002574 "pcmpeqb %%xmm4,%%xmm4 \n"
2575 "pslld $0x18,%%xmm4 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002576 "mov $0x00100010,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002577 "movd %%eax,%%xmm3 \n"
2578 "pshufd $0x0,%%xmm3,%%xmm3 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002579 "mov $0x004a004a,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002580 "movd %%eax,%%xmm2 \n"
2581 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002582 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002583 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002584 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002585 "movq (%0),%%xmm0 \n"
2586 "lea 0x8(%0),%0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002587 "punpcklbw %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002588 "psubusw %%xmm3,%%xmm0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002589 "pmullw %%xmm2,%%xmm0 \n"
2590 "psrlw $6, %%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002591 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002592
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002593 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002594 "punpcklbw %%xmm0,%%xmm0 \n"
2595 "movdqa %%xmm0,%%xmm1 \n"
2596 "punpcklwd %%xmm0,%%xmm0 \n"
2597 "punpckhwd %%xmm1,%%xmm1 \n"
2598 "por %%xmm4,%%xmm0 \n"
2599 "por %%xmm4,%%xmm1 \n"
2600 "movdqa %%xmm0,(%1) \n"
2601 "movdqa %%xmm1,16(%1) \n"
2602 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002603
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002604 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002605 "jg 1b \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002606 : "+r"(y_buf), // %0
2607 "+r"(dst_argb), // %1
2608 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002609 :
2610 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002611#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002612 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002613#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002614 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002615}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002616#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002617
fbarchard@google.com42831e02012-01-21 02:54:17 +00002618#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002619// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002620CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002621 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2622};
2623
fbarchard@google.com42831e02012-01-21 02:54:17 +00002624void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002625 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002626 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002627 "movdqa %3,%%xmm5 \n"
2628 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002629 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002630 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002631 "movdqa (%0,%2),%%xmm0 \n"
2632 "pshufb %%xmm5,%%xmm0 \n"
2633 "sub $0x10,%2 \n"
2634 "movdqa %%xmm0,(%1) \n"
2635 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002636 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002637 : "+r"(src), // %0
2638 "+r"(dst), // %1
2639 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002640 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002641 : "memory", "cc"
2642#if defined(__SSE2__)
2643 , "xmm0", "xmm5"
2644#endif
2645 );
2646}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002647#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002648
fbarchard@google.com42831e02012-01-21 02:54:17 +00002649#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002650void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002651 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002652 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002653 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002654 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002655 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002656 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002657 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002658 "psllw $0x8,%%xmm0 \n"
2659 "psrlw $0x8,%%xmm1 \n"
2660 "por %%xmm1,%%xmm0 \n"
2661 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2662 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2663 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2664 "sub $0x10,%2 \n"
2665 "movdqu %%xmm0,(%1) \n"
2666 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002667 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002668 : "+r"(src), // %0
2669 "+r"(dst), // %1
2670 "+r"(temp_width) // %2
2671 :
2672 : "memory", "cc"
2673#if defined(__SSE2__)
2674 , "xmm0", "xmm1"
2675#endif
2676 );
2677}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002678#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002679
fbarchard@google.com16a96642012-03-02 22:38:09 +00002680#ifdef HAS_MIRRORROW_UV_SSSE3
2681// Shuffle table for reversing the bytes of UV channels.
2682CONST uvec8 kShuffleMirrorUV = {
2683 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2684};
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002685void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
fbarchard@google.com16a96642012-03-02 22:38:09 +00002686 int width) {
2687 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002688 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002689 "movdqa %4,%%xmm1 \n"
2690 "lea -16(%0,%3,2),%0 \n"
2691 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002692 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002693 "1: \n"
2694 "movdqa (%0),%%xmm0 \n"
2695 "lea -16(%0),%0 \n"
2696 "pshufb %%xmm1,%%xmm0 \n"
2697 "sub $8,%3 \n"
2698 "movlpd %%xmm0,(%1) \n"
2699 "movhpd %%xmm0,(%1,%2) \n"
2700 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002701 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002702 : "+r"(src), // %0
2703 "+r"(dst_u), // %1
2704 "+r"(dst_v), // %2
2705 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002706 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002707 : "memory", "cc"
2708#if defined(__SSE2__)
2709 , "xmm0", "xmm1"
2710#endif
2711 );
2712}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002713#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002714
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002715#ifdef HAS_ARGBMIRRORROW_SSSE3
2716// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002717CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002718 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2719};
2720
2721void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2722 intptr_t temp_width = static_cast<intptr_t>(width);
2723 asm volatile (
2724 "movdqa %3,%%xmm5 \n"
2725 "lea -0x10(%0),%0 \n"
2726 ".p2align 4 \n"
2727 "1: \n"
2728 "movdqa (%0,%2,4),%%xmm0 \n"
2729 "pshufb %%xmm5,%%xmm0 \n"
2730 "sub $0x4,%2 \n"
2731 "movdqa %%xmm0,(%1) \n"
2732 "lea 0x10(%1),%1 \n"
2733 "jg 1b \n"
2734 : "+r"(src), // %0
2735 "+r"(dst), // %1
2736 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002737 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002738 : "memory", "cc"
2739#if defined(__SSE2__)
2740 , "xmm0", "xmm5"
2741#endif
2742 );
2743}
2744#endif // HAS_ARGBMIRRORROW_SSSE3
2745
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002746#ifdef HAS_SPLITUVROW_SSE2
2747void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002748 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002749 "pcmpeqb %%xmm5,%%xmm5 \n"
2750 "psrlw $0x8,%%xmm5 \n"
2751 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002752 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002753 "1: \n"
2754 "movdqa (%0),%%xmm0 \n"
2755 "movdqa 0x10(%0),%%xmm1 \n"
2756 "lea 0x20(%0),%0 \n"
2757 "movdqa %%xmm0,%%xmm2 \n"
2758 "movdqa %%xmm1,%%xmm3 \n"
2759 "pand %%xmm5,%%xmm0 \n"
2760 "pand %%xmm5,%%xmm1 \n"
2761 "packuswb %%xmm1,%%xmm0 \n"
2762 "psrlw $0x8,%%xmm2 \n"
2763 "psrlw $0x8,%%xmm3 \n"
2764 "packuswb %%xmm3,%%xmm2 \n"
2765 "movdqa %%xmm0,(%1) \n"
2766 "movdqa %%xmm2,(%1,%2) \n"
2767 "lea 0x10(%1),%1 \n"
2768 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002769 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002770 : "+r"(src_uv), // %0
2771 "+r"(dst_u), // %1
2772 "+r"(dst_v), // %2
2773 "+r"(pix) // %3
2774 :
2775 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002776#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002777 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002778#endif
2779 );
2780}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002781
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002782void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2783 int pix) {
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002784 asm volatile (
2785 "pcmpeqb %%xmm5,%%xmm5 \n"
2786 "psrlw $0x8,%%xmm5 \n"
2787 "sub %1,%2 \n"
2788 ".p2align 4 \n"
2789 "1: \n"
2790 "movdqu (%0),%%xmm0 \n"
2791 "movdqu 0x10(%0),%%xmm1 \n"
2792 "lea 0x20(%0),%0 \n"
2793 "movdqa %%xmm0,%%xmm2 \n"
2794 "movdqa %%xmm1,%%xmm3 \n"
2795 "pand %%xmm5,%%xmm0 \n"
2796 "pand %%xmm5,%%xmm1 \n"
2797 "packuswb %%xmm1,%%xmm0 \n"
2798 "psrlw $0x8,%%xmm2 \n"
2799 "psrlw $0x8,%%xmm3 \n"
2800 "packuswb %%xmm3,%%xmm2 \n"
2801 "movdqu %%xmm0,(%1) \n"
2802 "movdqu %%xmm2,(%1,%2) \n"
2803 "lea 0x10(%1),%1 \n"
2804 "sub $0x10,%3 \n"
2805 "jg 1b \n"
2806 : "+r"(src_uv), // %0
2807 "+r"(dst_u), // %1
2808 "+r"(dst_v), // %2
2809 "+r"(pix) // %3
2810 :
2811 : "memory", "cc"
2812#if defined(__SSE2__)
2813 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2814#endif
2815 );
2816}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002817#endif // HAS_SPLITUVROW_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002818
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002819#ifdef HAS_MERGEUVROW_SSE2
2820void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2821 int width) {
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002822 asm volatile (
2823 "sub %0,%1 \n"
2824 ".p2align 4 \n"
2825 "1: \n"
2826 "movdqa (%0),%%xmm0 \n"
2827 "movdqa (%0,%1,1),%%xmm1 \n"
2828 "lea 0x10(%0),%0 \n"
2829 "movdqa %%xmm0,%%xmm2 \n"
2830 "punpcklbw %%xmm1,%%xmm0 \n"
2831 "punpckhbw %%xmm1,%%xmm2 \n"
2832 "movdqa %%xmm0,(%2) \n"
2833 "movdqa %%xmm2,0x10(%2) \n"
2834 "lea 0x20(%2),%2 \n"
2835 "sub $0x10,%3 \n"
2836 "jg 1b \n"
2837 : "+r"(src_u), // %0
2838 "+r"(src_v), // %1
2839 "+r"(dst_uv), // %2
2840 "+r"(width) // %3
2841 :
2842 : "memory", "cc"
2843#if defined(__SSE2__)
2844 , "xmm0", "xmm1", "xmm2"
2845#endif
2846 );
2847}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002848
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002849void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2850 uint8* dst_uv, int width) {
fbarchard@google.come0d86482012-10-27 19:07:55 +00002851 asm volatile (
2852 "sub %0,%1 \n"
2853 ".p2align 4 \n"
2854 "1: \n"
2855 "movdqu (%0),%%xmm0 \n"
2856 "movdqu (%0,%1,1),%%xmm1 \n"
2857 "lea 0x10(%0),%0 \n"
2858 "movdqa %%xmm0,%%xmm2 \n"
2859 "punpcklbw %%xmm1,%%xmm0 \n"
2860 "punpckhbw %%xmm1,%%xmm2 \n"
2861 "movdqu %%xmm0,(%2) \n"
2862 "movdqu %%xmm2,0x10(%2) \n"
2863 "lea 0x20(%2),%2 \n"
2864 "sub $0x10,%3 \n"
2865 "jg 1b \n"
2866 : "+r"(src_u), // %0
2867 "+r"(src_v), // %1
2868 "+r"(dst_uv), // %2
2869 "+r"(width) // %3
2870 :
2871 : "memory", "cc"
2872#if defined(__SSE2__)
2873 , "xmm0", "xmm1", "xmm2"
2874#endif
2875 );
2876}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002877#endif // HAS_MERGEUVROW_SSE2
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002878
fbarchard@google.com19932f82012-02-16 22:19:14 +00002879#ifdef HAS_COPYROW_SSE2
2880void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002881 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002882 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002883 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002884 "1: \n"
2885 "movdqa (%0),%%xmm0 \n"
2886 "movdqa 0x10(%0),%%xmm1 \n"
2887 "movdqa %%xmm0,(%0,%1) \n"
2888 "movdqa %%xmm1,0x10(%0,%1) \n"
2889 "lea 0x20(%0),%0 \n"
2890 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002891 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002892 : "+r"(src), // %0
2893 "+r"(dst), // %1
2894 "+r"(count) // %2
2895 :
2896 : "memory", "cc"
2897#if defined(__SSE2__)
2898 , "xmm0", "xmm1"
2899#endif
2900 );
2901}
2902#endif // HAS_COPYROW_SSE2
2903
2904#ifdef HAS_COPYROW_X86
2905void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2906 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002907 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002908 "shr $0x2,%2 \n"
2909 "rep movsl \n"
2910 : "+S"(src), // %0
2911 "+D"(dst), // %1
2912 "+c"(width_tmp) // %2
2913 :
2914 : "memory", "cc"
2915 );
2916}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002917#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002918
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002919#ifdef HAS_SETROW_X86
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002920void SetRow_X86(uint8* dst, uint32 v32, int width) {
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002921 size_t width_tmp = static_cast<size_t>(width);
2922 asm volatile (
2923 "shr $0x2,%1 \n"
2924 "rep stosl \n"
2925 : "+D"(dst), // %0
2926 "+c"(width_tmp) // %1
2927 : "a"(v32) // %2
2928 : "memory", "cc");
2929}
2930
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002931void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002932 int dst_stride, int height) {
2933 for (int y = 0; y < height; ++y) {
2934 size_t width_tmp = static_cast<size_t>(width);
2935 uint32* d = reinterpret_cast<uint32*>(dst);
2936 asm volatile (
2937 "rep stosl \n"
2938 : "+D"(d), // %0
2939 "+c"(width_tmp) // %1
2940 : "a"(v32) // %2
2941 : "memory", "cc");
2942 dst += dst_stride;
2943 }
2944}
2945#endif // HAS_SETROW_X86
2946
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002947#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002948void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002949 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002950 "pcmpeqb %%xmm5,%%xmm5 \n"
2951 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002952 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002953 "1: \n"
2954 "movdqa (%0),%%xmm0 \n"
2955 "movdqa 0x10(%0),%%xmm1 \n"
2956 "lea 0x20(%0),%0 \n"
2957 "pand %%xmm5,%%xmm0 \n"
2958 "pand %%xmm5,%%xmm1 \n"
2959 "packuswb %%xmm1,%%xmm0 \n"
2960 "movdqa %%xmm0,(%1) \n"
2961 "lea 0x10(%1),%1 \n"
2962 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002963 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002964 : "+r"(src_yuy2), // %0
2965 "+r"(dst_y), // %1
2966 "+r"(pix) // %2
2967 :
2968 : "memory", "cc"
2969#if defined(__SSE2__)
2970 , "xmm0", "xmm1", "xmm5"
2971#endif
2972 );
2973}
2974
2975void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002976 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002977 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002978 "pcmpeqb %%xmm5,%%xmm5 \n"
2979 "psrlw $0x8,%%xmm5 \n"
2980 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002981 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002982 "1: \n"
2983 "movdqa (%0),%%xmm0 \n"
2984 "movdqa 0x10(%0),%%xmm1 \n"
2985 "movdqa (%0,%4,1),%%xmm2 \n"
2986 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2987 "lea 0x20(%0),%0 \n"
2988 "pavgb %%xmm2,%%xmm0 \n"
2989 "pavgb %%xmm3,%%xmm1 \n"
2990 "psrlw $0x8,%%xmm0 \n"
2991 "psrlw $0x8,%%xmm1 \n"
2992 "packuswb %%xmm1,%%xmm0 \n"
2993 "movdqa %%xmm0,%%xmm1 \n"
2994 "pand %%xmm5,%%xmm0 \n"
2995 "packuswb %%xmm0,%%xmm0 \n"
2996 "psrlw $0x8,%%xmm1 \n"
2997 "packuswb %%xmm1,%%xmm1 \n"
2998 "movq %%xmm0,(%1) \n"
2999 "movq %%xmm1,(%1,%2) \n"
3000 "lea 0x8(%1),%1 \n"
3001 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003002 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003003 : "+r"(src_yuy2), // %0
3004 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003005 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003006 "+r"(pix) // %3
3007 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
3008 : "memory", "cc"
3009#if defined(__SSE2__)
3010 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3011#endif
3012 );
3013}
3014
fbarchard@google.comc704f782012-08-30 19:53:48 +00003015void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3016 uint8* dst_u, uint8* dst_v, int pix) {
3017 asm volatile (
3018 "pcmpeqb %%xmm5,%%xmm5 \n"
3019 "psrlw $0x8,%%xmm5 \n"
3020 "sub %1,%2 \n"
3021 ".p2align 4 \n"
3022 "1: \n"
3023 "movdqa (%0),%%xmm0 \n"
3024 "movdqa 0x10(%0),%%xmm1 \n"
3025 "lea 0x20(%0),%0 \n"
3026 "psrlw $0x8,%%xmm0 \n"
3027 "psrlw $0x8,%%xmm1 \n"
3028 "packuswb %%xmm1,%%xmm0 \n"
3029 "movdqa %%xmm0,%%xmm1 \n"
3030 "pand %%xmm5,%%xmm0 \n"
3031 "packuswb %%xmm0,%%xmm0 \n"
3032 "psrlw $0x8,%%xmm1 \n"
3033 "packuswb %%xmm1,%%xmm1 \n"
3034 "movq %%xmm0,(%1) \n"
3035 "movq %%xmm1,(%1,%2) \n"
3036 "lea 0x8(%1),%1 \n"
3037 "sub $0x10,%3 \n"
3038 "jg 1b \n"
3039 : "+r"(src_yuy2), // %0
3040 "+r"(dst_u), // %1
3041 "+r"(dst_v), // %2
3042 "+r"(pix) // %3
3043 :
3044 : "memory", "cc"
3045#if defined(__SSE2__)
3046 , "xmm0", "xmm1", "xmm5"
3047#endif
3048 );
3049}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00003050
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003051void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
3052 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003053 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003054 "pcmpeqb %%xmm5,%%xmm5 \n"
3055 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003056 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003057 "1: \n"
3058 "movdqu (%0),%%xmm0 \n"
3059 "movdqu 0x10(%0),%%xmm1 \n"
3060 "lea 0x20(%0),%0 \n"
3061 "pand %%xmm5,%%xmm0 \n"
3062 "pand %%xmm5,%%xmm1 \n"
3063 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003064 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003065 "movdqu %%xmm0,(%1) \n"
3066 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003067 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003068 : "+r"(src_yuy2), // %0
3069 "+r"(dst_y), // %1
3070 "+r"(pix) // %2
3071 :
3072 : "memory", "cc"
3073#if defined(__SSE2__)
3074 , "xmm0", "xmm1", "xmm5"
3075#endif
3076 );
3077}
3078
3079void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
3080 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00003081 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003082 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003083 "pcmpeqb %%xmm5,%%xmm5 \n"
3084 "psrlw $0x8,%%xmm5 \n"
3085 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003086 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003087 "1: \n"
3088 "movdqu (%0),%%xmm0 \n"
3089 "movdqu 0x10(%0),%%xmm1 \n"
3090 "movdqu (%0,%4,1),%%xmm2 \n"
3091 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3092 "lea 0x20(%0),%0 \n"
3093 "pavgb %%xmm2,%%xmm0 \n"
3094 "pavgb %%xmm3,%%xmm1 \n"
3095 "psrlw $0x8,%%xmm0 \n"
3096 "psrlw $0x8,%%xmm1 \n"
3097 "packuswb %%xmm1,%%xmm0 \n"
3098 "movdqa %%xmm0,%%xmm1 \n"
3099 "pand %%xmm5,%%xmm0 \n"
3100 "packuswb %%xmm0,%%xmm0 \n"
3101 "psrlw $0x8,%%xmm1 \n"
3102 "packuswb %%xmm1,%%xmm1 \n"
3103 "movq %%xmm0,(%1) \n"
3104 "movq %%xmm1,(%1,%2) \n"
3105 "lea 0x8(%1),%1 \n"
3106 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003107 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003108 : "+r"(src_yuy2), // %0
3109 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003110 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003111 "+r"(pix) // %3
3112 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
3113 : "memory", "cc"
3114#if defined(__SSE2__)
3115 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3116#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00003117 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003118}
3119
fbarchard@google.comc704f782012-08-30 19:53:48 +00003120void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
3121 uint8* dst_u, uint8* dst_v, int pix) {
3122 asm volatile (
3123 "pcmpeqb %%xmm5,%%xmm5 \n"
3124 "psrlw $0x8,%%xmm5 \n"
3125 "sub %1,%2 \n"
3126 ".p2align 4 \n"
3127 "1: \n"
3128 "movdqu (%0),%%xmm0 \n"
3129 "movdqu 0x10(%0),%%xmm1 \n"
3130 "lea 0x20(%0),%0 \n"
3131 "psrlw $0x8,%%xmm0 \n"
3132 "psrlw $0x8,%%xmm1 \n"
3133 "packuswb %%xmm1,%%xmm0 \n"
3134 "movdqa %%xmm0,%%xmm1 \n"
3135 "pand %%xmm5,%%xmm0 \n"
3136 "packuswb %%xmm0,%%xmm0 \n"
3137 "psrlw $0x8,%%xmm1 \n"
3138 "packuswb %%xmm1,%%xmm1 \n"
3139 "movq %%xmm0,(%1) \n"
3140 "movq %%xmm1,(%1,%2) \n"
3141 "lea 0x8(%1),%1 \n"
3142 "sub $0x10,%3 \n"
3143 "jg 1b \n"
3144 : "+r"(src_yuy2), // %0
3145 "+r"(dst_u), // %1
3146 "+r"(dst_v), // %2
3147 "+r"(pix) // %3
3148 :
3149 : "memory", "cc"
3150#if defined(__SSE2__)
3151 , "xmm0", "xmm1", "xmm5"
3152#endif
3153 );
3154}
3155
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003156void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003157 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003158 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003159 "1: \n"
3160 "movdqa (%0),%%xmm0 \n"
3161 "movdqa 0x10(%0),%%xmm1 \n"
3162 "lea 0x20(%0),%0 \n"
3163 "psrlw $0x8,%%xmm0 \n"
3164 "psrlw $0x8,%%xmm1 \n"
3165 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003166 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003167 "movdqa %%xmm0,(%1) \n"
3168 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003169 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003170 : "+r"(src_uyvy), // %0
3171 "+r"(dst_y), // %1
3172 "+r"(pix) // %2
3173 :
3174 : "memory", "cc"
3175#if defined(__SSE2__)
3176 , "xmm0", "xmm1"
3177#endif
3178 );
3179}
3180
// Extracts the chroma bytes from two source rows and writes them to separate
// dst_u / dst_v rows, 16 pixels (32 source bytes per row) per iteration.
//   src_uyvy    - first source row. The second row is read stride_uyvy bytes
//                 after it. Both are loaded with movdqa, so both addresses
//                 must be 16-byte aligned.
//   stride_uyvy - byte offset from the first row to the second row.
//   dst_u/dst_v - each receives 8 bytes per iteration (movq stores, which have
//                 no alignment requirement).
//   pix         - pixel count, decremented by 16 per iteration. The loop body
//                 runs before the count is tested, so one full iteration is
//                 always executed; presumably callers pass a positive multiple
//                 of 16 - confirm.
// The two rows are combined with pavgb (rounded byte average). The low byte of
// every 16-bit word is kept as chroma; of those, even bytes go to dst_u and odd
// bytes go to dst_v.
3181void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003182 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003183 asm volatile (
    // xmm5 = 0x00ff in every word: mask selecting the low byte of each word.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003184 "pcmpeqb %%xmm5,%%xmm5 \n"
3185 "psrlw $0x8,%%xmm5 \n"
    // %2 becomes dst_v - dst_u so that (%1,%2) addresses dst_v.
3186 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003187 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003188 "1: \n"
3189 "movdqa (%0),%%xmm0 \n"
3190 "movdqa 0x10(%0),%%xmm1 \n"
3191 "movdqa (%0,%4,1),%%xmm2 \n"
3192 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
3193 "lea 0x20(%0),%0 \n"
    // Average the two rows byte by byte.
3194 "pavgb %%xmm2,%%xmm0 \n"
3195 "pavgb %%xmm3,%%xmm1 \n"
    // Keep the low byte of each word and pack to 16 interleaved chroma bytes.
3196 "pand %%xmm5,%%xmm0 \n"
3197 "pand %%xmm5,%%xmm1 \n"
3198 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even bytes -> xmm0, odd bytes -> xmm1.
3199 "movdqa %%xmm0,%%xmm1 \n"
3200 "pand %%xmm5,%%xmm0 \n"
3201 "packuswb %%xmm0,%%xmm0 \n"
3202 "psrlw $0x8,%%xmm1 \n"
3203 "packuswb %%xmm1,%%xmm1 \n"
3204 "movq %%xmm0,(%1) \n"
3205 "movq %%xmm1,(%1,%2) \n"
3206 "lea 0x8(%1),%1 \n"
3207 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003208 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003209 : "+r"(src_uyvy), // %0
3210 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003211 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003212 "+r"(pix) // %3
3213 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3214 : "memory", "cc"
3215#if defined(__SSE2__)
3216 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3217#endif
3218 );
3219}
3220
// Extracts the chroma bytes from a single source row and writes them to
// separate dst_u / dst_v rows, 16 pixels (32 source bytes) per iteration.
// Unlike UYVYToUVRow_SSE2 there is no second row and no averaging.
//   src_uyvy    - source row; loaded with movdqa, so it must be 16-byte aligned.
//   dst_u/dst_v - each receives 8 bytes per iteration (movq stores).
//   pix         - pixel count, decremented by 16 per iteration. The loop body
//                 runs before the count is tested; presumably callers pass a
//                 positive multiple of 16 - confirm.
// The low byte of every 16-bit word is kept as chroma; of those, even bytes go
// to dst_u and odd bytes go to dst_v.
fbarchard@google.comc704f782012-08-30 19:53:48 +00003221void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3222 uint8* dst_u, uint8* dst_v, int pix) {
3223 asm volatile (
    // xmm5 = 0x00ff in every word: mask selecting the low byte of each word.
3224 "pcmpeqb %%xmm5,%%xmm5 \n"
3225 "psrlw $0x8,%%xmm5 \n"
    // %2 becomes dst_v - dst_u so that (%1,%2) addresses dst_v.
3226 "sub %1,%2 \n"
3227 ".p2align 4 \n"
3228 "1: \n"
3229 "movdqa (%0),%%xmm0 \n"
3230 "movdqa 0x10(%0),%%xmm1 \n"
3231 "lea 0x20(%0),%0 \n"
    // Keep the low byte of each word and pack to 16 interleaved chroma bytes.
3232 "pand %%xmm5,%%xmm0 \n"
3233 "pand %%xmm5,%%xmm1 \n"
3234 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even bytes -> xmm0, odd bytes -> xmm1.
3235 "movdqa %%xmm0,%%xmm1 \n"
3236 "pand %%xmm5,%%xmm0 \n"
3237 "packuswb %%xmm0,%%xmm0 \n"
3238 "psrlw $0x8,%%xmm1 \n"
3239 "packuswb %%xmm1,%%xmm1 \n"
3240 "movq %%xmm0,(%1) \n"
3241 "movq %%xmm1,(%1,%2) \n"
3242 "lea 0x8(%1),%1 \n"
3243 "sub $0x10,%3 \n"
3244 "jg 1b \n"
3245 : "+r"(src_uyvy), // %0
3246 "+r"(dst_u), // %1
3247 "+r"(dst_v), // %2
3248 "+r"(pix) // %3
3249 :
3250 : "memory", "cc"
3251#if defined(__SSE2__)
3252 , "xmm0", "xmm1", "xmm5"
3253#endif
3254 );
3255}
3256
// Extracts the high byte of every 16-bit word of the source row and writes the
// bytes contiguously to dst_y, 16 pixels (32 source bytes) per iteration.
// All loads and the store use movdqu, so neither pointer needs to be aligned.
//   pix - pixel count, decremented by 16 per iteration. The loop body runs
//         before the count is tested; presumably callers pass a positive
//         multiple of 16 - confirm.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003257void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3258 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003259 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003260 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003261 "1: \n"
3262 "movdqu (%0),%%xmm0 \n"
3263 "movdqu 0x10(%0),%%xmm1 \n"
3264 "lea 0x20(%0),%0 \n"
    // Shift the high byte of each word down, then pack 16 words to 16 bytes.
3265 "psrlw $0x8,%%xmm0 \n"
3266 "psrlw $0x8,%%xmm1 \n"
3267 "packuswb %%xmm1,%%xmm0 \n"
    // The flags set by this sub are consumed by the jg below; the movdqu and
    // lea in between do not modify flags.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003268 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003269 "movdqu %%xmm0,(%1) \n"
3270 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003271 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003272 : "+r"(src_uyvy), // %0
3273 "+r"(dst_y), // %1
3274 "+r"(pix) // %2
3275 :
3276 : "memory", "cc"
3277#if defined(__SSE2__)
3278 , "xmm0", "xmm1"
3279#endif
3280 );
3281}
3282
// Same operation as UYVYToUVRow_SSE2, but the source rows are loaded with
// movdqu so src_uyvy and src_uyvy + stride_uyvy need not be 16-byte aligned.
//   src_uyvy    - first source row; the second row is stride_uyvy bytes later.
//   dst_u/dst_v - each receives 8 bytes per iteration (movq stores).
//   pix         - pixel count, decremented by 16 per iteration. The loop body
//                 runs before the count is tested; presumably callers pass a
//                 positive multiple of 16 - confirm.
// The two rows are combined with pavgb (rounded byte average). The low byte of
// every 16-bit word is kept as chroma; of those, even bytes go to dst_u and odd
// bytes go to dst_v.
3283void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003284 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003285 asm volatile (
    // xmm5 = 0x00ff in every word: mask selecting the low byte of each word.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003286 "pcmpeqb %%xmm5,%%xmm5 \n"
3287 "psrlw $0x8,%%xmm5 \n"
    // %2 becomes dst_v - dst_u so that (%1,%2) addresses dst_v.
3288 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003289 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003290 "1: \n"
3291 "movdqu (%0),%%xmm0 \n"
3292 "movdqu 0x10(%0),%%xmm1 \n"
3293 "movdqu (%0,%4,1),%%xmm2 \n"
3294 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3295 "lea 0x20(%0),%0 \n"
    // Average the two rows byte by byte.
3296 "pavgb %%xmm2,%%xmm0 \n"
3297 "pavgb %%xmm3,%%xmm1 \n"
    // Keep the low byte of each word and pack to 16 interleaved chroma bytes.
3298 "pand %%xmm5,%%xmm0 \n"
3299 "pand %%xmm5,%%xmm1 \n"
3300 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even bytes -> xmm0, odd bytes -> xmm1.
3301 "movdqa %%xmm0,%%xmm1 \n"
3302 "pand %%xmm5,%%xmm0 \n"
3303 "packuswb %%xmm0,%%xmm0 \n"
3304 "psrlw $0x8,%%xmm1 \n"
3305 "packuswb %%xmm1,%%xmm1 \n"
3306 "movq %%xmm0,(%1) \n"
3307 "movq %%xmm1,(%1,%2) \n"
3308 "lea 0x8(%1),%1 \n"
3309 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003310 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003311 : "+r"(src_uyvy), // %0
3312 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003313 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003314 "+r"(pix) // %3
3315 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3316 : "memory", "cc"
3317#if defined(__SSE2__)
3318 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3319#endif
3320 );
3321}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003322
// Same operation as UYVYToUV422Row_SSE2, but the source row is loaded with
// movdqu so src_uyvy need not be 16-byte aligned.
//   dst_u/dst_v - each receives 8 bytes per iteration (movq stores).
//   pix         - pixel count, decremented by 16 per iteration. The loop body
//                 runs before the count is tested; presumably callers pass a
//                 positive multiple of 16 - confirm.
// The low byte of every 16-bit word is kept as chroma; of those, even bytes go
// to dst_u and odd bytes go to dst_v.
3323void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3324 uint8* dst_u, uint8* dst_v, int pix) {
3325 asm volatile (
    // xmm5 = 0x00ff in every word: mask selecting the low byte of each word.
3326 "pcmpeqb %%xmm5,%%xmm5 \n"
3327 "psrlw $0x8,%%xmm5 \n"
    // %2 becomes dst_v - dst_u so that (%1,%2) addresses dst_v.
3328 "sub %1,%2 \n"
3329 ".p2align 4 \n"
3330 "1: \n"
3331 "movdqu (%0),%%xmm0 \n"
3332 "movdqu 0x10(%0),%%xmm1 \n"
3333 "lea 0x20(%0),%0 \n"
    // Keep the low byte of each word and pack to 16 interleaved chroma bytes.
3334 "pand %%xmm5,%%xmm0 \n"
3335 "pand %%xmm5,%%xmm1 \n"
3336 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even bytes -> xmm0, odd bytes -> xmm1.
3337 "movdqa %%xmm0,%%xmm1 \n"
3338 "pand %%xmm5,%%xmm0 \n"
3339 "packuswb %%xmm0,%%xmm0 \n"
3340 "psrlw $0x8,%%xmm1 \n"
3341 "packuswb %%xmm1,%%xmm1 \n"
3342 "movq %%xmm0,(%1) \n"
3343 "movq %%xmm1,(%1,%2) \n"
3344 "lea 0x8(%1),%1 \n"
3345 "sub $0x10,%3 \n"
3346 "jg 1b \n"
3347 : "+r"(src_uyvy), // %0
3348 "+r"(dst_u), // %1
3349 "+r"(dst_v), // %2
3350 "+r"(pix) // %3
3351 :
3352 : "memory", "cc"
3353#if defined(__SSE2__)
3354 , "xmm0", "xmm1", "xmm5"
3355#endif
3356 );
3357}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003358#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003359
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003360#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003361// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
// Blends a row of src_argb0 over src_argb1 and writes the result to dst_argb.
// For each of the four bytes of a pixel the result is
//   dst = saturate(src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8))
// where alpha0 is the alpha byte (byte 3) of the src_argb0 pixel. The alpha
// byte of src_argb0 is forced to 255 before the saturating add, so the
// destination alpha is always 255. src_argb0 is added unscaled - presumably it
// is expected to be premultiplied by its alpha; confirm with callers.
//   width - number of pixels; 0 (or negative) writes nothing.
// Pixels are processed one at a time until dst_argb is 16-byte aligned, then
// four at a time (unaligned loads, aligned store), then one at a time for the
// remainder. The source pointers need no particular alignment.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003362void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3363 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003364 asm volatile (
    // Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
    // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha mask).
3365 "pcmpeqb %%xmm7,%%xmm7 \n"
3366 "psrlw $0xf,%%xmm7 \n"
3367 "pcmpeqb %%xmm6,%%xmm6 \n"
3368 "psrlw $0x8,%%xmm6 \n"
3369 "pcmpeqb %%xmm5,%%xmm5 \n"
3370 "psllw $0x8,%%xmm5 \n"
3371 "pcmpeqb %%xmm4,%%xmm4 \n"
3372 "pslld $0x18,%%xmm4 \n"
    // From here on %3 holds (remaining pixels - 1). Exactly one pixel goes
    // straight to the final 1 pixel loop; none skips everything.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003373 "sub $0x1,%3 \n"
3374 "je 91f \n"
3375 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003376
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003377 // 1 pixel loop until destination pointer is aligned.
3378 "10: \n"
3379 "test $0xf,%2 \n"
3380 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003381 "movd (%0),%%xmm3 \n"
3382 "lea 0x4(%0),%0 \n"
3383 "movdqa %%xmm3,%%xmm0 \n"
    // Invert the alpha byte, then broadcast it to both words of the pixel and
    // add 1, giving (256 - alpha0) in every word.
3384 "pxor %%xmm4,%%xmm3 \n"
3385 "movd (%1),%%xmm2 \n"
3386 "psrlw $0x8,%%xmm3 \n"
3387 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3388 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    // xmm2 = low bytes of src_argb1 words (bytes 0 and 2) times (256 - alpha0).
3389 "pand %%xmm6,%%xmm2 \n"
3390 "paddw %%xmm7,%%xmm3 \n"
3391 "pmullw %%xmm3,%%xmm2 \n"
    // xmm1 = high bytes of src_argb1 words (bytes 1 and 3) times (256 - alpha0).
3392 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003393 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003394 "psrlw $0x8,%%xmm1 \n"
    // Force the src_argb0 alpha byte to 255.
3395 "por %%xmm4,%%xmm0 \n"
3396 "pmullw %%xmm3,%%xmm1 \n"
    // Take the high byte of each product (>> 8) back into its byte position
    // and add with unsigned saturation.
3397 "psrlw $0x8,%%xmm2 \n"
3398 "paddusb %%xmm2,%%xmm0 \n"
3399 "pand %%xmm5,%%xmm1 \n"
3400 "paddusb %%xmm1,%%xmm0 \n"
3401 "sub $0x1,%3 \n"
3402 "movd %%xmm0,(%2) \n"
3403 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003404 "jge 10b \n"
3405
3406 "19: \n"
    // $1-4 is -3: %3 becomes (remaining pixels - 4); negative means fewer than
    // four pixels are left.
fbarchard@google.comee220882012-06-14 00:07:56 +00003407 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003408 "jl 49f \n"
3409
fbarchard@google.com794fe122012-06-15 01:05:01 +00003410 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003411 ".p2align 2 \n"
3412 "41: \n"
    // Same arithmetic as the 1 pixel loop, on 16 bytes. Loads are unaligned;
    // the store is aligned because the loop above aligned dst_argb.
3413 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003414 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003415 "movdqa %%xmm3,%%xmm0 \n"
3416 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003417 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003418 "psrlw $0x8,%%xmm3 \n"
3419 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3420 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003421 "pand %%xmm6,%%xmm2 \n"
3422 "paddw %%xmm7,%%xmm3 \n"
3423 "pmullw %%xmm3,%%xmm2 \n"
3424 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003425 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003426 "psrlw $0x8,%%xmm1 \n"
3427 "por %%xmm4,%%xmm0 \n"
3428 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003429 "psrlw $0x8,%%xmm2 \n"
3430 "paddusb %%xmm2,%%xmm0 \n"
3431 "pand %%xmm5,%%xmm1 \n"
3432 "paddusb %%xmm1,%%xmm0 \n"
3433 "sub $0x4,%3 \n"
3434 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003435 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003436 "jge 41b \n"
3437
3438 "49: \n"
    // Back to (remaining pixels - 1); negative means nothing is left.
3439 "add $0x3,%3 \n"
3440 "jl 99f \n"
3441
fbarchard@google.com794fe122012-06-15 01:05:01 +00003442 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003443 "91: \n"
3444 "movd (%0),%%xmm3 \n"
3445 "lea 0x4(%0),%0 \n"
3446 "movdqa %%xmm3,%%xmm0 \n"
3447 "pxor %%xmm4,%%xmm3 \n"
3448 "movd (%1),%%xmm2 \n"
3449 "psrlw $0x8,%%xmm3 \n"
3450 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3451 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3452 "pand %%xmm6,%%xmm2 \n"
3453 "paddw %%xmm7,%%xmm3 \n"
3454 "pmullw %%xmm3,%%xmm2 \n"
3455 "movd (%1),%%xmm1 \n"
3456 "lea 0x4(%1),%1 \n"
3457 "psrlw $0x8,%%xmm1 \n"
3458 "por %%xmm4,%%xmm0 \n"
3459 "pmullw %%xmm3,%%xmm1 \n"
3460 "psrlw $0x8,%%xmm2 \n"
3461 "paddusb %%xmm2,%%xmm0 \n"
3462 "pand %%xmm5,%%xmm1 \n"
3463 "paddusb %%xmm1,%%xmm0 \n"
3464 "sub $0x1,%3 \n"
3465 "movd %%xmm0,(%2) \n"
3466 "lea 0x4(%2),%2 \n"
3467 "jge 91b \n"
3468 "99: \n"
3469 : "+r"(src_argb0), // %0
3470 "+r"(src_argb1), // %1
3471 "+r"(dst_argb), // %2
3472 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003473 :
3474 : "memory", "cc"
3475#if defined(__SSE2__)
3476 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3477#endif
3478 );
3479}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003480#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003481
fbarchard@google.com96af8702012-04-06 18:22:27 +00003482#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003483// Shuffle table for isolating alpha.
// pshufb control used by ARGBBlendRow_SSSE3. For each 4-byte pixel the alpha
// byte (offsets 3, 7, 11, 15) is copied into the low byte of two consecutive
// 16-bit words; the 0x80 entries zero the high byte of each word.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003484CONST uvec8 kShuffleAlpha = {
3485 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3486 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3487};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003488
3489// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
3490// Uses the kShuffleAlpha table to isolate alpha.
3491
3492// Same as SSE2, but replaces
3493// psrlw xmm3, 8 // alpha
3494// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3495// pshuflw xmm3, xmm3,0F5h
3496// with..
3497// pshufb xmm3, kShuffleAlpha // alpha
3498
// Blends a row of src_argb0 over src_argb1 and writes the result to dst_argb.
// For each of the four bytes of a pixel the result is
//   dst = saturate(src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8))
// where alpha0 is the alpha byte (byte 3) of the src_argb0 pixel. The alpha
// byte of src_argb0 is forced to 255 before the saturating add, so the
// destination alpha is always 255. src_argb0 is added unscaled - presumably it
// is expected to be premultiplied by its alpha; confirm with callers.
//   width - number of pixels; 0 (or negative) writes nothing.
// Pixels are processed one at a time until dst_argb is 16-byte aligned, then
// four at a time, then one at a time for the remainder. The 4 pixel stage uses
// aligned loads when both source pointers are 16-byte aligned at that point and
// unaligned loads otherwise. pshufb with kShuffleAlpha (operand %4) takes the
// place of a word shift plus two word shuffles for broadcasting alpha.
3499void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3500 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003501 asm volatile (
    // Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
    // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha mask).
3502 "pcmpeqb %%xmm7,%%xmm7 \n"
3503 "psrlw $0xf,%%xmm7 \n"
3504 "pcmpeqb %%xmm6,%%xmm6 \n"
3505 "psrlw $0x8,%%xmm6 \n"
3506 "pcmpeqb %%xmm5,%%xmm5 \n"
3507 "psllw $0x8,%%xmm5 \n"
3508 "pcmpeqb %%xmm4,%%xmm4 \n"
3509 "pslld $0x18,%%xmm4 \n"
    // From here on %3 holds (remaining pixels - 1). Exactly one pixel goes
    // straight to the final 1 pixel loop; none skips everything.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003510 "sub $0x1,%3 \n"
3511 "je 91f \n"
3512 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003513
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003514 // 1 pixel loop until destination pointer is aligned.
3515 "10: \n"
3516 "test $0xf,%2 \n"
3517 "je 19f \n"
3518 "movd (%0),%%xmm3 \n"
3519 "lea 0x4(%0),%0 \n"
3520 "movdqa %%xmm3,%%xmm0 \n"
    // Invert the alpha byte, broadcast it to both words of the pixel with
    // pshufb, and (below) add 1 to get (256 - alpha0) in every word.
3521 "pxor %%xmm4,%%xmm3 \n"
3522 "movd (%1),%%xmm2 \n"
3523 "pshufb %4,%%xmm3 \n"
    // xmm2 = low bytes of src_argb1 words (bytes 0 and 2) times (256 - alpha0).
3524 "pand %%xmm6,%%xmm2 \n"
3525 "paddw %%xmm7,%%xmm3 \n"
3526 "pmullw %%xmm3,%%xmm2 \n"
    // xmm1 = high bytes of src_argb1 words (bytes 1 and 3) times (256 - alpha0).
3527 "movd (%1),%%xmm1 \n"
3528 "lea 0x4(%1),%1 \n"
3529 "psrlw $0x8,%%xmm1 \n"
    // Force the src_argb0 alpha byte to 255.
3530 "por %%xmm4,%%xmm0 \n"
3531 "pmullw %%xmm3,%%xmm1 \n"
    // Take the high byte of each product (>> 8) back into its byte position
    // and add with unsigned saturation.
3532 "psrlw $0x8,%%xmm2 \n"
3533 "paddusb %%xmm2,%%xmm0 \n"
3534 "pand %%xmm5,%%xmm1 \n"
3535 "paddusb %%xmm1,%%xmm0 \n"
3536 "sub $0x1,%3 \n"
3537 "movd %%xmm0,(%2) \n"
3538 "lea 0x4(%2),%2 \n"
3539 "jge 10b \n"
3540
3541 "19: \n"
    // $1-4 is -3: %3 becomes (remaining pixels - 4); negative means fewer than
    // four pixels are left.
fbarchard@google.comee220882012-06-14 00:07:56 +00003542 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003543 "jl 49f \n"
    // Use the unaligned-load loop if either source pointer is not 16-byte
    // aligned.
fbarchard@google.comf877e712012-08-15 00:51:24 +00003544 "test $0xf,%0 \n"
3545 "jne 41f \n"
3546 "test $0xf,%1 \n"
3547 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003548
fbarchard@google.com794fe122012-06-15 01:05:01 +00003549 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003550 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003551 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003552 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003553 "lea 0x10(%0),%0 \n"
3554 "movdqa %%xmm3,%%xmm0 \n"
3555 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003556 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003557 "pshufb %4,%%xmm3 \n"
3558 "pand %%xmm6,%%xmm2 \n"
3559 "paddw %%xmm7,%%xmm3 \n"
3560 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003561 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003562 "lea 0x10(%1),%1 \n"
3563 "psrlw $0x8,%%xmm1 \n"
3564 "por %%xmm4,%%xmm0 \n"
3565 "pmullw %%xmm3,%%xmm1 \n"
3566 "psrlw $0x8,%%xmm2 \n"
3567 "paddusb %%xmm2,%%xmm0 \n"
3568 "pand %%xmm5,%%xmm1 \n"
3569 "paddusb %%xmm1,%%xmm0 \n"
3570 "sub $0x4,%3 \n"
3571 "movdqa %%xmm0,(%2) \n"
3572 "lea 0x10(%2),%2 \n"
3573 "jge 40b \n"
3574 "jmp 49f \n"
3575
3576 // 4 pixel unaligned loop.
3577 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003578 "41: \n"
    // Only the loads are unaligned; the store is aligned because the first
    // loop aligned dst_argb.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003579 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003580 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003581 "movdqa %%xmm3,%%xmm0 \n"
3582 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003583 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003584 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003585 "pand %%xmm6,%%xmm2 \n"
3586 "paddw %%xmm7,%%xmm3 \n"
3587 "pmullw %%xmm3,%%xmm2 \n"
3588 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003589 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003590 "psrlw $0x8,%%xmm1 \n"
3591 "por %%xmm4,%%xmm0 \n"
3592 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003593 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003594 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003595 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003596 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003597 "sub $0x4,%3 \n"
3598 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003599 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003600 "jge 41b \n"
3601
3602 "49: \n"
    // Back to (remaining pixels - 1); negative means nothing is left.
3603 "add $0x3,%3 \n"
3604 "jl 99f \n"
3605
fbarchard@google.com794fe122012-06-15 01:05:01 +00003606 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003607 "91: \n"
3608 "movd (%0),%%xmm3 \n"
3609 "lea 0x4(%0),%0 \n"
3610 "movdqa %%xmm3,%%xmm0 \n"
3611 "pxor %%xmm4,%%xmm3 \n"
3612 "movd (%1),%%xmm2 \n"
3613 "pshufb %4,%%xmm3 \n"
3614 "pand %%xmm6,%%xmm2 \n"
3615 "paddw %%xmm7,%%xmm3 \n"
3616 "pmullw %%xmm3,%%xmm2 \n"
3617 "movd (%1),%%xmm1 \n"
3618 "lea 0x4(%1),%1 \n"
3619 "psrlw $0x8,%%xmm1 \n"
3620 "por %%xmm4,%%xmm0 \n"
3621 "pmullw %%xmm3,%%xmm1 \n"
3622 "psrlw $0x8,%%xmm2 \n"
3623 "paddusb %%xmm2,%%xmm0 \n"
3624 "pand %%xmm5,%%xmm1 \n"
3625 "paddusb %%xmm1,%%xmm0 \n"
3626 "sub $0x1,%3 \n"
3627 "movd %%xmm0,(%2) \n"
3628 "lea 0x4(%2),%2 \n"
3629 "jge 91b \n"
3630 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003631 : "+r"(src_argb0), // %0
3632 "+r"(src_argb1), // %1
3633 "+r"(dst_argb), // %2
3634 "+r"(width) // %3
3635 : "m"(kShuffleAlpha) // %4
3636 : "memory", "cc"
3637#if defined(__SSE2__)
3638 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3639#endif
3640 );
3641}
3642#endif // HAS_ARGBBLENDROW_SSSE3
3643
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003644#ifdef HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003645// Attenuate 4 pixels at a time.
3646// aligned to 16 bytes
// Multiplies bytes 0-2 of every 4-byte pixel by that pixel's alpha (byte 3) and
// copies the alpha byte through unchanged, 4 pixels (16 bytes) per iteration.
// Each colour byte c with alpha a becomes ((c * 257) * (a * 257)) >> 24.
//   src_argb/dst_argb - accessed with movdqa; both must be 16-byte aligned.
//   width             - pixel count, decremented by 4 per iteration. The loop
//                       body runs before the count is tested; presumably
//                       callers pass a positive multiple of 4 - confirm.
3647void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3648 asm volatile (
    // %1 becomes dst_argb - src_argb so that (%0,%1,1) addresses the output.
3649 "sub %0,%1 \n"
    // xmm4 = 0xff000000 per dword (alpha mask), xmm5 = 0x00ffffff (colour mask).
3650 "pcmpeqb %%xmm4,%%xmm4 \n"
3651 "pslld $0x18,%%xmm4 \n"
3652 "pcmpeqb %%xmm5,%%xmm5 \n"
3653 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003654
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003655 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003656 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003657 "1: \n"
    // Pixels 0-1: duplicate each byte into a word (c * 257), broadcast the
    // alpha word of each pixel across its four words, multiply keeping the
    // high 16 bits.
3658 "movdqa (%0),%%xmm0 \n"
3659 "punpcklbw %%xmm0,%%xmm0 \n"
3660 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3661 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3662 "pmulhuw %%xmm2,%%xmm0 \n"
    // Pixels 2-3: same on the high half of the input.
3663 "movdqa (%0),%%xmm1 \n"
3664 "punpckhbw %%xmm1,%%xmm1 \n"
3665 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3666 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3667 "pmulhuw %%xmm2,%%xmm1 \n"
    // xmm2 = original alpha bytes only.
fbarchard@google.com810cd912012-04-20 20:15:27 +00003668 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003669 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003670 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003671 "psrlw $0x8,%%xmm1 \n"
3672 "packuswb %%xmm1,%%xmm0 \n"
    // Drop the computed alpha lane and substitute the original alpha.
3673 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003674 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003675 "sub $0x4,%2 \n"
3676 "movdqa %%xmm0,(%0,%1,1) \n"
3677 "lea 0x10(%0),%0 \n"
3678 "jg 1b \n"
3679 : "+r"(src_argb), // %0
3680 "+r"(dst_argb), // %1
3681 "+r"(width) // %2
3682 :
3683 : "memory", "cc"
3684#if defined(__SSE2__)
    // xmm3 is listed as clobbered although the code above does not use it.
3685 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3686#endif
3687 );
3688}
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003689#endif // HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003690
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003691#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003692// Shuffle table duplicating alpha
// pshufb controls used by ARGBAttenuateRow_SSSE3. kShuffleAlpha0 covers pixels
// 0-1 (alpha bytes 3 and 7) and kShuffleAlpha1 covers pixels 2-3 (alpha bytes
// 11 and 15). For each pixel the alpha byte is copied into both bytes of three
// 16-bit words; the fourth word is zeroed (index 128 has the high bit set).
3693CONST uvec8 kShuffleAlpha0 = {
3694 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3695};
3696CONST uvec8 kShuffleAlpha1 = {
3697 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3698 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3699};
3700// Attenuate 4 pixels at a time.
3701// aligned to 16 bytes
// Multiplies bytes 0-2 of every 4-byte pixel by that pixel's alpha (byte 3) and
// copies the alpha byte through unchanged, 4 pixels (16 bytes) per iteration.
// Each colour byte c with alpha a becomes ((c * 257) * (a * 257)) >> 24.
// pshufb with kShuffleAlpha0/kShuffleAlpha1 builds the per-pixel alpha words
// and leaves a zero multiplier in the alpha lane, so no colour mask is needed
// before the original alpha is OR-ed back in.
//   src_argb/dst_argb - accessed with movdqa; both must be 16-byte aligned.
//   width             - pixel count, decremented by 4 per iteration. The loop
//                       body runs before the count is tested; presumably
//                       callers pass a positive multiple of 4 - confirm.
3702void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3703 asm volatile (
    // %1 becomes dst_argb - src_argb so that (%0,%1,1) addresses the output.
3704 "sub %0,%1 \n"
    // xmm3 = 0xff000000 per dword (alpha mask); xmm4/xmm5 = shuffle tables.
3705 "pcmpeqb %%xmm3,%%xmm3 \n"
3706 "pslld $0x18,%%xmm3 \n"
3707 "movdqa %3,%%xmm4 \n"
3708 "movdqa %4,%%xmm5 \n"
3709
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003710 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003711 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003712 "1: \n"
    // Pixels 0-1: xmm0 = alpha words, xmm1 = each byte duplicated into a word.
3713 "movdqa (%0),%%xmm0 \n"
3714 "pshufb %%xmm4,%%xmm0 \n"
3715 "movdqa (%0),%%xmm1 \n"
3716 "punpcklbw %%xmm1,%%xmm1 \n"
3717 "pmulhuw %%xmm1,%%xmm0 \n"
    // Pixels 2-3: xmm1 = alpha words, xmm2 = each byte duplicated into a word.
3718 "movdqa (%0),%%xmm1 \n"
3719 "pshufb %%xmm5,%%xmm1 \n"
3720 "movdqa (%0),%%xmm2 \n"
3721 "punpckhbw %%xmm2,%%xmm2 \n"
3722 "pmulhuw %%xmm2,%%xmm1 \n"
    // xmm2 = original alpha bytes only.
3723 "movdqa (%0),%%xmm2 \n"
3724 "pand %%xmm3,%%xmm2 \n"
3725 "psrlw $0x8,%%xmm0 \n"
3726 "psrlw $0x8,%%xmm1 \n"
3727 "packuswb %%xmm1,%%xmm0 \n"
3728 "por %%xmm2,%%xmm0 \n"
3729 "sub $0x4,%2 \n"
3730 "movdqa %%xmm0,(%0,%1,1) \n"
3731 "lea 0x10(%0),%0 \n"
3732 "jg 1b \n"
3733 : "+r"(src_argb), // %0
3734 "+r"(dst_argb), // %1
3735 "+r"(width) // %2
3736 : "m"(kShuffleAlpha0), // %3
3737 "m"(kShuffleAlpha1) // %4
3738 : "memory", "cc"
3739#if defined(__SSE2__)
3740 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3741#endif
3742 );
3743}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003744#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003745
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003746#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003747// Unattenuate 4 pixels at a time.
3748// aligned to 16 bytes
// Scales bytes 0-2 of every 4-byte pixel by a factor looked up from
// fixed_invtbl8 using the pixel's alpha byte as index, and copies the alpha
// byte through unchanged; 4 pixels (16 bytes) per iteration.
// Each colour byte c becomes saturate(((c * 257) * t) >> 16), where t is the
// low 16 bits of the 32-bit table entry fixed_invtbl8[alpha]. The table is
// defined outside this function - presumably it holds fixed-point reciprocals
// of alpha; confirm against its definition.
//   src_argb/dst_argb - accessed with movdqa; both must be 16-byte aligned.
//   width             - pixel count, decremented by 4 per iteration. The loop
//                       body runs before the count is tested; presumably
//                       callers pass a positive multiple of 4 - confirm.
3749void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3750 int width) {
    // Scratch register for the alpha byte / table index inside the asm.
3751 uintptr_t alpha = 0;
3752 asm volatile (
    // %1 becomes dst_argb - src_argb so that (%0,%1,1) addresses the output.
3753 "sub %0,%1 \n"
    // xmm4 = 0xff000000 per dword (alpha mask).
3754 "pcmpeqb %%xmm4,%%xmm4 \n"
3755 "pslld $0x18,%%xmm4 \n"
3756
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003757 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003758 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003759 "1: \n"
    // Pixels 0-1: look up the table entry for each alpha (bytes 3 and 7).
3760 "movdqa (%0),%%xmm0 \n"
3761 "movzb 0x3(%0),%3 \n"
3762 "punpcklbw %%xmm0,%%xmm0 \n"
3763 "movd 0x0(%4,%3,4),%%xmm2 \n"
3764 "movzb 0x7(%0),%3 \n"
3765 "movd 0x0(%4,%3,4),%%xmm3 \n"
    // pshuflw $0xc0 copies word 0 of the entry into words 0-2 and word 3
    // (zero after movd) into word 3, so the alpha lane is multiplied by 0.
3766 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3767 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3768 "movlhps %%xmm3,%%xmm2 \n"
3769 "pmulhuw %%xmm2,%%xmm0 \n"
    // Pixels 2-3: same with alpha bytes 11 and 15.
3770 "movdqa (%0),%%xmm1 \n"
3771 "movzb 0xb(%0),%3 \n"
3772 "punpckhbw %%xmm1,%%xmm1 \n"
3773 "movd 0x0(%4,%3,4),%%xmm2 \n"
3774 "movzb 0xf(%0),%3 \n"
3775 "movd 0x0(%4,%3,4),%%xmm3 \n"
3776 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3777 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3778 "movlhps %%xmm3,%%xmm2 \n"
3779 "pmulhuw %%xmm2,%%xmm1 \n"
    // Pack with unsigned saturation and OR the original alpha back in.
3780 "movdqa (%0),%%xmm2 \n"
3781 "pand %%xmm4,%%xmm2 \n"
3782 "packuswb %%xmm1,%%xmm0 \n"
3783 "por %%xmm2,%%xmm0 \n"
3784 "sub $0x4,%2 \n"
3785 "movdqa %%xmm0,(%0,%1,1) \n"
3786 "lea 0x10(%0),%0 \n"
3787 "jg 1b \n"
3788 : "+r"(src_argb), // %0
3789 "+r"(dst_argb), // %1
3790 "+r"(width), // %2
3791 "+r"(alpha) // %3
3792 : "r"(fixed_invtbl8) // %4
3793 : "memory", "cc"
3794#if defined(__SSE2__)
    // xmm5 is listed as clobbered although the code above does not use it.
3795 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3796#endif
3797 );
3798}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003799#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003800
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003801#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003802// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// Signed byte weights repeated for four pixels; within each pixel they apply
// to bytes 0, 1, 2, 3 in that order, with weight 0 for byte 3 (alpha). The
// weights sum to 128, matching the shift right by 7 in ARGBGrayRow_SSSE3.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003803CONST vec8 kARGBToGray = {
3804 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3805};
3806
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003807// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Replaces bytes 0-2 of every 4-byte pixel with a single gray value and copies
// the alpha byte (byte 3) through unchanged, 8 pixels (32 bytes) per iteration.
// gray = (byte0 * 14 + byte1 * 76 + byte2 * 38) >> 7, using kARGBToGray.
//   src_argb/dst_argb - accessed with movdqa; both must be 16-byte aligned.
//   width             - pixel count, decremented by 8 per iteration. The loop
//                       body runs before the count is tested; presumably
//                       callers pass a positive multiple of 8 - confirm.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003808void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003809 asm volatile (
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003810 "movdqa %3,%%xmm4 \n"
    // %1 becomes dst_argb - src_argb so that (%0,%1,1) addresses the output.
3811 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003812
3813 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003814 ".p2align 4 \n"
3815 "1: \n"
    // Weighted sum per pixel: pmaddubsw gives two partial sums per pixel,
    // phaddw adds them; then >> 7 and pack to 8 gray bytes.
3816 "movdqa (%0),%%xmm0 \n"
3817 "movdqa 0x10(%0),%%xmm1 \n"
3818 "pmaddubsw %%xmm4,%%xmm0 \n"
3819 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003820 "phaddw %%xmm1,%%xmm0 \n"
3821 "psrlw $0x7,%%xmm0 \n"
3822 "packuswb %%xmm0,%%xmm0 \n"
    // Extract the 8 alpha bytes (top byte of each dword) into xmm2.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003823 "movdqa (%0),%%xmm2 \n"
3824 "movdqa 0x10(%0),%%xmm3 \n"
3825 "psrld $0x18,%%xmm2 \n"
3826 "psrld $0x18,%%xmm3 \n"
3827 "packuswb %%xmm3,%%xmm2 \n"
3828 "packuswb %%xmm2,%%xmm2 \n"
    // Interleave to gray,gray,gray,alpha per pixel.
3829 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003830 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003831 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003832 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003833 "punpcklwd %%xmm3,%%xmm0 \n"
3834 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003835 "sub $0x8,%2 \n"
3836 "movdqa %%xmm0,(%0,%1,1) \n"
3837 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003838 "lea 0x20(%0),%0 \n"
3839 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003840 : "+r"(src_argb), // %0
3841 "+r"(dst_argb), // %1
3842 "+r"(width) // %2
3843 : "m"(kARGBToGray) // %3
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003844 : "memory", "cc"
3845#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00003846 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003847#endif
3848 );
3849}
3850#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003851
3852#ifdef HAS_ARGBSEPIAROW_SSSE3
3853// b = (r * 35 + g * 68 + b * 17) >> 7
3854// g = (r * 45 + g * 88 + b * 22) >> 7
3855// r = (r * 50 + g * 98 + b * 24) >> 7
3856// Constant for ARGB color to sepia tone
// Weight tables for ARGBSepiaRow_SSSE3, one per output channel, repeated for
// four pixels. Within each pixel the weights apply to bytes 0, 1, 2, 3 in that
// order (b, g, r, a in the formulas above); the weight for byte 3 is 0.
3857CONST vec8 kARGBToSepiaB = {
3858 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3859};
3860
3861CONST vec8 kARGBToSepiaG = {
3862 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3863};
3864
3865CONST vec8 kARGBToSepiaR = {
3866 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3867};
3868
fbarchard@google.come442dc42012-06-18 17:37:09 +00003869// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Applies the sepia weights to a row in place, 8 pixels (32 bytes) per
// iteration. For each pixel, output bytes 0, 1 and 2 are the weighted sums of
// input bytes 0-2 using kARGBToSepiaB, kARGBToSepiaG and kARGBToSepiaR
// respectively, shifted right by 7 and packed with unsigned saturation. Byte 3
// (alpha) is copied through unchanged.
//   dst_argb - read and written with movdqa; must be 16-byte aligned.
//   width    - pixel count, decremented by 8 per iteration. The loop body runs
//              before the count is tested; presumably callers pass a positive
//              multiple of 8 - confirm.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003870void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3871 asm volatile (
3872 "movdqa %2,%%xmm2 \n"
3873 "movdqa %3,%%xmm3 \n"
3874 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003875
3876 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003877 ".p2align 4 \n"
3878 "1: \n"
    // Output byte 0 for 8 pixels -> low 8 bytes of xmm0.
3879 "movdqa (%0),%%xmm0 \n"
3880 "movdqa 0x10(%0),%%xmm6 \n"
3881 "pmaddubsw %%xmm2,%%xmm0 \n"
3882 "pmaddubsw %%xmm2,%%xmm6 \n"
3883 "phaddw %%xmm6,%%xmm0 \n"
3884 "psrlw $0x7,%%xmm0 \n"
3885 "packuswb %%xmm0,%%xmm0 \n"
    // Output byte 1 for 8 pixels -> xmm5, then interleave into xmm0.
3886 "movdqa (%0),%%xmm5 \n"
3887 "movdqa 0x10(%0),%%xmm1 \n"
3888 "pmaddubsw %%xmm3,%%xmm5 \n"
3889 "pmaddubsw %%xmm3,%%xmm1 \n"
3890 "phaddw %%xmm1,%%xmm5 \n"
3891 "psrlw $0x7,%%xmm5 \n"
3892 "packuswb %%xmm5,%%xmm5 \n"
3893 "punpcklbw %%xmm5,%%xmm0 \n"
    // Output byte 2 for 8 pixels -> xmm5.
3894 "movdqa (%0),%%xmm5 \n"
3895 "movdqa 0x10(%0),%%xmm1 \n"
3896 "pmaddubsw %%xmm4,%%xmm5 \n"
3897 "pmaddubsw %%xmm4,%%xmm1 \n"
3898 "phaddw %%xmm1,%%xmm5 \n"
3899 "psrlw $0x7,%%xmm5 \n"
3900 "packuswb %%xmm5,%%xmm5 \n"
    // Original alpha bytes -> xmm6, interleaved with byte 2 in xmm5.
3901 "movdqa (%0),%%xmm6 \n"
3902 "movdqa 0x10(%0),%%xmm1 \n"
3903 "psrld $0x18,%%xmm6 \n"
3904 "psrld $0x18,%%xmm1 \n"
3905 "packuswb %%xmm1,%%xmm6 \n"
3906 "packuswb %%xmm6,%%xmm6 \n"
3907 "punpcklbw %%xmm6,%%xmm5 \n"
    // Combine the (byte0,byte1) and (byte2,alpha) word pairs into pixels.
3908 "movdqa %%xmm0,%%xmm1 \n"
3909 "punpcklwd %%xmm5,%%xmm0 \n"
3910 "punpckhwd %%xmm5,%%xmm1 \n"
3911 "sub $0x8,%1 \n"
3912 "movdqa %%xmm0,(%0) \n"
3913 "movdqa %%xmm1,0x10(%0) \n"
3914 "lea 0x20(%0),%0 \n"
3915 "jg 1b \n"
3916 : "+r"(dst_argb), // %0
3917 "+r"(width) // %1
3918 : "m"(kARGBToSepiaB), // %2
3919 "m"(kARGBToSepiaG), // %3
3920 "m"(kARGBToSepiaR) // %4
3921 : "memory", "cc"
3922#if defined(__SSE2__)
3923 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3924#endif
3925 );
3926}
3927#endif // HAS_ARGBSEPIAROW_SSSE3
3928
fbarchard@google.come442dc42012-06-18 17:37:09 +00003929#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3930// Transform 8 ARGB pixels (32 bytes) with color matrix.
3931// Same as Sepia except matrix is provided.
// Applies a caller-supplied matrix of signed byte weights to a row in place,
// 8 pixels (32 bytes) per iteration.
//   dst_argb    - read and written with movdqa; must be 16-byte aligned.
//   matrix_argb - the first 12 bytes are read: three groups of four signed
//                 weights, one group per output byte 0, 1 and 2. Each group is
//                 broadcast to all four pixel positions of a register.
//   width       - pixel count, decremented by 8 per iteration. The loop body
//                 runs before the count is tested; presumably callers pass a
//                 positive multiple of 8 - confirm.
// Each output byte is the weighted sum of the pixel's four input bytes, shifted
// right arithmetically by 7 and packed with unsigned saturation (negative sums
// become 0). Sums use saturating signed adds (phaddsw). Byte 3 (alpha) is
// copied through unchanged.
3932void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3933 int width) {
3934 asm volatile (
    // Load the three 4-byte weight groups and broadcast each across a register.
3935 "movd (%2),%%xmm2 \n"
3936 "movd 0x4(%2),%%xmm3 \n"
3937 "movd 0x8(%2),%%xmm4 \n"
3938 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3939 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3940 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003941
3942 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003943 ".p2align 4 \n"
3944 "1: \n"
    // Output bytes 0 (xmm0) and 1 (xmm5) for 8 pixels.
3945 "movdqa (%0),%%xmm0 \n"
3946 "movdqa 0x10(%0),%%xmm6 \n"
3947 "pmaddubsw %%xmm2,%%xmm0 \n"
3948 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003949 "movdqa (%0),%%xmm5 \n"
3950 "movdqa 0x10(%0),%%xmm1 \n"
3951 "pmaddubsw %%xmm3,%%xmm5 \n"
3952 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003953 "phaddsw %%xmm6,%%xmm0 \n"
3954 "phaddsw %%xmm1,%%xmm5 \n"
3955 "psraw $0x7,%%xmm0 \n"
3956 "psraw $0x7,%%xmm5 \n"
3957 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003958 "packuswb %%xmm5,%%xmm5 \n"
3959 "punpcklbw %%xmm5,%%xmm0 \n"
    // Output byte 2 for 8 pixels -> xmm5.
3960 "movdqa (%0),%%xmm5 \n"
3961 "movdqa 0x10(%0),%%xmm1 \n"
3962 "pmaddubsw %%xmm4,%%xmm5 \n"
3963 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003964 "phaddsw %%xmm1,%%xmm5 \n"
3965 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003966 "packuswb %%xmm5,%%xmm5 \n"
    // Original alpha bytes -> xmm6, interleaved with byte 2 in xmm5.
3967 "movdqa (%0),%%xmm6 \n"
3968 "movdqa 0x10(%0),%%xmm1 \n"
3969 "psrld $0x18,%%xmm6 \n"
3970 "psrld $0x18,%%xmm1 \n"
3971 "packuswb %%xmm1,%%xmm6 \n"
3972 "packuswb %%xmm6,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003973 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003974 "punpcklbw %%xmm6,%%xmm5 \n"
    // Combine the (byte0,byte1) and (byte2,alpha) word pairs into pixels.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003975 "punpcklwd %%xmm5,%%xmm0 \n"
3976 "punpckhwd %%xmm5,%%xmm1 \n"
3977 "sub $0x8,%1 \n"
3978 "movdqa %%xmm0,(%0) \n"
3979 "movdqa %%xmm1,0x10(%0) \n"
3980 "lea 0x20(%0),%0 \n"
3981 "jg 1b \n"
3982 : "+r"(dst_argb), // %0
3983 "+r"(width) // %1
3984 : "r"(matrix_argb) // %2
3985 : "memory", "cc"
3986#if defined(__SSE2__)
3987 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3988#endif
3989 );
3990}
3991#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3992
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003993#ifdef HAS_ARGBQUANTIZEROW_SSE2
3994// Quantize 4 ARGB pixels (16 bytes).
3995// aligned to 16 bytes
// Quantizes bytes 0-2 of every 4-byte pixel in place, 4 pixels (16 bytes) per
// iteration. In 16-bit lanes each colour byte c becomes
//   ((c * scale) >> 16) * interval_size + interval_offset
// using the low 16 bits of each parameter, then is packed back to a byte with
// unsigned saturation. The original alpha byte is OR-ed into the result.
//   dst_argb - read and written with movdqa; must be 16-byte aligned.
//   width    - pixel count, decremented by 4 per iteration. The loop body runs
//              before the count is tested; presumably callers pass a positive
//              multiple of 4 - confirm.
// NOTE(review): the alpha lane is computed from the upper 16 bits of scale,
// interval_size and interval_offset (word 1 of each movd) and is not masked
// before the OR, so alpha is preserved exactly only when that lane evaluates to
// 0 - e.g. when all three parameters fit in 16 unsigned bits. Confirm callers
// stay in that range.
3996void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3997 int interval_offset, int width) {
3998 asm volatile (
3999 "movd %2,%%xmm2 \n"
4000 "movd %3,%%xmm3 \n"
4001 "movd %4,%%xmm4 \n"
    // pshuflw $0x40 puts word 0 in words 0-2 and word 1 in word 3;
    // pshufd $0x44 then copies that low quadword to the high quadword.
4002 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
4003 "pshufd $0x44,%%xmm2,%%xmm2 \n"
4004 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
4005 "pshufd $0x44,%%xmm3,%%xmm3 \n"
4006 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
4007 "pshufd $0x44,%%xmm4,%%xmm4 \n"
    // xmm5 = 0 (for zero-extending bytes), xmm6 = 0xff000000 per dword.
4008 "pxor %%xmm5,%%xmm5 \n"
4009 "pcmpeqb %%xmm6,%%xmm6 \n"
4010 "pslld $0x18,%%xmm6 \n"
4011
4012 // 4 pixel loop.
4013 ".p2align 2 \n"
4014 "1: \n"
    // Zero-extend bytes to words and take the high 16 bits of c * scale.
4015 "movdqa (%0),%%xmm0 \n"
4016 "punpcklbw %%xmm5,%%xmm0 \n"
4017 "pmulhuw %%xmm2,%%xmm0 \n"
4018 "movdqa (%0),%%xmm1 \n"
4019 "punpckhbw %%xmm5,%%xmm1 \n"
4020 "pmulhuw %%xmm2,%%xmm1 \n"
    // Multiply by interval_size, add interval_offset; xmm7 = original alpha.
4021 "pmullw %%xmm3,%%xmm0 \n"
4022 "movdqa (%0),%%xmm7 \n"
4023 "pmullw %%xmm3,%%xmm1 \n"
4024 "pand %%xmm6,%%xmm7 \n"
4025 "paddw %%xmm4,%%xmm0 \n"
4026 "paddw %%xmm4,%%xmm1 \n"
4027 "packuswb %%xmm1,%%xmm0 \n"
4028 "por %%xmm7,%%xmm0 \n"
4029 "sub $0x4,%1 \n"
4030 "movdqa %%xmm0,(%0) \n"
4031 "lea 0x10(%0),%0 \n"
4032 "jg 1b \n"
4033 : "+r"(dst_argb), // %0
4034 "+r"(width) // %1
4035 : "r"(scale), // %2
4036 "r"(interval_size), // %3
4037 "r"(interval_offset) // %4
4038 : "memory", "cc"
4039#if defined(__SSE2__)
4040 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4041#endif
4042 );
4043}
4044#endif // HAS_ARGBQUANTIZEROW_SSE2
4045
fbarchard@google.comb94b1392012-12-03 20:36:40 +00004046#ifdef HAS_ARGBSHADEROW_SSE2
4047// Shade 4 pixels at a time by specified value.
4048// Aligned to 16 bytes.
// Scales every byte of every 4-byte pixel by the corresponding byte of value
// (byte i of each pixel by byte i of value, alpha included), 4 pixels
// (16 bytes) per iteration. Each byte c with scale byte v becomes
//   ((c * 257) * (v * 257)) >> 24.
//   src_argb/dst_argb - accessed with movdqa; both must be 16-byte aligned.
//   width             - pixel count, decremented by 4 per iteration. The loop
//                       body runs before the count is tested; presumably
//                       callers pass a positive multiple of 4 - confirm.
//   value             - four scale bytes packed in one 32-bit word.
4049void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4050 uint32 value) {
4051 asm volatile (
4052 "movd %3,%%xmm2 \n"
    // %1 becomes dst_argb - src_argb so that (%0,%1,1) addresses the output.
4053 "sub %0,%1 \n"
    // Duplicate each byte of value into a word (v * 257), then copy the four
    // words to both halves of xmm2.
4054 "punpcklbw %%xmm2,%%xmm2 \n"
4055 "punpcklqdq %%xmm2,%%xmm2 \n"
4056
4057 // 4 pixel loop.
4058 ".p2align 2 \n"
4059 "1: \n"
    // Duplicate each source byte into a word (c * 257), multiply keeping the
    // high 16 bits, shift right by 8 and pack back to bytes.
4060 "movdqa (%0),%%xmm0 \n"
4061 "movdqa %%xmm0,%%xmm1 \n"
4062 "punpcklbw %%xmm0,%%xmm0 \n"
4063 "punpckhbw %%xmm1,%%xmm1 \n"
4064 "pmulhuw %%xmm2,%%xmm0 \n"
4065 "pmulhuw %%xmm2,%%xmm1 \n"
4066 "psrlw $0x8,%%xmm0 \n"
4067 "psrlw $0x8,%%xmm1 \n"
4068 "packuswb %%xmm1,%%xmm0 \n"
4069 "sub $0x4,%2 \n"
4070 "movdqa %%xmm0,(%0,%1,1) \n"
4071 "lea 0x10(%0),%0 \n"
4072 "jg 1b \n"
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004073 : "+r"(src_argb), // %0
4074 "+r"(dst_argb), // %1
4075 "+r"(width) // %2
4076 : "r"(value) // %3
fbarchard@google.comb94b1392012-12-03 20:36:40 +00004077 : "memory", "cc"
4078#if defined(__SSE2__)
4079 , "xmm0", "xmm1", "xmm2"
4080#endif
4081 );
4082}
4083#endif // HAS_ARGBSHADEROW_SSE2
4084
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiplies two rows of ARGB pixels channel by channel, 4 pixels (16 bytes)
// per iteration, writing the products to dst_argb.
// All three pointers are accessed with movdqa and must be 16 byte aligned.
// The loop body runs before width is tested.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // xmm5 = 0, used to zero extend the second source.
    "pxor %%xmm5,%%xmm5 \n"
    // Express src1 and dst as offsets from src0; only src0 is advanced.
    "sub %[src0],%[src1] \n"
    "sub %[src0],%[dst] \n"

    // Main loop: 4 pixels per pass.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%[src0]),%%xmm0 \n"
    "movdqa (%[src0],%[src1]),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    // First source: each byte duplicated into a 16 bit word.
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    // Second source: each byte zero extended to a 16 bit word.
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpckhbw %%xmm5,%%xmm3 \n"
    // Keep the high 16 bits of each unsigned product.
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // Flags set here survive the store and lea below and drive the jg.
    "sub $0x4,%[width] \n"
    "movdqa %%xmm0,(%[src0],%[dst],1) \n"
    "lea 0x10(%[src0]),%[src0] \n"
    "jg 1b \n"
  : [src0]"+r"(src_argb0),
    [src1]"+r"(src_argb1),
    [dst]"+r"(dst_argb),
    [width]"+r"(width)
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
4125
#ifdef HAS_ARGBADDROW_SSE2
// Adds two rows of ARGB pixels byte by byte with unsigned saturation,
// 4 pixels (16 bytes) per iteration.
// All three pointers are accessed with movdqa and must be 16 byte aligned.
// The loop body runs before width is tested.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // Express src1 and dst as offsets from src0; only src0 is advanced.
    "sub %[src0],%[src1] \n"
    "sub %[src0],%[dst] \n"

    // Main loop: 4 pixels per pass.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%[src0]),%%xmm0 \n"
    "movdqa (%[src0],%[src1]),%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    // Flags set here survive the store and lea below and drive the jg.
    "sub $0x4,%[width] \n"
    "movdqa %%xmm0,(%[src0],%[dst],1) \n"
    "lea 0x10(%[src0]),%[src0] \n"
    "jg 1b \n"
  : [src0]"+r"(src_argb0),
    [src1]"+r"(src_argb1),
    [dst]"+r"(dst_argb),
    [width]"+r"(width)
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBADDROW_SSE2
4157
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Computes src_argb0 - src_argb1 byte by byte with unsigned saturation
// (results below zero clamp to 0), 4 pixels (16 bytes) per iteration.
// All three pointers are accessed with movdqa and must be 16 byte aligned.
// The loop body runs before width is tested.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // Express src1 and dst as offsets from src0; only src0 is advanced.
    "sub %[src0],%[src1] \n"
    "sub %[src0],%[dst] \n"

    // Main loop: 4 pixels per pass.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%[src0]),%%xmm0 \n"
    "movdqa (%[src0],%[src1]),%%xmm1 \n"
    "psubusb %%xmm1,%%xmm0 \n"
    // Flags set here survive the store and lea below and drive the jg.
    "sub $0x4,%[width] \n"
    "movdqa %%xmm0,(%[src0],%[dst],1) \n"
    "lea 0x10(%[src0]),%[src0] \n"
    "jg 1b \n"
  : [src0]"+r"(src_argb0),
    [src1]"+r"(src_argb1),
    [dst]"+r"(dst_argb),
    [width]"+r"(width)
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
4189
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Builds one row of a cumulative sum table.  For every pixel the 4 channel
// bytes of |row| are widened to int32 and added to a running total for this
// row; that total plus the matching 4 int32 entries of |previous_cumsum| is
// written to |cumsum|.
// When at least 4 pixels remain and cumsum is 16 byte aligned, a 4 pixel
// loop using aligned table accesses is used; leftovers (or the whole row if
// cumsum is unaligned) go through a 1 pixel loop using unaligned accesses.
// NOTE(review): only cumsum's alignment is tested, yet the 4 pixel loop also
// reads previous_cumsum with movdqa - presumably callers keep both rows
// equally aligned; confirm against callers.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    // previous_cumsum becomes an offset from cumsum.
    "sub %[cumsum],%[prev] \n"
    // xmm0 = running sum for this row, xmm1 = zero for widening.
    "pxor %%xmm0,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "sub $0x4,%[width] \n"
    "jl 49f \n"
    "test $0xf,%[cumsum] \n"
    "jne 49f \n"

    // 4 pixels per pass.
    ".p2align 2 \n"
  "40: \n"
    "movdqu (%[row]),%%xmm2 \n"
    "lea 0x10(%[row]),%[row] \n"
    "movdqa %%xmm2,%%xmm4 \n"
    // Widen 16 bytes to 4 x 4 int32: xmm2, xmm3, xmm4, xmm5 = pixels 0..3.
    "punpcklbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "punpckhwd %%xmm1,%%xmm3 \n"
    "punpckhbw %%xmm1,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"
    "punpckhwd %%xmm1,%%xmm5 \n"
    // Accumulate each pixel into the running sum and add the previous row.
    "paddd %%xmm2,%%xmm0 \n"
    "movdqa (%[cumsum],%[prev],1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "movdqa 0x10(%[cumsum],%[prev],1),%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"
    "movdqa 0x20(%[cumsum],%[prev],1),%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"
    "movdqa 0x30(%[cumsum],%[prev],1),%%xmm5 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqa %%xmm2,(%[cumsum]) \n"
    "movdqa %%xmm3,0x10(%[cumsum]) \n"
    "movdqa %%xmm4,0x20(%[cumsum]) \n"
    "movdqa %%xmm5,0x30(%[cumsum]) \n"
    "lea 0x40(%[cumsum]),%[cumsum] \n"
    "sub $0x4,%[width] \n"
    "jge 40b \n"

  "49: \n"
    // width is biased by -4 here; +3 leaves (remaining pixels - 1).
    "add $0x3,%[width] \n"
    "jl 19f \n"

    // 1 pixel per pass.
    ".p2align 2 \n"
  "10: \n"
    "movd (%[row]),%%xmm2 \n"
    "lea 0x4(%[row]),%[row] \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu (%[cumsum],%[prev],1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2,(%[cumsum]) \n"
    "lea 0x10(%[cumsum]),%[cumsum] \n"
    "sub $0x1,%[width] \n"
    "jge 10b \n"

  "19: \n"
  : [row]"+r"(row),
    [cumsum]"+r"(cumsum),
    [prev]"+r"(previous_cumsum),
    [width]"+r"(width)
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4270
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Converts cumulative sums into averaged ARGB pixels.
// For each output pixel the box sum is
//   topleft[0] - topleft[width] - botleft[0] + botleft[width]
// where every term is 4 int32 values (one per channel) and |width| is an
// offset counted in int32 elements.  The sum is multiplied by an approximate
// reciprocal of |area| (rcpss), converted back to integers and packed with
// saturation to 4 bytes.  |count| is the number of pixels to produce.
// topleft and botleft (and the rows offset by width) are read with aligned
// memory operands and must be 16 byte aligned; dst may be unaligned.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  asm volatile (
    // xmm4 = approximately 1.0f / area in all 4 lanes.
    "movd %5,%%xmm4 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "rcpss %%xmm4,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    // count is constrained "+rm" and may live in memory, so every
    // instruction that modifies it carries an explicit 'l' size suffix.
    "subl $0x4,%3 \n"
    "jl 49f \n"

    // 4 pixel loop
    ".p2align 2 \n"
  "40: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "psubd (%0,%4,4),%%xmm0 \n"
    "psubd 0x10(%0,%4,4),%%xmm1 \n"
    "psubd 0x20(%0,%4,4),%%xmm2 \n"
    "psubd 0x30(%0,%4,4),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"
    "psubd 0x10(%1),%%xmm1 \n"
    "psubd 0x20(%1),%%xmm2 \n"
    "psubd 0x30(%1),%%xmm3 \n"
    "paddd (%1,%4,4),%%xmm0 \n"
    "paddd 0x10(%1,%4,4),%%xmm1 \n"
    "paddd 0x20(%1,%4,4),%%xmm2 \n"
    "paddd 0x30(%1,%4,4),%%xmm3 \n"
    "lea 0x40(%1),%1 \n"
    // Scale the box sums by 1 / area in floating point.
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    // Narrow 16 int32 to 16 bytes with saturation.
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "subl $0x4,%3 \n"
    "jge 40b \n"

  "49: \n"
    // count is biased by -4 here; +3 leaves (remaining pixels - 1).
    "addl $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop
    ".p2align 2 \n"
  "10: \n"
    "movdqa (%0),%%xmm0 \n"
    "psubd (%0,%4,4),%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"
    "paddd (%1,%4,4),%%xmm0 \n"
    "lea 0x10(%1),%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "subl $0x1,%3 \n"
    "jge 10b \n"
  "19: \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)      // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004360
#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// Copies ARGB pixels from a source image along a sloped line into one row of
// the destination.
// src_dudv holds 4 floats: the first two are the starting source coordinate,
// the next two are the step added to that coordinate for every output pixel.
// Each coordinate is truncated to integers and turned into a byte offset
// x * 4 + y * src_argb_stride with a single pmaddwd against (4, stride), so
// the coordinates and the stride pass through signed 16 bit lanes.
// Caveat - in 64 bit, movd is used with a 64 bit gpr because Mac gcc produces
// an error if movq is used, e.g. movd %%xmm0,%1.  In 64 bit builds the first
// offset of each pair is also masked with 0x0fffffff.

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    // xmm2 = starting coordinate, xmm7 = per pixel step.
    "movq (%3),%%xmm2 \n"
    "movq 0x8(%3),%%xmm7 \n"
    // xmm5 low dword = (stride << 16) | 4, the pmaddwd multiplier.
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"
    "movd %1,%%xmm5 \n"
    // width is constrained "+rm" and may live in memory, so every
    // instruction that modifies it carries an explicit 'l' size suffix.
    "subl $0x4,%4 \n"
    "jl 49f \n"

    // Set up 4 coordinates: xmm2 = pixels 0,1  xmm3 = pixels 2,3, and
    // xmm4 = 4 * step for advancing both.
    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"

    // 4 pixel loop
    ".p2align 4 \n"
  "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "cvttps2dq %%xmm3,%%xmm1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    // Extract the byte offsets of pixels 0 and 1 into %1 and %5.
#if defined(__x86_64__)
    "movd %%xmm0,%1 \n"
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"
    "shr $32,%5 \n"
    "pshufd $0xEE,%%xmm0,%%xmm0 \n"
#else
    "movd %%xmm0,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
#endif
    "movd (%0,%1,1),%%xmm1 \n"
    "movd (%0,%5,1),%%xmm6 \n"
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1,(%2) \n"
    // Extract the byte offsets of pixels 2 and 3 into %1 and %5.
#if defined(__x86_64__)
    "movd %%xmm0,%1 \n"
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"
    "shr $32,%5 \n"
#else
    "movd %%xmm0,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%5 \n"
#endif
    "movd (%0,%1,1),%%xmm0 \n"
    "movd (%0,%5,1),%%xmm6 \n"
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    // Flags set here survive the store and lea below and drive the jge.
    "subl $0x4,%4 \n"
    "movq %%xmm0,0x08(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jge 40b \n"

  "49: \n"
    // width is biased by -4 here; +3 leaves (remaining pixels - 1).
    "addl $0x3,%4 \n"
    "jl 19f \n"

    // 1 pixel loop
    ".p2align 4 \n"
  "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%1 \n"
#if defined(__x86_64__)
    "and $0x0fffffff,%1 \n"
#endif
    "movd (%0,%1,1),%%xmm0 \n"
    "subl $0x1,%4 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 10b \n"
  "19: \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),    // %4
    "+r"(temp)   // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
4470
fbarchard@google.comb5491752012-11-20 09:44:46 +00004471// Bilinear image filtering.
4472// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
4473void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004474 ptrdiff_t src_stride, int dst_width,
4475 int source_y_fraction) {
4476 asm volatile (
4477 "sub %1,%0 \n"
4478 "shr %3 \n"
4479 "cmp $0x0,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004480 "je 100f \n"
4481 "cmp $0x20,%3 \n"
4482 "je 75f \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004483 "cmp $0x40,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004484 "je 50f \n"
4485 "cmp $0x60,%3 \n"
4486 "je 25f \n"
4487
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004488 "movd %3,%%xmm0 \n"
4489 "neg %3 \n"
4490 "add $0x80,%3 \n"
4491 "movd %3,%%xmm5 \n"
4492 "punpcklbw %%xmm0,%%xmm5 \n"
4493 "punpcklwd %%xmm5,%%xmm5 \n"
4494 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004495
4496 // General purpose row blend.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004497 ".p2align 4 \n"
4498 "1: \n"
4499 "movdqa (%1),%%xmm0 \n"
4500 "movdqa (%1,%4,1),%%xmm2 \n"
4501 "movdqa %%xmm0,%%xmm1 \n"
4502 "punpcklbw %%xmm2,%%xmm0 \n"
4503 "punpckhbw %%xmm2,%%xmm1 \n"
4504 "pmaddubsw %%xmm5,%%xmm0 \n"
4505 "pmaddubsw %%xmm5,%%xmm1 \n"
4506 "psrlw $0x7,%%xmm0 \n"
4507 "psrlw $0x7,%%xmm1 \n"
4508 "packuswb %%xmm1,%%xmm0 \n"
4509 "sub $0x4,%2 \n"
4510 "movdqa %%xmm0,(%1,%0,1) \n"
4511 "lea 0x10(%1),%1 \n"
4512 "jg 1b \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004513 "jmp 99f \n"
4514
4515 // Blend 25 / 75.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004516 ".p2align 4 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004517 "25: \n"
4518 "movdqa (%1),%%xmm0 \n"
4519 "movdqa (%1,%4,1),%%xmm1 \n"
4520 "pavgb %%xmm1,%%xmm0 \n"
4521 "pavgb %%xmm1,%%xmm0 \n"
4522 "sub $0x4,%2 \n"
4523 "movdqa %%xmm0,(%1,%0,1) \n"
4524 "lea 0x10(%1),%1 \n"
4525 "jg 25b \n"
4526 "jmp 99f \n"
4527
4528 // Blend 50 / 50.
4529 ".p2align 4 \n"
4530 "50: \n"
4531 "movdqa (%1),%%xmm0 \n"
4532 "movdqa (%1,%4,1),%%xmm1 \n"
4533 "pavgb %%xmm1,%%xmm0 \n"
4534 "sub $0x4,%2 \n"
4535 "movdqa %%xmm0,(%1,%0,1) \n"
4536 "lea 0x10(%1),%1 \n"
4537 "jg 50b \n"
4538 "jmp 99f \n"
4539
4540 // Blend 75 / 25.
4541 ".p2align 4 \n"
4542 "75: \n"
4543 "movdqa (%1),%%xmm1 \n"
4544 "movdqa (%1,%4,1),%%xmm0 \n"
4545 "pavgb %%xmm1,%%xmm0 \n"
4546 "pavgb %%xmm1,%%xmm0 \n"
4547 "sub $0x4,%2 \n"
4548 "movdqa %%xmm0,(%1,%0,1) \n"
4549 "lea 0x10(%1),%1 \n"
4550 "jg 75b \n"
4551 "jmp 99f \n"
4552
4553 // Blend 100 / 0 - Copy row unchanged.
4554 ".p2align 4 \n"
4555 "100: \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004556 "movdqa (%1),%%xmm0 \n"
4557 "sub $0x4,%2 \n"
4558 "movdqa %%xmm0,(%1,%0,1) \n"
4559 "lea 0x10(%1),%1 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004560 "jg 100b \n"
4561
4562 // Extrude last pixel.
4563 "99: \n"
4564 : "+r"(dst_argb), // %0
4565 "+r"(src_argb), // %1
4566 "+r"(dst_width), // %2
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004567 "+r"(source_y_fraction) // %3
4568 : "r"(static_cast<intptr_t>(src_stride)) // %4
4569 : "memory", "cc"
4570#if defined(__SSE2__)
4571 , "xmm0", "xmm1", "xmm2", "xmm5"
4572#endif
4573 );
4574}
4575
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004576void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4577 uint8* dst_uv, int pix) {
4578 asm volatile (
4579 "sub %0,%1 \n"
4580 ".p2align 4 \n"
4581 "1: \n"
4582 "movdqa (%0),%%xmm0 \n"
4583 "pavgb (%0,%3),%%xmm0 \n"
4584 "sub $0x10,%2 \n"
4585 "movdqa %%xmm0,(%0,%1) \n"
4586 "lea 0x10(%0),%0 \n"
4587 "jg 1b \n"
4588 : "+r"(src_uv), // %0
4589 "+r"(dst_uv), // %1
4590 "+r"(pix) // %2
4591 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4592 : "memory", "cc"
4593#if defined(__SSE2__)
4594 , "xmm0"
4595#endif
4596 );
4597}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004598
4599void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4600 uint32 selector, int pix) {
4601 asm volatile (
4602 "movd %3,%%xmm5 \n"
4603 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4604 ".p2align 4 \n"
4605 "1: \n"
4606 "movdqa (%0),%%xmm0 \n"
4607 "lea 0x10(%0),%0 \n"
4608 "pshufb %%xmm5,%%xmm0 \n"
4609 "sub $0x4,%2 \n"
4610 "movd %%xmm0,(%1) \n"
4611 "lea 0x4(%1),%1 \n"
4612 "jg 1b \n"
4613 : "+r"(src_argb), // %0
4614 "+r"(dst_bayer), // %1
4615 "+r"(pix) // %2
4616 : "g"(selector) // %3
4617 : "memory", "cc"
4618#if defined(__SSE2__)
4619 , "xmm0", "xmm5"
4620#endif
4621 );
4622}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004623
4624void I422ToYUY2Row_SSE2(const uint8* src_y,
4625 const uint8* src_u,
4626 const uint8* src_v,
4627 uint8* dst_frame, int width) {
4628 asm volatile (
4629 "sub %1,%2 \n"
4630 ".p2align 4 \n"
4631 "1: \n"
4632 "movq (%1),%%xmm2 \n"
4633 "movq (%1,%2,1),%%xmm3 \n"
4634 "lea 0x8(%1),%1 \n"
4635 "punpcklbw %%xmm3,%%xmm2 \n"
4636 "movdqa (%0),%%xmm0 \n"
4637 "lea 0x10(%0),%0 \n"
4638 "movdqa %%xmm0,%%xmm1 \n"
4639 "punpcklbw %%xmm2,%%xmm0 \n"
4640 "punpckhbw %%xmm2,%%xmm1 \n"
4641 "movdqa %%xmm0,(%3) \n"
4642 "movdqa %%xmm1,0x10(%3) \n"
4643 "lea 0x20(%3),%3 \n"
4644 "sub $0x10,%4 \n"
4645 "jg 1b \n"
4646 : "+r"(src_y), // %0
4647 "+r"(src_u), // %1
4648 "+r"(src_v), // %2
4649 "+r"(dst_frame), // %3
4650 "+rm"(width) // %4
4651 :
4652 : "memory", "cc"
4653#if defined(__SSE2__)
4654 , "xmm0", "xmm1", "xmm2", "xmm3"
4655#endif
4656 );
4657}
4658
4659void I422ToUYVYRow_SSE2(const uint8* src_y,
4660 const uint8* src_u,
4661 const uint8* src_v,
4662 uint8* dst_frame, int width) {
4663 asm volatile (
4664 "sub %1,%2 \n"
4665 ".p2align 4 \n"
4666 "1: \n"
4667 "movq (%1),%%xmm2 \n"
4668 "movq (%1,%2,1),%%xmm3 \n"
4669 "lea 0x8(%1),%1 \n"
4670 "punpcklbw %%xmm3,%%xmm2 \n"
4671 "movdqa (%0),%%xmm0 \n"
4672 "movdqa %%xmm2,%%xmm1 \n"
4673 "lea 0x10(%0),%0 \n"
4674 "punpcklbw %%xmm0,%%xmm1 \n"
4675 "punpckhbw %%xmm0,%%xmm2 \n"
4676 "movdqa %%xmm1,(%3) \n"
4677 "movdqa %%xmm2,0x10(%3) \n"
4678 "lea 0x20(%3),%3 \n"
4679 "sub $0x10,%4 \n"
4680 "jg 1b \n"
4681 : "+r"(src_y), // %0
4682 "+r"(src_u), // %1
4683 "+r"(src_v), // %2
4684 "+r"(dst_frame), // %3
4685 "+rm"(width) // %4
4686 :
4687 : "memory", "cc"
4688#if defined(__SSE2__)
4689 , "xmm0", "xmm1", "xmm2", "xmm3"
4690#endif
4691 );
4692}
4693
fbarchard@google.com2d11d432012-02-16 02:50:39 +00004694#endif // defined(__x86_64__) || defined(__i386__)
4695
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004696#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00004697} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004698} // namespace libyuv
4699#endif