/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.com83a63e62013-02-27 00:20:29 +000021#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// CONST is the storage/qualifier prefix used for the constant tables below.
// GCC 4.2 on OSX has link error when passing static or const to inline, so on
// Apple targets the tables are declared with no qualifiers at all.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
fbarchard@google.com4de0c432012-10-11 01:25:46 +000072// Constants for RGBA.
73CONST vec8 kRGBAToY = {
74 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
75};
76
77CONST vec8 kRGBAToU = {
78 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
79};
80
81CONST vec8 kRGBAToV = {
82 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
83};
84
fbarchard@google.com714050a2012-02-17 22:59:56 +000085CONST uvec8 kAddY16 = {
86 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000087};
fbarchard@google.com2430e042011-11-11 21:57:06 +000088
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000089CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
92};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000093
fbarchard@google.comba1f5262012-01-12 19:22:41 +000094// Shuffle table for converting RGB24 to ARGB.
95CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000096 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
97};
98
99// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000100CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000101 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
102};
103
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000104// Shuffle table for converting ARGB to RGB24.
105CONST uvec8 kShuffleMaskARGBToRGB24 = {
106 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
107};
108
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000109// Shuffle table for converting ARGB to RAW.
110CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000111 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000112};
113
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000114// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000115CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
116 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
117};
118
119// Shuffle table for converting ARGB to RAW.
120CONST uvec8 kShuffleMaskARGBToRAW_0 = {
121 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
122};
123
fbarchard@google.comb6149762011-11-07 21:58:52 +0000124void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000125 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000126 "pcmpeqb %%xmm5,%%xmm5 \n"
127 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000128 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000129 "1: \n"
130 "movq (%0),%%xmm0 \n"
131 "lea 0x8(%0),%0 \n"
132 "punpcklbw %%xmm0,%%xmm0 \n"
133 "movdqa %%xmm0,%%xmm1 \n"
134 "punpcklwd %%xmm0,%%xmm0 \n"
135 "punpckhwd %%xmm1,%%xmm1 \n"
136 "por %%xmm5,%%xmm0 \n"
137 "por %%xmm5,%%xmm1 \n"
138 "movdqa %%xmm0,(%1) \n"
139 "movdqa %%xmm1,0x10(%1) \n"
140 "lea 0x20(%1),%1 \n"
141 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000142 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000143 : "+r"(src_y), // %0
144 "+r"(dst_argb), // %1
145 "+r"(pix) // %2
146 :
147 : "memory", "cc"
148#if defined(__SSE2__)
149 , "xmm0", "xmm1", "xmm5"
150#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000151 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000152}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000153
fbarchard@google.com00b69a22012-11-02 06:03:28 +0000154void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
155 int pix) {
156 asm volatile (
157 "pcmpeqb %%xmm5,%%xmm5 \n"
158 "pslld $0x18,%%xmm5 \n"
159 ".p2align 4 \n"
160 "1: \n"
161 "movq (%0),%%xmm0 \n"
162 "lea 0x8(%0),%0 \n"
163 "punpcklbw %%xmm0,%%xmm0 \n"
164 "movdqa %%xmm0,%%xmm1 \n"
165 "punpcklwd %%xmm0,%%xmm0 \n"
166 "punpckhwd %%xmm1,%%xmm1 \n"
167 "por %%xmm5,%%xmm0 \n"
168 "por %%xmm5,%%xmm1 \n"
169 "movdqu %%xmm0,(%1) \n"
170 "movdqu %%xmm1,0x10(%1) \n"
171 "lea 0x20(%1),%1 \n"
172 "sub $0x8,%2 \n"
173 "jg 1b \n"
174 : "+r"(src_y), // %0
175 "+r"(dst_argb), // %1
176 "+r"(pix) // %2
177 :
178 : "memory", "cc"
179#if defined(__SSE2__)
180 , "xmm0", "xmm1", "xmm5"
181#endif
182 );
183}
184
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000185void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000186 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000187 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
188 "pslld $0x18,%%xmm5 \n"
189 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000190 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000191 "1: \n"
192 "movdqu (%0),%%xmm0 \n"
193 "movdqu 0x10(%0),%%xmm1 \n"
194 "movdqu 0x20(%0),%%xmm3 \n"
195 "lea 0x30(%0),%0 \n"
196 "movdqa %%xmm3,%%xmm2 \n"
197 "palignr $0x8,%%xmm1,%%xmm2 \n"
198 "pshufb %%xmm4,%%xmm2 \n"
199 "por %%xmm5,%%xmm2 \n"
200 "palignr $0xc,%%xmm0,%%xmm1 \n"
201 "pshufb %%xmm4,%%xmm0 \n"
202 "movdqa %%xmm2,0x20(%1) \n"
203 "por %%xmm5,%%xmm0 \n"
204 "pshufb %%xmm4,%%xmm1 \n"
205 "movdqa %%xmm0,(%1) \n"
206 "por %%xmm5,%%xmm1 \n"
207 "palignr $0x4,%%xmm3,%%xmm3 \n"
208 "pshufb %%xmm4,%%xmm3 \n"
209 "movdqa %%xmm1,0x10(%1) \n"
210 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000211 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000212 "movdqa %%xmm3,0x30(%1) \n"
213 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000214 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000215 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000216 "+r"(dst_argb), // %1
217 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000218 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000219 : "memory", "cc"
220#if defined(__SSE2__)
221 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
222#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000223 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000224}
225
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000226void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000227 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000228 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
229 "pslld $0x18,%%xmm5 \n"
230 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000231 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000232 "1: \n"
233 "movdqu (%0),%%xmm0 \n"
234 "movdqu 0x10(%0),%%xmm1 \n"
235 "movdqu 0x20(%0),%%xmm3 \n"
236 "lea 0x30(%0),%0 \n"
237 "movdqa %%xmm3,%%xmm2 \n"
238 "palignr $0x8,%%xmm1,%%xmm2 \n"
239 "pshufb %%xmm4,%%xmm2 \n"
240 "por %%xmm5,%%xmm2 \n"
241 "palignr $0xc,%%xmm0,%%xmm1 \n"
242 "pshufb %%xmm4,%%xmm0 \n"
243 "movdqa %%xmm2,0x20(%1) \n"
244 "por %%xmm5,%%xmm0 \n"
245 "pshufb %%xmm4,%%xmm1 \n"
246 "movdqa %%xmm0,(%1) \n"
247 "por %%xmm5,%%xmm1 \n"
248 "palignr $0x4,%%xmm3,%%xmm3 \n"
249 "pshufb %%xmm4,%%xmm3 \n"
250 "movdqa %%xmm1,0x10(%1) \n"
251 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000252 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000253 "movdqa %%xmm3,0x30(%1) \n"
254 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000255 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000256 : "+r"(src_raw), // %0
257 "+r"(dst_argb), // %1
258 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000259 : "m"(kShuffleMaskRAWToARGB) // %3
260 : "memory", "cc"
261#if defined(__SSE2__)
262 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
263#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000264 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000265}
266
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000267void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000268 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000269 "mov $0x1080108,%%eax \n"
270 "movd %%eax,%%xmm5 \n"
271 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000272 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000273 "movd %%eax,%%xmm6 \n"
274 "pshufd $0x0,%%xmm6,%%xmm6 \n"
275 "pcmpeqb %%xmm3,%%xmm3 \n"
276 "psllw $0xb,%%xmm3 \n"
277 "pcmpeqb %%xmm4,%%xmm4 \n"
278 "psllw $0xa,%%xmm4 \n"
279 "psrlw $0x5,%%xmm4 \n"
280 "pcmpeqb %%xmm7,%%xmm7 \n"
281 "psllw $0x8,%%xmm7 \n"
282 "sub %0,%1 \n"
283 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000284 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000285 "1: \n"
286 "movdqu (%0),%%xmm0 \n"
287 "movdqa %%xmm0,%%xmm1 \n"
288 "movdqa %%xmm0,%%xmm2 \n"
289 "pand %%xmm3,%%xmm1 \n"
290 "psllw $0xb,%%xmm2 \n"
291 "pmulhuw %%xmm5,%%xmm1 \n"
292 "pmulhuw %%xmm5,%%xmm2 \n"
293 "psllw $0x8,%%xmm1 \n"
294 "por %%xmm2,%%xmm1 \n"
295 "pand %%xmm4,%%xmm0 \n"
296 "pmulhuw %%xmm6,%%xmm0 \n"
297 "por %%xmm7,%%xmm0 \n"
298 "movdqa %%xmm1,%%xmm2 \n"
299 "punpcklbw %%xmm0,%%xmm1 \n"
300 "punpckhbw %%xmm0,%%xmm2 \n"
301 "movdqa %%xmm1,(%1,%0,2) \n"
302 "movdqa %%xmm2,0x10(%1,%0,2) \n"
303 "lea 0x10(%0),%0 \n"
304 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000305 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000306 : "+r"(src), // %0
307 "+r"(dst), // %1
308 "+r"(pix) // %2
309 :
310 : "memory", "cc", "eax"
311#if defined(__SSE2__)
312 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
313#endif
314 );
315}
316
317void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000318 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000319 "mov $0x1080108,%%eax \n"
320 "movd %%eax,%%xmm5 \n"
321 "pshufd $0x0,%%xmm5,%%xmm5 \n"
322 "mov $0x42004200,%%eax \n"
323 "movd %%eax,%%xmm6 \n"
324 "pshufd $0x0,%%xmm6,%%xmm6 \n"
325 "pcmpeqb %%xmm3,%%xmm3 \n"
326 "psllw $0xb,%%xmm3 \n"
327 "movdqa %%xmm3,%%xmm4 \n"
328 "psrlw $0x6,%%xmm4 \n"
329 "pcmpeqb %%xmm7,%%xmm7 \n"
330 "psllw $0x8,%%xmm7 \n"
331 "sub %0,%1 \n"
332 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000333 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000334 "1: \n"
335 "movdqu (%0),%%xmm0 \n"
336 "movdqa %%xmm0,%%xmm1 \n"
337 "movdqa %%xmm0,%%xmm2 \n"
338 "psllw $0x1,%%xmm1 \n"
339 "psllw $0xb,%%xmm2 \n"
340 "pand %%xmm3,%%xmm1 \n"
341 "pmulhuw %%xmm5,%%xmm2 \n"
342 "pmulhuw %%xmm5,%%xmm1 \n"
343 "psllw $0x8,%%xmm1 \n"
344 "por %%xmm2,%%xmm1 \n"
345 "movdqa %%xmm0,%%xmm2 \n"
346 "pand %%xmm4,%%xmm0 \n"
347 "psraw $0x8,%%xmm2 \n"
348 "pmulhuw %%xmm6,%%xmm0 \n"
349 "pand %%xmm7,%%xmm2 \n"
350 "por %%xmm2,%%xmm0 \n"
351 "movdqa %%xmm1,%%xmm2 \n"
352 "punpcklbw %%xmm0,%%xmm1 \n"
353 "punpckhbw %%xmm0,%%xmm2 \n"
354 "movdqa %%xmm1,(%1,%0,2) \n"
355 "movdqa %%xmm2,0x10(%1,%0,2) \n"
356 "lea 0x10(%0),%0 \n"
357 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000358 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000359 : "+r"(src), // %0
360 "+r"(dst), // %1
361 "+r"(pix) // %2
362 :
363 : "memory", "cc", "eax"
364#if defined(__SSE2__)
365 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
366#endif
367 );
368}
369
370void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000371 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000372 "mov $0xf0f0f0f,%%eax \n"
373 "movd %%eax,%%xmm4 \n"
374 "pshufd $0x0,%%xmm4,%%xmm4 \n"
375 "movdqa %%xmm4,%%xmm5 \n"
376 "pslld $0x4,%%xmm5 \n"
377 "sub %0,%1 \n"
378 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000379 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000380 "1: \n"
381 "movdqu (%0),%%xmm0 \n"
382 "movdqa %%xmm0,%%xmm2 \n"
383 "pand %%xmm4,%%xmm0 \n"
384 "pand %%xmm5,%%xmm2 \n"
385 "movdqa %%xmm0,%%xmm1 \n"
386 "movdqa %%xmm2,%%xmm3 \n"
387 "psllw $0x4,%%xmm1 \n"
388 "psrlw $0x4,%%xmm3 \n"
389 "por %%xmm1,%%xmm0 \n"
390 "por %%xmm3,%%xmm2 \n"
391 "movdqa %%xmm0,%%xmm1 \n"
392 "punpcklbw %%xmm2,%%xmm0 \n"
393 "punpckhbw %%xmm2,%%xmm1 \n"
394 "movdqa %%xmm0,(%1,%0,2) \n"
395 "movdqa %%xmm1,0x10(%1,%0,2) \n"
396 "lea 0x10(%0),%0 \n"
397 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000398 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000399 : "+r"(src), // %0
400 "+r"(dst), // %1
401 "+r"(pix) // %2
402 :
403 : "memory", "cc", "eax"
404#if defined(__SSE2__)
405 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
406#endif
407 );
408}
409
410void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000411 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000412 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000413 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000414 "1: \n"
415 "movdqa (%0),%%xmm0 \n"
416 "movdqa 0x10(%0),%%xmm1 \n"
417 "movdqa 0x20(%0),%%xmm2 \n"
418 "movdqa 0x30(%0),%%xmm3 \n"
419 "lea 0x40(%0),%0 \n"
420 "pshufb %%xmm6,%%xmm0 \n"
421 "pshufb %%xmm6,%%xmm1 \n"
422 "pshufb %%xmm6,%%xmm2 \n"
423 "pshufb %%xmm6,%%xmm3 \n"
424 "movdqa %%xmm1,%%xmm4 \n"
425 "psrldq $0x4,%%xmm1 \n"
426 "pslldq $0xc,%%xmm4 \n"
427 "movdqa %%xmm2,%%xmm5 \n"
428 "por %%xmm4,%%xmm0 \n"
429 "pslldq $0x8,%%xmm5 \n"
430 "movdqa %%xmm0,(%1) \n"
431 "por %%xmm5,%%xmm1 \n"
432 "psrldq $0x8,%%xmm2 \n"
433 "pslldq $0x4,%%xmm3 \n"
434 "por %%xmm3,%%xmm2 \n"
435 "movdqa %%xmm1,0x10(%1) \n"
436 "movdqa %%xmm2,0x20(%1) \n"
437 "lea 0x30(%1),%1 \n"
438 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000439 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000440 : "+r"(src), // %0
441 "+r"(dst), // %1
442 "+r"(pix) // %2
443 : "m"(kShuffleMaskARGBToRGB24) // %3
444 : "memory", "cc"
445#if defined(__SSE2__)
446 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
447#endif
448 );
449}
450
451void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000452 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000453 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000454 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000455 "1: \n"
456 "movdqa (%0),%%xmm0 \n"
457 "movdqa 0x10(%0),%%xmm1 \n"
458 "movdqa 0x20(%0),%%xmm2 \n"
459 "movdqa 0x30(%0),%%xmm3 \n"
460 "lea 0x40(%0),%0 \n"
461 "pshufb %%xmm6,%%xmm0 \n"
462 "pshufb %%xmm6,%%xmm1 \n"
463 "pshufb %%xmm6,%%xmm2 \n"
464 "pshufb %%xmm6,%%xmm3 \n"
465 "movdqa %%xmm1,%%xmm4 \n"
466 "psrldq $0x4,%%xmm1 \n"
467 "pslldq $0xc,%%xmm4 \n"
468 "movdqa %%xmm2,%%xmm5 \n"
469 "por %%xmm4,%%xmm0 \n"
470 "pslldq $0x8,%%xmm5 \n"
471 "movdqa %%xmm0,(%1) \n"
472 "por %%xmm5,%%xmm1 \n"
473 "psrldq $0x8,%%xmm2 \n"
474 "pslldq $0x4,%%xmm3 \n"
475 "por %%xmm3,%%xmm2 \n"
476 "movdqa %%xmm1,0x10(%1) \n"
477 "movdqa %%xmm2,0x20(%1) \n"
478 "lea 0x30(%1),%1 \n"
479 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000480 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000481 : "+r"(src), // %0
482 "+r"(dst), // %1
483 "+r"(pix) // %2
484 : "m"(kShuffleMaskARGBToRAW) // %3
485 : "memory", "cc"
486#if defined(__SSE2__)
487 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
488#endif
489 );
490}
491
fbarchard@google.comdbcabea2012-10-29 21:20:25 +0000492void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000493 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000494 "pcmpeqb %%xmm3,%%xmm3 \n"
495 "psrld $0x1b,%%xmm3 \n"
496 "pcmpeqb %%xmm4,%%xmm4 \n"
497 "psrld $0x1a,%%xmm4 \n"
498 "pslld $0x5,%%xmm4 \n"
499 "pcmpeqb %%xmm5,%%xmm5 \n"
500 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000501 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000502 "1: \n"
503 "movdqa (%0),%%xmm0 \n"
504 "movdqa %%xmm0,%%xmm1 \n"
505 "movdqa %%xmm0,%%xmm2 \n"
506 "pslld $0x8,%%xmm0 \n"
507 "psrld $0x3,%%xmm1 \n"
508 "psrld $0x5,%%xmm2 \n"
509 "psrad $0x10,%%xmm0 \n"
510 "pand %%xmm3,%%xmm1 \n"
511 "pand %%xmm4,%%xmm2 \n"
512 "pand %%xmm5,%%xmm0 \n"
513 "por %%xmm2,%%xmm1 \n"
514 "por %%xmm1,%%xmm0 \n"
515 "packssdw %%xmm0,%%xmm0 \n"
516 "lea 0x10(%0),%0 \n"
517 "movq %%xmm0,(%1) \n"
518 "lea 0x8(%1),%1 \n"
519 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000520 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000521 : "+r"(src), // %0
522 "+r"(dst), // %1
523 "+r"(pix) // %2
524 :
525 : "memory", "cc"
526#if defined(__SSE2__)
527 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
528#endif
529 );
530}
531
532void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000533 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000534 "pcmpeqb %%xmm4,%%xmm4 \n"
535 "psrld $0x1b,%%xmm4 \n"
536 "movdqa %%xmm4,%%xmm5 \n"
537 "pslld $0x5,%%xmm5 \n"
538 "movdqa %%xmm4,%%xmm6 \n"
539 "pslld $0xa,%%xmm6 \n"
540 "pcmpeqb %%xmm7,%%xmm7 \n"
541 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000542 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000543 "1: \n"
544 "movdqa (%0),%%xmm0 \n"
545 "movdqa %%xmm0,%%xmm1 \n"
546 "movdqa %%xmm0,%%xmm2 \n"
547 "movdqa %%xmm0,%%xmm3 \n"
548 "psrad $0x10,%%xmm0 \n"
549 "psrld $0x3,%%xmm1 \n"
550 "psrld $0x6,%%xmm2 \n"
551 "psrld $0x9,%%xmm3 \n"
552 "pand %%xmm7,%%xmm0 \n"
553 "pand %%xmm4,%%xmm1 \n"
554 "pand %%xmm5,%%xmm2 \n"
555 "pand %%xmm6,%%xmm3 \n"
556 "por %%xmm1,%%xmm0 \n"
557 "por %%xmm3,%%xmm2 \n"
558 "por %%xmm2,%%xmm0 \n"
559 "packssdw %%xmm0,%%xmm0 \n"
560 "lea 0x10(%0),%0 \n"
561 "movq %%xmm0,(%1) \n"
562 "lea 0x8(%1),%1 \n"
563 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000564 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000565 : "+r"(src), // %0
566 "+r"(dst), // %1
567 "+r"(pix) // %2
568 :
569 : "memory", "cc"
570#if defined(__SSE2__)
571 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
572#endif
573 );
574}
575
576void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000577 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000578 "pcmpeqb %%xmm4,%%xmm4 \n"
579 "psllw $0xc,%%xmm4 \n"
580 "movdqa %%xmm4,%%xmm3 \n"
581 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000582 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000583 "1: \n"
584 "movdqa (%0),%%xmm0 \n"
585 "movdqa %%xmm0,%%xmm1 \n"
586 "pand %%xmm3,%%xmm0 \n"
587 "pand %%xmm4,%%xmm1 \n"
588 "psrlq $0x4,%%xmm0 \n"
589 "psrlq $0x8,%%xmm1 \n"
590 "por %%xmm1,%%xmm0 \n"
591 "packuswb %%xmm0,%%xmm0 \n"
592 "lea 0x10(%0),%0 \n"
593 "movq %%xmm0,(%1) \n"
594 "lea 0x8(%1),%1 \n"
595 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000596 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000597 : "+r"(src), // %0
598 "+r"(dst), // %1
599 "+r"(pix) // %2
600 :
601 : "memory", "cc"
602#if defined(__SSE2__)
603 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
604#endif
605 );
606}
607
fbarchard@google.comb6149762011-11-07 21:58:52 +0000608void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000609 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000610 "movdqa %4,%%xmm5 \n"
611 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000612 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000613 "1: \n"
614 "movdqa (%0),%%xmm0 \n"
615 "movdqa 0x10(%0),%%xmm1 \n"
616 "movdqa 0x20(%0),%%xmm2 \n"
617 "movdqa 0x30(%0),%%xmm3 \n"
618 "pmaddubsw %%xmm4,%%xmm0 \n"
619 "pmaddubsw %%xmm4,%%xmm1 \n"
620 "pmaddubsw %%xmm4,%%xmm2 \n"
621 "pmaddubsw %%xmm4,%%xmm3 \n"
622 "lea 0x40(%0),%0 \n"
623 "phaddw %%xmm1,%%xmm0 \n"
624 "phaddw %%xmm3,%%xmm2 \n"
625 "psrlw $0x7,%%xmm0 \n"
626 "psrlw $0x7,%%xmm2 \n"
627 "packuswb %%xmm2,%%xmm0 \n"
628 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000629 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000630 "movdqa %%xmm0,(%1) \n"
631 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000632 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000633 : "+r"(src_argb), // %0
634 "+r"(dst_y), // %1
635 "+r"(pix) // %2
636 : "m"(kARGBToY), // %3
637 "m"(kAddY16) // %4
638 : "memory", "cc"
639#if defined(__SSE2__)
640 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
641#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000642 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000643}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000644
645void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000646 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000647 "movdqa %4,%%xmm5 \n"
648 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000649 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000650 "1: \n"
651 "movdqu (%0),%%xmm0 \n"
652 "movdqu 0x10(%0),%%xmm1 \n"
653 "movdqu 0x20(%0),%%xmm2 \n"
654 "movdqu 0x30(%0),%%xmm3 \n"
655 "pmaddubsw %%xmm4,%%xmm0 \n"
656 "pmaddubsw %%xmm4,%%xmm1 \n"
657 "pmaddubsw %%xmm4,%%xmm2 \n"
658 "pmaddubsw %%xmm4,%%xmm3 \n"
659 "lea 0x40(%0),%0 \n"
660 "phaddw %%xmm1,%%xmm0 \n"
661 "phaddw %%xmm3,%%xmm2 \n"
662 "psrlw $0x7,%%xmm0 \n"
663 "psrlw $0x7,%%xmm2 \n"
664 "packuswb %%xmm2,%%xmm0 \n"
665 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000666 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000667 "movdqu %%xmm0,(%1) \n"
668 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000669 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000670 : "+r"(src_argb), // %0
671 "+r"(dst_y), // %1
672 "+r"(pix) // %2
673 : "m"(kARGBToY), // %3
674 "m"(kAddY16) // %4
675 : "memory", "cc"
676#if defined(__SSE2__)
677 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
678#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000679 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000680}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000681
fbarchard@google.com714050a2012-02-17 22:59:56 +0000682// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000683// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
684// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
685// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000686// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000687void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
688 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000689 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000690 "movdqa %0,%%xmm4 \n"
691 "movdqa %1,%%xmm3 \n"
692 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000693 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000694 : "m"(kARGBToU), // %0
695 "m"(kARGBToV), // %1
696 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000697 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000698 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000699 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000700 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000701 "1: \n"
702 "movdqa (%0),%%xmm0 \n"
703 "movdqa 0x10(%0),%%xmm1 \n"
704 "movdqa 0x20(%0),%%xmm2 \n"
705 "movdqa 0x30(%0),%%xmm6 \n"
706 "pavgb (%0,%4,1),%%xmm0 \n"
707 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
708 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
709 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
710 "lea 0x40(%0),%0 \n"
711 "movdqa %%xmm0,%%xmm7 \n"
712 "shufps $0x88,%%xmm1,%%xmm0 \n"
713 "shufps $0xdd,%%xmm1,%%xmm7 \n"
714 "pavgb %%xmm7,%%xmm0 \n"
715 "movdqa %%xmm2,%%xmm7 \n"
716 "shufps $0x88,%%xmm6,%%xmm2 \n"
717 "shufps $0xdd,%%xmm6,%%xmm7 \n"
718 "pavgb %%xmm7,%%xmm2 \n"
719 "movdqa %%xmm0,%%xmm1 \n"
720 "movdqa %%xmm2,%%xmm6 \n"
721 "pmaddubsw %%xmm4,%%xmm0 \n"
722 "pmaddubsw %%xmm4,%%xmm2 \n"
723 "pmaddubsw %%xmm3,%%xmm1 \n"
724 "pmaddubsw %%xmm3,%%xmm6 \n"
725 "phaddw %%xmm2,%%xmm0 \n"
726 "phaddw %%xmm6,%%xmm1 \n"
727 "psraw $0x8,%%xmm0 \n"
728 "psraw $0x8,%%xmm1 \n"
729 "packsswb %%xmm1,%%xmm0 \n"
730 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000731 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000732 "movlps %%xmm0,(%1) \n"
733 "movhps %%xmm0,(%1,%2,1) \n"
734 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000735 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000736 : "+r"(src_argb0), // %0
737 "+r"(dst_u), // %1
738 "+r"(dst_v), // %2
739 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000740 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000741 : "memory", "cc"
742#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000743 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000744#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000745 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000746}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000747
748void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
749 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000750 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000751 "movdqa %0,%%xmm4 \n"
752 "movdqa %1,%%xmm3 \n"
753 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000754 :
755 : "m"(kARGBToU), // %0
756 "m"(kARGBToV), // %1
757 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000758 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000759 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000760 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000761 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000762 "1: \n"
763 "movdqu (%0),%%xmm0 \n"
764 "movdqu 0x10(%0),%%xmm1 \n"
765 "movdqu 0x20(%0),%%xmm2 \n"
766 "movdqu 0x30(%0),%%xmm6 \n"
767 "movdqu (%0,%4,1),%%xmm7 \n"
768 "pavgb %%xmm7,%%xmm0 \n"
769 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
770 "pavgb %%xmm7,%%xmm1 \n"
771 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
772 "pavgb %%xmm7,%%xmm2 \n"
773 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
774 "pavgb %%xmm7,%%xmm6 \n"
775 "lea 0x40(%0),%0 \n"
776 "movdqa %%xmm0,%%xmm7 \n"
777 "shufps $0x88,%%xmm1,%%xmm0 \n"
778 "shufps $0xdd,%%xmm1,%%xmm7 \n"
779 "pavgb %%xmm7,%%xmm0 \n"
780 "movdqa %%xmm2,%%xmm7 \n"
781 "shufps $0x88,%%xmm6,%%xmm2 \n"
782 "shufps $0xdd,%%xmm6,%%xmm7 \n"
783 "pavgb %%xmm7,%%xmm2 \n"
784 "movdqa %%xmm0,%%xmm1 \n"
785 "movdqa %%xmm2,%%xmm6 \n"
786 "pmaddubsw %%xmm4,%%xmm0 \n"
787 "pmaddubsw %%xmm4,%%xmm2 \n"
788 "pmaddubsw %%xmm3,%%xmm1 \n"
789 "pmaddubsw %%xmm3,%%xmm6 \n"
790 "phaddw %%xmm2,%%xmm0 \n"
791 "phaddw %%xmm6,%%xmm1 \n"
792 "psraw $0x8,%%xmm0 \n"
793 "psraw $0x8,%%xmm1 \n"
794 "packsswb %%xmm1,%%xmm0 \n"
795 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000796 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000797 "movlps %%xmm0,(%1) \n"
798 "movhps %%xmm0,(%1,%2,1) \n"
799 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000800 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000801 : "+r"(src_argb0), // %0
802 "+r"(dst_u), // %1
803 "+r"(dst_v), // %2
804 "+rm"(width) // %3
805 : "r"(static_cast<intptr_t>(src_stride_argb))
806 : "memory", "cc"
807#if defined(__SSE2__)
808 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
809#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000810 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000811}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000812
fbarchard@google.com762c0502013-02-04 18:47:21 +0000813void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
814 int width) {
815 asm volatile (
816 "movdqa %0,%%xmm4 \n"
817 "movdqa %1,%%xmm3 \n"
818 "movdqa %2,%%xmm5 \n"
819 :
820 : "m"(kARGBToU), // %0
821 "m"(kARGBToV), // %1
822 "m"(kAddUV128) // %2
823 );
824 asm volatile (
825 "sub %1,%2 \n"
826 ".p2align 4 \n"
827 "1: \n"
828 "movdqa (%0),%%xmm0 \n"
829 "movdqa 0x10(%0),%%xmm1 \n"
830 "movdqa 0x20(%0),%%xmm2 \n"
831 "movdqa 0x30(%0),%%xmm6 \n"
832 "pmaddubsw %%xmm4,%%xmm0 \n"
833 "pmaddubsw %%xmm4,%%xmm1 \n"
834 "pmaddubsw %%xmm4,%%xmm2 \n"
835 "pmaddubsw %%xmm4,%%xmm6 \n"
836 "phaddw %%xmm1,%%xmm0 \n"
837 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000838 "psraw $0x8,%%xmm0 \n"
839 "psraw $0x8,%%xmm2 \n"
840 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000841 "paddb %%xmm5,%%xmm0 \n"
842 "sub $0x10,%3 \n"
843 "movdqa %%xmm0,(%1) \n"
844 "movdqa (%0),%%xmm0 \n"
845 "movdqa 0x10(%0),%%xmm1 \n"
846 "movdqa 0x20(%0),%%xmm2 \n"
847 "movdqa 0x30(%0),%%xmm6 \n"
848 "pmaddubsw %%xmm3,%%xmm0 \n"
849 "pmaddubsw %%xmm3,%%xmm1 \n"
850 "pmaddubsw %%xmm3,%%xmm2 \n"
851 "pmaddubsw %%xmm3,%%xmm6 \n"
852 "phaddw %%xmm1,%%xmm0 \n"
853 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000854 "psraw $0x8,%%xmm0 \n"
855 "psraw $0x8,%%xmm2 \n"
856 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000857 "paddb %%xmm5,%%xmm0 \n"
858 "lea 0x40(%0),%0 \n"
859 "movdqa %%xmm0,(%1,%2,1) \n"
860 "lea 0x10(%1),%1 \n"
861 "jg 1b \n"
862 : "+r"(src_argb), // %0
863 "+r"(dst_u), // %1
864 "+r"(dst_v), // %2
865 "+rm"(width) // %3
866 :
867 : "memory", "cc"
868#if defined(__SSE2__)
869 , "xmm0", "xmm1", "xmm2", "xmm6"
870#endif
871 );
872}
873
874void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
875 uint8* dst_v, int width) {
876 asm volatile (
877 "movdqa %0,%%xmm4 \n"
878 "movdqa %1,%%xmm3 \n"
879 "movdqa %2,%%xmm5 \n"
880 :
881 : "m"(kARGBToU), // %0
882 "m"(kARGBToV), // %1
883 "m"(kAddUV128) // %2
884 );
885 asm volatile (
886 "sub %1,%2 \n"
887 ".p2align 4 \n"
888 "1: \n"
889 "movdqu (%0),%%xmm0 \n"
890 "movdqu 0x10(%0),%%xmm1 \n"
891 "movdqu 0x20(%0),%%xmm2 \n"
892 "movdqu 0x30(%0),%%xmm6 \n"
893 "pmaddubsw %%xmm4,%%xmm0 \n"
894 "pmaddubsw %%xmm4,%%xmm1 \n"
895 "pmaddubsw %%xmm4,%%xmm2 \n"
896 "pmaddubsw %%xmm4,%%xmm6 \n"
897 "phaddw %%xmm1,%%xmm0 \n"
898 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000899 "psraw $0x8,%%xmm0 \n"
900 "psraw $0x8,%%xmm2 \n"
901 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000902 "paddb %%xmm5,%%xmm0 \n"
903 "sub $0x10,%3 \n"
904 "movdqu %%xmm0,(%1) \n"
905 "movdqu (%0),%%xmm0 \n"
906 "movdqu 0x10(%0),%%xmm1 \n"
907 "movdqu 0x20(%0),%%xmm2 \n"
908 "movdqu 0x30(%0),%%xmm6 \n"
909 "pmaddubsw %%xmm3,%%xmm0 \n"
910 "pmaddubsw %%xmm3,%%xmm1 \n"
911 "pmaddubsw %%xmm3,%%xmm2 \n"
912 "pmaddubsw %%xmm3,%%xmm6 \n"
913 "phaddw %%xmm1,%%xmm0 \n"
914 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000915 "psraw $0x8,%%xmm0 \n"
916 "psraw $0x8,%%xmm2 \n"
917 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000918 "paddb %%xmm5,%%xmm0 \n"
919 "lea 0x40(%0),%0 \n"
920 "movdqu %%xmm0,(%1,%2,1) \n"
921 "lea 0x10(%1),%1 \n"
922 "jg 1b \n"
923 : "+r"(src_argb), // %0
924 "+r"(dst_u), // %1
925 "+r"(dst_v), // %2
926 "+rm"(width) // %3
927 :
928 : "memory", "cc"
929#if defined(__SSE2__)
930 , "xmm0", "xmm1", "xmm2", "xmm6"
931#endif
932 );
933}
934
fbarchard@google.combdf7cb52012-11-05 23:40:11 +0000935void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
936 uint8* dst_u, uint8* dst_v, int width) {
937 asm volatile (
938 "movdqa %0,%%xmm4 \n"
939 "movdqa %1,%%xmm3 \n"
940 "movdqa %2,%%xmm5 \n"
941 :
942 : "m"(kARGBToU), // %0
943 "m"(kARGBToV), // %1
944 "m"(kAddUV128) // %2
945 );
946 asm volatile (
947 "sub %1,%2 \n"
948 ".p2align 4 \n"
949 "1: \n"
950 "movdqa (%0),%%xmm0 \n"
951 "movdqa 0x10(%0),%%xmm1 \n"
952 "movdqa 0x20(%0),%%xmm2 \n"
953 "movdqa 0x30(%0),%%xmm6 \n"
954 "lea 0x40(%0),%0 \n"
955 "movdqa %%xmm0,%%xmm7 \n"
956 "shufps $0x88,%%xmm1,%%xmm0 \n"
957 "shufps $0xdd,%%xmm1,%%xmm7 \n"
958 "pavgb %%xmm7,%%xmm0 \n"
959 "movdqa %%xmm2,%%xmm7 \n"
960 "shufps $0x88,%%xmm6,%%xmm2 \n"
961 "shufps $0xdd,%%xmm6,%%xmm7 \n"
962 "pavgb %%xmm7,%%xmm2 \n"
963 "movdqa %%xmm0,%%xmm1 \n"
964 "movdqa %%xmm2,%%xmm6 \n"
965 "pmaddubsw %%xmm4,%%xmm0 \n"
966 "pmaddubsw %%xmm4,%%xmm2 \n"
967 "pmaddubsw %%xmm3,%%xmm1 \n"
968 "pmaddubsw %%xmm3,%%xmm6 \n"
969 "phaddw %%xmm2,%%xmm0 \n"
970 "phaddw %%xmm6,%%xmm1 \n"
971 "psraw $0x8,%%xmm0 \n"
972 "psraw $0x8,%%xmm1 \n"
973 "packsswb %%xmm1,%%xmm0 \n"
974 "paddb %%xmm5,%%xmm0 \n"
975 "sub $0x10,%3 \n"
976 "movlps %%xmm0,(%1) \n"
977 "movhps %%xmm0,(%1,%2,1) \n"
978 "lea 0x8(%1),%1 \n"
979 "jg 1b \n"
980 : "+r"(src_argb0), // %0
981 "+r"(dst_u), // %1
982 "+r"(dst_v), // %2
983 "+rm"(width) // %3
984 :
985 : "memory", "cc"
986#if defined(__SSE2__)
987 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
988#endif
989 );
990}
991
992void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
993 uint8* dst_u, uint8* dst_v, int width) {
994 asm volatile (
995 "movdqa %0,%%xmm4 \n"
996 "movdqa %1,%%xmm3 \n"
997 "movdqa %2,%%xmm5 \n"
998 :
999 : "m"(kARGBToU), // %0
1000 "m"(kARGBToV), // %1
1001 "m"(kAddUV128) // %2
1002 );
1003 asm volatile (
1004 "sub %1,%2 \n"
1005 ".p2align 4 \n"
1006 "1: \n"
1007 "movdqu (%0),%%xmm0 \n"
1008 "movdqu 0x10(%0),%%xmm1 \n"
1009 "movdqu 0x20(%0),%%xmm2 \n"
1010 "movdqu 0x30(%0),%%xmm6 \n"
1011 "lea 0x40(%0),%0 \n"
1012 "movdqa %%xmm0,%%xmm7 \n"
1013 "shufps $0x88,%%xmm1,%%xmm0 \n"
1014 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1015 "pavgb %%xmm7,%%xmm0 \n"
1016 "movdqa %%xmm2,%%xmm7 \n"
1017 "shufps $0x88,%%xmm6,%%xmm2 \n"
1018 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1019 "pavgb %%xmm7,%%xmm2 \n"
1020 "movdqa %%xmm0,%%xmm1 \n"
1021 "movdqa %%xmm2,%%xmm6 \n"
1022 "pmaddubsw %%xmm4,%%xmm0 \n"
1023 "pmaddubsw %%xmm4,%%xmm2 \n"
1024 "pmaddubsw %%xmm3,%%xmm1 \n"
1025 "pmaddubsw %%xmm3,%%xmm6 \n"
1026 "phaddw %%xmm2,%%xmm0 \n"
1027 "phaddw %%xmm6,%%xmm1 \n"
1028 "psraw $0x8,%%xmm0 \n"
1029 "psraw $0x8,%%xmm1 \n"
1030 "packsswb %%xmm1,%%xmm0 \n"
1031 "paddb %%xmm5,%%xmm0 \n"
1032 "sub $0x10,%3 \n"
1033 "movlps %%xmm0,(%1) \n"
1034 "movhps %%xmm0,(%1,%2,1) \n"
1035 "lea 0x8(%1),%1 \n"
1036 "jg 1b \n"
1037 : "+r"(src_argb0), // %0
1038 "+r"(dst_u), // %1
1039 "+r"(dst_v), // %2
1040 "+rm"(width) // %3
1041 :
1042 : "memory", "cc"
1043#if defined(__SSE2__)
1044 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1045#endif
1046 );
1047}
1048
fbarchard@google.com714050a2012-02-17 22:59:56 +00001049void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001050 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001051 "movdqa %4,%%xmm5 \n"
1052 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001053 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001054 "1: \n"
1055 "movdqa (%0),%%xmm0 \n"
1056 "movdqa 0x10(%0),%%xmm1 \n"
1057 "movdqa 0x20(%0),%%xmm2 \n"
1058 "movdqa 0x30(%0),%%xmm3 \n"
1059 "pmaddubsw %%xmm4,%%xmm0 \n"
1060 "pmaddubsw %%xmm4,%%xmm1 \n"
1061 "pmaddubsw %%xmm4,%%xmm2 \n"
1062 "pmaddubsw %%xmm4,%%xmm3 \n"
1063 "lea 0x40(%0),%0 \n"
1064 "phaddw %%xmm1,%%xmm0 \n"
1065 "phaddw %%xmm3,%%xmm2 \n"
1066 "psrlw $0x7,%%xmm0 \n"
1067 "psrlw $0x7,%%xmm2 \n"
1068 "packuswb %%xmm2,%%xmm0 \n"
1069 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001070 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001071 "movdqa %%xmm0,(%1) \n"
1072 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001073 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001074 : "+r"(src_bgra), // %0
1075 "+r"(dst_y), // %1
1076 "+r"(pix) // %2
1077 : "m"(kBGRAToY), // %3
1078 "m"(kAddY16) // %4
1079 : "memory", "cc"
1080#if defined(__SSE2__)
1081 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001082#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +00001083 );
1084}
1085
1086void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001087 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001088 "movdqa %4,%%xmm5 \n"
1089 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001090 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001091 "1: \n"
1092 "movdqu (%0),%%xmm0 \n"
1093 "movdqu 0x10(%0),%%xmm1 \n"
1094 "movdqu 0x20(%0),%%xmm2 \n"
1095 "movdqu 0x30(%0),%%xmm3 \n"
1096 "pmaddubsw %%xmm4,%%xmm0 \n"
1097 "pmaddubsw %%xmm4,%%xmm1 \n"
1098 "pmaddubsw %%xmm4,%%xmm2 \n"
1099 "pmaddubsw %%xmm4,%%xmm3 \n"
1100 "lea 0x40(%0),%0 \n"
1101 "phaddw %%xmm1,%%xmm0 \n"
1102 "phaddw %%xmm3,%%xmm2 \n"
1103 "psrlw $0x7,%%xmm0 \n"
1104 "psrlw $0x7,%%xmm2 \n"
1105 "packuswb %%xmm2,%%xmm0 \n"
1106 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001107 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001108 "movdqu %%xmm0,(%1) \n"
1109 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001110 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001111 : "+r"(src_bgra), // %0
1112 "+r"(dst_y), // %1
1113 "+r"(pix) // %2
1114 : "m"(kBGRAToY), // %3
1115 "m"(kAddY16) // %4
1116 : "memory", "cc"
1117#if defined(__SSE2__)
1118 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1119#endif
1120 );
1121}
1122
1123void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1124 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001125 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001126 "movdqa %0,%%xmm4 \n"
1127 "movdqa %1,%%xmm3 \n"
1128 "movdqa %2,%%xmm5 \n"
1129 :
1130 : "m"(kBGRAToU), // %0
1131 "m"(kBGRAToV), // %1
1132 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001133 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001134 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001135 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001136 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001137 "1: \n"
1138 "movdqa (%0),%%xmm0 \n"
1139 "movdqa 0x10(%0),%%xmm1 \n"
1140 "movdqa 0x20(%0),%%xmm2 \n"
1141 "movdqa 0x30(%0),%%xmm6 \n"
1142 "pavgb (%0,%4,1),%%xmm0 \n"
1143 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1144 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1145 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1146 "lea 0x40(%0),%0 \n"
1147 "movdqa %%xmm0,%%xmm7 \n"
1148 "shufps $0x88,%%xmm1,%%xmm0 \n"
1149 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1150 "pavgb %%xmm7,%%xmm0 \n"
1151 "movdqa %%xmm2,%%xmm7 \n"
1152 "shufps $0x88,%%xmm6,%%xmm2 \n"
1153 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1154 "pavgb %%xmm7,%%xmm2 \n"
1155 "movdqa %%xmm0,%%xmm1 \n"
1156 "movdqa %%xmm2,%%xmm6 \n"
1157 "pmaddubsw %%xmm4,%%xmm0 \n"
1158 "pmaddubsw %%xmm4,%%xmm2 \n"
1159 "pmaddubsw %%xmm3,%%xmm1 \n"
1160 "pmaddubsw %%xmm3,%%xmm6 \n"
1161 "phaddw %%xmm2,%%xmm0 \n"
1162 "phaddw %%xmm6,%%xmm1 \n"
1163 "psraw $0x8,%%xmm0 \n"
1164 "psraw $0x8,%%xmm1 \n"
1165 "packsswb %%xmm1,%%xmm0 \n"
1166 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001167 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001168 "movlps %%xmm0,(%1) \n"
1169 "movhps %%xmm0,(%1,%2,1) \n"
1170 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001171 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001172 : "+r"(src_bgra0), // %0
1173 "+r"(dst_u), // %1
1174 "+r"(dst_v), // %2
1175 "+rm"(width) // %3
1176 : "r"(static_cast<intptr_t>(src_stride_bgra))
1177 : "memory", "cc"
1178#if defined(__SSE2__)
1179 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1180#endif
1181 );
1182}
1183
1184void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1185 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001186 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001187 "movdqa %0,%%xmm4 \n"
1188 "movdqa %1,%%xmm3 \n"
1189 "movdqa %2,%%xmm5 \n"
1190 :
1191 : "m"(kBGRAToU), // %0
1192 "m"(kBGRAToV), // %1
1193 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001194 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001195 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001196 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001197 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001198 "1: \n"
1199 "movdqu (%0),%%xmm0 \n"
1200 "movdqu 0x10(%0),%%xmm1 \n"
1201 "movdqu 0x20(%0),%%xmm2 \n"
1202 "movdqu 0x30(%0),%%xmm6 \n"
1203 "movdqu (%0,%4,1),%%xmm7 \n"
1204 "pavgb %%xmm7,%%xmm0 \n"
1205 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1206 "pavgb %%xmm7,%%xmm1 \n"
1207 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1208 "pavgb %%xmm7,%%xmm2 \n"
1209 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1210 "pavgb %%xmm7,%%xmm6 \n"
1211 "lea 0x40(%0),%0 \n"
1212 "movdqa %%xmm0,%%xmm7 \n"
1213 "shufps $0x88,%%xmm1,%%xmm0 \n"
1214 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1215 "pavgb %%xmm7,%%xmm0 \n"
1216 "movdqa %%xmm2,%%xmm7 \n"
1217 "shufps $0x88,%%xmm6,%%xmm2 \n"
1218 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1219 "pavgb %%xmm7,%%xmm2 \n"
1220 "movdqa %%xmm0,%%xmm1 \n"
1221 "movdqa %%xmm2,%%xmm6 \n"
1222 "pmaddubsw %%xmm4,%%xmm0 \n"
1223 "pmaddubsw %%xmm4,%%xmm2 \n"
1224 "pmaddubsw %%xmm3,%%xmm1 \n"
1225 "pmaddubsw %%xmm3,%%xmm6 \n"
1226 "phaddw %%xmm2,%%xmm0 \n"
1227 "phaddw %%xmm6,%%xmm1 \n"
1228 "psraw $0x8,%%xmm0 \n"
1229 "psraw $0x8,%%xmm1 \n"
1230 "packsswb %%xmm1,%%xmm0 \n"
1231 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001232 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001233 "movlps %%xmm0,(%1) \n"
1234 "movhps %%xmm0,(%1,%2,1) \n"
1235 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001236 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001237 : "+r"(src_bgra0), // %0
1238 "+r"(dst_u), // %1
1239 "+r"(dst_v), // %2
1240 "+rm"(width) // %3
1241 : "r"(static_cast<intptr_t>(src_stride_bgra))
1242 : "memory", "cc"
1243#if defined(__SSE2__)
1244 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1245#endif
1246 );
1247}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001248
1249void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001250 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001251 "movdqa %4,%%xmm5 \n"
1252 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001253 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001254 "1: \n"
1255 "movdqa (%0),%%xmm0 \n"
1256 "movdqa 0x10(%0),%%xmm1 \n"
1257 "movdqa 0x20(%0),%%xmm2 \n"
1258 "movdqa 0x30(%0),%%xmm3 \n"
1259 "pmaddubsw %%xmm4,%%xmm0 \n"
1260 "pmaddubsw %%xmm4,%%xmm1 \n"
1261 "pmaddubsw %%xmm4,%%xmm2 \n"
1262 "pmaddubsw %%xmm4,%%xmm3 \n"
1263 "lea 0x40(%0),%0 \n"
1264 "phaddw %%xmm1,%%xmm0 \n"
1265 "phaddw %%xmm3,%%xmm2 \n"
1266 "psrlw $0x7,%%xmm0 \n"
1267 "psrlw $0x7,%%xmm2 \n"
1268 "packuswb %%xmm2,%%xmm0 \n"
1269 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001270 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001271 "movdqa %%xmm0,(%1) \n"
1272 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001273 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001274 : "+r"(src_abgr), // %0
1275 "+r"(dst_y), // %1
1276 "+r"(pix) // %2
1277 : "m"(kABGRToY), // %3
1278 "m"(kAddY16) // %4
1279 : "memory", "cc"
1280#if defined(__SSE2__)
1281 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1282#endif
1283 );
1284}
1285
1286void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001287 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001288 "movdqa %4,%%xmm5 \n"
1289 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001290 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001291 "1: \n"
1292 "movdqu (%0),%%xmm0 \n"
1293 "movdqu 0x10(%0),%%xmm1 \n"
1294 "movdqu 0x20(%0),%%xmm2 \n"
1295 "movdqu 0x30(%0),%%xmm3 \n"
1296 "pmaddubsw %%xmm4,%%xmm0 \n"
1297 "pmaddubsw %%xmm4,%%xmm1 \n"
1298 "pmaddubsw %%xmm4,%%xmm2 \n"
1299 "pmaddubsw %%xmm4,%%xmm3 \n"
1300 "lea 0x40(%0),%0 \n"
1301 "phaddw %%xmm1,%%xmm0 \n"
1302 "phaddw %%xmm3,%%xmm2 \n"
1303 "psrlw $0x7,%%xmm0 \n"
1304 "psrlw $0x7,%%xmm2 \n"
1305 "packuswb %%xmm2,%%xmm0 \n"
1306 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001307 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001308 "movdqu %%xmm0,(%1) \n"
1309 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001310 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001311 : "+r"(src_abgr), // %0
1312 "+r"(dst_y), // %1
1313 "+r"(pix) // %2
1314 : "m"(kABGRToY), // %3
1315 "m"(kAddY16) // %4
1316 : "memory", "cc"
1317#if defined(__SSE2__)
1318 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1319#endif
1320 );
1321}
1322
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001323void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1324 asm volatile (
1325 "movdqa %4,%%xmm5 \n"
1326 "movdqa %3,%%xmm4 \n"
1327 ".p2align 4 \n"
1328 "1: \n"
1329 "movdqa (%0),%%xmm0 \n"
1330 "movdqa 0x10(%0),%%xmm1 \n"
1331 "movdqa 0x20(%0),%%xmm2 \n"
1332 "movdqa 0x30(%0),%%xmm3 \n"
1333 "pmaddubsw %%xmm4,%%xmm0 \n"
1334 "pmaddubsw %%xmm4,%%xmm1 \n"
1335 "pmaddubsw %%xmm4,%%xmm2 \n"
1336 "pmaddubsw %%xmm4,%%xmm3 \n"
1337 "lea 0x40(%0),%0 \n"
1338 "phaddw %%xmm1,%%xmm0 \n"
1339 "phaddw %%xmm3,%%xmm2 \n"
1340 "psrlw $0x7,%%xmm0 \n"
1341 "psrlw $0x7,%%xmm2 \n"
1342 "packuswb %%xmm2,%%xmm0 \n"
1343 "paddb %%xmm5,%%xmm0 \n"
1344 "sub $0x10,%2 \n"
1345 "movdqa %%xmm0,(%1) \n"
1346 "lea 0x10(%1),%1 \n"
1347 "jg 1b \n"
1348 : "+r"(src_rgba), // %0
1349 "+r"(dst_y), // %1
1350 "+r"(pix) // %2
1351 : "m"(kRGBAToY), // %3
1352 "m"(kAddY16) // %4
1353 : "memory", "cc"
1354#if defined(__SSE2__)
1355 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1356#endif
1357 );
1358}
1359
1360void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1361 asm volatile (
1362 "movdqa %4,%%xmm5 \n"
1363 "movdqa %3,%%xmm4 \n"
1364 ".p2align 4 \n"
1365 "1: \n"
1366 "movdqu (%0),%%xmm0 \n"
1367 "movdqu 0x10(%0),%%xmm1 \n"
1368 "movdqu 0x20(%0),%%xmm2 \n"
1369 "movdqu 0x30(%0),%%xmm3 \n"
1370 "pmaddubsw %%xmm4,%%xmm0 \n"
1371 "pmaddubsw %%xmm4,%%xmm1 \n"
1372 "pmaddubsw %%xmm4,%%xmm2 \n"
1373 "pmaddubsw %%xmm4,%%xmm3 \n"
1374 "lea 0x40(%0),%0 \n"
1375 "phaddw %%xmm1,%%xmm0 \n"
1376 "phaddw %%xmm3,%%xmm2 \n"
1377 "psrlw $0x7,%%xmm0 \n"
1378 "psrlw $0x7,%%xmm2 \n"
1379 "packuswb %%xmm2,%%xmm0 \n"
1380 "paddb %%xmm5,%%xmm0 \n"
1381 "sub $0x10,%2 \n"
1382 "movdqu %%xmm0,(%1) \n"
1383 "lea 0x10(%1),%1 \n"
1384 "jg 1b \n"
1385 : "+r"(src_rgba), // %0
1386 "+r"(dst_y), // %1
1387 "+r"(pix) // %2
1388 : "m"(kRGBAToY), // %3
1389 "m"(kAddY16) // %4
1390 : "memory", "cc"
1391#if defined(__SSE2__)
1392 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1393#endif
1394 );
1395}
1396
fbarchard@google.com714050a2012-02-17 22:59:56 +00001397void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1398 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001399 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001400 "movdqa %0,%%xmm4 \n"
1401 "movdqa %1,%%xmm3 \n"
1402 "movdqa %2,%%xmm5 \n"
1403 :
1404 : "m"(kABGRToU), // %0
1405 "m"(kABGRToV), // %1
1406 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001407 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001408 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001409 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001410 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001411 "1: \n"
1412 "movdqa (%0),%%xmm0 \n"
1413 "movdqa 0x10(%0),%%xmm1 \n"
1414 "movdqa 0x20(%0),%%xmm2 \n"
1415 "movdqa 0x30(%0),%%xmm6 \n"
1416 "pavgb (%0,%4,1),%%xmm0 \n"
1417 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1418 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1419 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1420 "lea 0x40(%0),%0 \n"
1421 "movdqa %%xmm0,%%xmm7 \n"
1422 "shufps $0x88,%%xmm1,%%xmm0 \n"
1423 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1424 "pavgb %%xmm7,%%xmm0 \n"
1425 "movdqa %%xmm2,%%xmm7 \n"
1426 "shufps $0x88,%%xmm6,%%xmm2 \n"
1427 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1428 "pavgb %%xmm7,%%xmm2 \n"
1429 "movdqa %%xmm0,%%xmm1 \n"
1430 "movdqa %%xmm2,%%xmm6 \n"
1431 "pmaddubsw %%xmm4,%%xmm0 \n"
1432 "pmaddubsw %%xmm4,%%xmm2 \n"
1433 "pmaddubsw %%xmm3,%%xmm1 \n"
1434 "pmaddubsw %%xmm3,%%xmm6 \n"
1435 "phaddw %%xmm2,%%xmm0 \n"
1436 "phaddw %%xmm6,%%xmm1 \n"
1437 "psraw $0x8,%%xmm0 \n"
1438 "psraw $0x8,%%xmm1 \n"
1439 "packsswb %%xmm1,%%xmm0 \n"
1440 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001441 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001442 "movlps %%xmm0,(%1) \n"
1443 "movhps %%xmm0,(%1,%2,1) \n"
1444 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001445 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001446 : "+r"(src_abgr0), // %0
1447 "+r"(dst_u), // %1
1448 "+r"(dst_v), // %2
1449 "+rm"(width) // %3
1450 : "r"(static_cast<intptr_t>(src_stride_abgr))
1451 : "memory", "cc"
1452#if defined(__SSE2__)
1453 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1454#endif
1455 );
1456}
1457
1458void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1459 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001460 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001461 "movdqa %0,%%xmm4 \n"
1462 "movdqa %1,%%xmm3 \n"
1463 "movdqa %2,%%xmm5 \n"
1464 :
1465 : "m"(kABGRToU), // %0
1466 "m"(kABGRToV), // %1
1467 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001468 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001469 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001470 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001471 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001472 "1: \n"
1473 "movdqu (%0),%%xmm0 \n"
1474 "movdqu 0x10(%0),%%xmm1 \n"
1475 "movdqu 0x20(%0),%%xmm2 \n"
1476 "movdqu 0x30(%0),%%xmm6 \n"
1477 "movdqu (%0,%4,1),%%xmm7 \n"
1478 "pavgb %%xmm7,%%xmm0 \n"
1479 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1480 "pavgb %%xmm7,%%xmm1 \n"
1481 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1482 "pavgb %%xmm7,%%xmm2 \n"
1483 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1484 "pavgb %%xmm7,%%xmm6 \n"
1485 "lea 0x40(%0),%0 \n"
1486 "movdqa %%xmm0,%%xmm7 \n"
1487 "shufps $0x88,%%xmm1,%%xmm0 \n"
1488 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1489 "pavgb %%xmm7,%%xmm0 \n"
1490 "movdqa %%xmm2,%%xmm7 \n"
1491 "shufps $0x88,%%xmm6,%%xmm2 \n"
1492 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1493 "pavgb %%xmm7,%%xmm2 \n"
1494 "movdqa %%xmm0,%%xmm1 \n"
1495 "movdqa %%xmm2,%%xmm6 \n"
1496 "pmaddubsw %%xmm4,%%xmm0 \n"
1497 "pmaddubsw %%xmm4,%%xmm2 \n"
1498 "pmaddubsw %%xmm3,%%xmm1 \n"
1499 "pmaddubsw %%xmm3,%%xmm6 \n"
1500 "phaddw %%xmm2,%%xmm0 \n"
1501 "phaddw %%xmm6,%%xmm1 \n"
1502 "psraw $0x8,%%xmm0 \n"
1503 "psraw $0x8,%%xmm1 \n"
1504 "packsswb %%xmm1,%%xmm0 \n"
1505 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001506 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001507 "movlps %%xmm0,(%1) \n"
1508 "movhps %%xmm0,(%1,%2,1) \n"
1509 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001510 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001511 : "+r"(src_abgr0), // %0
1512 "+r"(dst_u), // %1
1513 "+r"(dst_v), // %2
1514 "+rm"(width) // %3
1515 : "r"(static_cast<intptr_t>(src_stride_abgr))
1516 : "memory", "cc"
1517#if defined(__SSE2__)
1518 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1519#endif
1520 );
1521}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001522
1523void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1524 uint8* dst_u, uint8* dst_v, int width) {
1525 asm volatile (
1526 "movdqa %0,%%xmm4 \n"
1527 "movdqa %1,%%xmm3 \n"
1528 "movdqa %2,%%xmm5 \n"
1529 :
1530 : "m"(kRGBAToU), // %0
1531 "m"(kRGBAToV), // %1
1532 "m"(kAddUV128) // %2
1533 );
1534 asm volatile (
1535 "sub %1,%2 \n"
1536 ".p2align 4 \n"
1537 "1: \n"
1538 "movdqa (%0),%%xmm0 \n"
1539 "movdqa 0x10(%0),%%xmm1 \n"
1540 "movdqa 0x20(%0),%%xmm2 \n"
1541 "movdqa 0x30(%0),%%xmm6 \n"
1542 "pavgb (%0,%4,1),%%xmm0 \n"
1543 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1544 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1545 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1546 "lea 0x40(%0),%0 \n"
1547 "movdqa %%xmm0,%%xmm7 \n"
1548 "shufps $0x88,%%xmm1,%%xmm0 \n"
1549 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1550 "pavgb %%xmm7,%%xmm0 \n"
1551 "movdqa %%xmm2,%%xmm7 \n"
1552 "shufps $0x88,%%xmm6,%%xmm2 \n"
1553 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1554 "pavgb %%xmm7,%%xmm2 \n"
1555 "movdqa %%xmm0,%%xmm1 \n"
1556 "movdqa %%xmm2,%%xmm6 \n"
1557 "pmaddubsw %%xmm4,%%xmm0 \n"
1558 "pmaddubsw %%xmm4,%%xmm2 \n"
1559 "pmaddubsw %%xmm3,%%xmm1 \n"
1560 "pmaddubsw %%xmm3,%%xmm6 \n"
1561 "phaddw %%xmm2,%%xmm0 \n"
1562 "phaddw %%xmm6,%%xmm1 \n"
1563 "psraw $0x8,%%xmm0 \n"
1564 "psraw $0x8,%%xmm1 \n"
1565 "packsswb %%xmm1,%%xmm0 \n"
1566 "paddb %%xmm5,%%xmm0 \n"
1567 "sub $0x10,%3 \n"
1568 "movlps %%xmm0,(%1) \n"
1569 "movhps %%xmm0,(%1,%2,1) \n"
1570 "lea 0x8(%1),%1 \n"
1571 "jg 1b \n"
1572 : "+r"(src_rgba0), // %0
1573 "+r"(dst_u), // %1
1574 "+r"(dst_v), // %2
1575 "+rm"(width) // %3
1576 : "r"(static_cast<intptr_t>(src_stride_rgba))
1577 : "memory", "cc"
1578#if defined(__SSE2__)
1579 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1580#endif
1581 );
1582}
1583
1584void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1585 uint8* dst_u, uint8* dst_v, int width) {
1586 asm volatile (
1587 "movdqa %0,%%xmm4 \n"
1588 "movdqa %1,%%xmm3 \n"
1589 "movdqa %2,%%xmm5 \n"
1590 :
1591 : "m"(kRGBAToU), // %0
1592 "m"(kRGBAToV), // %1
1593 "m"(kAddUV128) // %2
1594 );
1595 asm volatile (
1596 "sub %1,%2 \n"
1597 ".p2align 4 \n"
1598 "1: \n"
1599 "movdqu (%0),%%xmm0 \n"
1600 "movdqu 0x10(%0),%%xmm1 \n"
1601 "movdqu 0x20(%0),%%xmm2 \n"
1602 "movdqu 0x30(%0),%%xmm6 \n"
1603 "movdqu (%0,%4,1),%%xmm7 \n"
1604 "pavgb %%xmm7,%%xmm0 \n"
1605 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1606 "pavgb %%xmm7,%%xmm1 \n"
1607 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1608 "pavgb %%xmm7,%%xmm2 \n"
1609 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1610 "pavgb %%xmm7,%%xmm6 \n"
1611 "lea 0x40(%0),%0 \n"
1612 "movdqa %%xmm0,%%xmm7 \n"
1613 "shufps $0x88,%%xmm1,%%xmm0 \n"
1614 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1615 "pavgb %%xmm7,%%xmm0 \n"
1616 "movdqa %%xmm2,%%xmm7 \n"
1617 "shufps $0x88,%%xmm6,%%xmm2 \n"
1618 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1619 "pavgb %%xmm7,%%xmm2 \n"
1620 "movdqa %%xmm0,%%xmm1 \n"
1621 "movdqa %%xmm2,%%xmm6 \n"
1622 "pmaddubsw %%xmm4,%%xmm0 \n"
1623 "pmaddubsw %%xmm4,%%xmm2 \n"
1624 "pmaddubsw %%xmm3,%%xmm1 \n"
1625 "pmaddubsw %%xmm3,%%xmm6 \n"
1626 "phaddw %%xmm2,%%xmm0 \n"
1627 "phaddw %%xmm6,%%xmm1 \n"
1628 "psraw $0x8,%%xmm0 \n"
1629 "psraw $0x8,%%xmm1 \n"
1630 "packsswb %%xmm1,%%xmm0 \n"
1631 "paddb %%xmm5,%%xmm0 \n"
1632 "sub $0x10,%3 \n"
1633 "movlps %%xmm0,(%1) \n"
1634 "movhps %%xmm0,(%1,%2,1) \n"
1635 "lea 0x8(%1),%1 \n"
1636 "jg 1b \n"
1637 : "+r"(src_rgba0), // %0
1638 "+r"(dst_u), // %1
1639 "+r"(dst_v), // %2
1640 "+rm"(width) // %3
1641 : "r"(static_cast<intptr_t>(src_stride_rgba))
1642 : "memory", "cc"
1643#if defined(__SSE2__)
1644 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1645#endif
1646 );
1647}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001648#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001649
fbarchard@google.come214fe32012-06-04 23:47:11 +00001650#ifdef HAS_I422TOARGBROW_SSSE3
// YUV -> RGB coefficients in fixed point (scaled by 64). Negative values and
// compound expressions are parenthesized so the macros expand safely inside
// any surrounding expression.
#define UB 127 /* 2.018 * 64 = 129 exceeds the int8 range; clamped to 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V coefficients each multiplied by 128.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001665
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001666struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001667 vec8 kUVToB; // 0
1668 vec8 kUVToG; // 16
1669 vec8 kUVToR; // 32
1670 vec16 kUVBiasB; // 48
1671 vec16 kUVBiasG; // 64
1672 vec16 kUVBiasR; // 80
1673 vec16 kYSub16; // 96
1674 vec16 kYToRgb; // 112
1675 vec8 kVUToB; // 128
1676 vec8 kVUToG; // 144
1677 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001678} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001679 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1680 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1681 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1682 { BB, BB, BB, BB, BB, BB, BB, BB },
1683 { BG, BG, BG, BG, BG, BG, BG, BG },
1684 { BR, BR, BR, BR, BR, BR, BR, BR },
1685 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001686 { YG, YG, YG, YG, YG, YG, YG, YG },
1687 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1688 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1689 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001690};
1691
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001692
// Read 8 U and 8 V bytes from 444 and interleave them into xmm0.
// The V load is addressed as (u_buf + v_buf), so the v_buf operand is used as
// an offset from u_buf. NOTE(review): presumably the caller has already
// subtracted u_buf from v_buf - confirm at the use sites.
// The last line carries no trailing backslash, so the macro cannot absorb
// whatever line follows it.
#define READYUV444 \
    "movq (%[u_buf]),%%xmm0 \n" \
    "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x8(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001699
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 U and 4 V bytes, interleaves them, then punpcklwd duplicates each
// UV pair so xmm0 holds 8 UV pairs.
// The last line carries no trailing backslash, so the macro cannot absorb
// whatever line follows it.
#define READYUV422 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x4(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001707
// Read 2 UV from 411, upsample to 8 UV.
// punpcklwd then punpckldq replicate each UV pair four times.
// NOTE(review): movd loads 4 bytes from each plane while u_buf advances by
// only 2, so the final iteration reads 2 bytes beyond the bytes consumed -
// confirm the row buffers tolerate that.
// The last line carries no trailing backslash, so the macro cannot absorb
// whatever line follows it.
#define READYUV411 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x2(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001716
// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 bytes from uv_buf and duplicates each 16-bit pair with punpcklwd.
// The last line carries no trailing backslash, so the macro cannot absorb
// whatever line follows it.
#define READNV12 \
    "movq (%[uv_buf]),%%xmm0 \n" \
    "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001722
// Convert 8 pixels: 8 UV and 8 Y.
// Input: xmm0 holds 8 interleaved UV pairs; 8 Y bytes are read from y_buf,
// which is advanced by 8. Offsets 0/16/32 (kUVToB/G/R) leave the B, G and R
// results in xmm0, xmm1 and xmm2 as packed unsigned bytes; xmm3 is scratch.
// NOTE(review): punpcklbw with xmm4 widens Y to 16 bits, which presumably
// relies on xmm4 being zero on entry - confirm at the use sites.
// The last line carries no trailing backslash, so the macro cannot absorb
// whatever line follows it.
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
1747
// Convert 8 pixels: 8 VU and 8 Y.
// Same computation as the UV-ordered converter, except that the pmaddubsw
// coefficients are read from table offsets 128/144/160 instead of 0/16/32.
// NOTE(review): presumably those are the same coefficients with the two
// chroma bytes swapped - confirm against the kYuvConstants definition.
// Inputs:  xmm0 = 8 VU byte pairs, xmm4 = all zero, %[y_buf] = 8 Y bytes,
//          %[kYuvConstants] = base of the constant table.
// Outputs: xmm0, xmm1, xmm2 each hold 8 result bytes (one per pixel), after a
// 6-bit arithmetic right shift and unsigned saturation (packuswb).
// Advances %[y_buf] by 8 bytes. Uses xmm3 as scratch.
#define YVUTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                               \n"                \
    "movdqa     %%xmm0,%%xmm2                               \n"                \
    "pmaddubsw  128(%[kYuvConstants]),%%xmm0                \n"                \
    "pmaddubsw  144(%[kYuvConstants]),%%xmm1                \n"                \
    "pmaddubsw  160(%[kYuvConstants]),%%xmm2                \n"                \
    "psubw      48(%[kYuvConstants]),%%xmm0                 \n"                \
    "psubw      64(%[kYuvConstants]),%%xmm1                 \n"                \
    "psubw      80(%[kYuvConstants]),%%xmm2                 \n"                \
    "movq       (%[y_buf]),%%xmm3                           \n"                \
    "lea        0x8(%[y_buf]),%[y_buf]                      \n"                \
    "punpcklbw  %%xmm4,%%xmm3                               \n"                \
    "psubsw     96(%[kYuvConstants]),%%xmm3                 \n"                \
    "pmullw     112(%[kYuvConstants]),%%xmm3                \n"                \
    "paddsw     %%xmm3,%%xmm0                               \n"                \
    "paddsw     %%xmm3,%%xmm1                               \n"                \
    "paddsw     %%xmm3,%%xmm2                               \n"                \
    "psraw      $0x6,%%xmm0                                 \n"                \
    "psraw      $0x6,%%xmm1                                 \n"                \
    "psraw      $0x6,%%xmm2                                 \n"                \
    "packuswb   %%xmm0,%%xmm0                               \n"                \
    "packuswb   %%xmm1,%%xmm1                               \n"                \
    "packuswb   %%xmm2,%%xmm2                               \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001772
1773void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001774 const uint8* u_buf,
1775 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001776 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001777 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001778 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001779 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001780 "pcmpeqb %%xmm5,%%xmm5 \n"
1781 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001782 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001783 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001784 READYUV444
1785 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001786 "punpcklbw %%xmm1,%%xmm0 \n"
1787 "punpcklbw %%xmm5,%%xmm2 \n"
1788 "movdqa %%xmm0,%%xmm1 \n"
1789 "punpcklwd %%xmm2,%%xmm0 \n"
1790 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001791 "movdqa %%xmm0,(%[dst_argb]) \n"
1792 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
1793 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001794 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001795 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001796 : [y_buf]"+r"(y_buf), // %[y_buf]
1797 [u_buf]"+r"(u_buf), // %[u_buf]
1798 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001799 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001800 [width]"+rm"(width) // %[width]
1801 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001802 : "memory", "cc"
1803#if defined(__SSE2__)
1804 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1805#endif
1806 );
1807}
1808
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001809void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1810 const uint8* u_buf,
1811 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001812 uint8* dst_rgb24,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001813 int width) {
1814// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
1815#ifdef __APPLE__
1816 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001817 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1818 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1819 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1820 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001821#endif
1822
1823 asm volatile (
1824#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001825 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1826 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001827#endif
1828 "sub %[u_buf],%[v_buf] \n"
1829 "pxor %%xmm4,%%xmm4 \n"
1830 ".p2align 4 \n"
1831 "1: \n"
1832 READYUV422
1833 YUVTORGB
1834 "punpcklbw %%xmm1,%%xmm0 \n"
1835 "punpcklbw %%xmm2,%%xmm2 \n"
1836 "movdqa %%xmm0,%%xmm1 \n"
1837 "punpcklwd %%xmm2,%%xmm0 \n"
1838 "punpckhwd %%xmm2,%%xmm1 \n"
1839 "pshufb %%xmm5,%%xmm0 \n"
1840 "pshufb %%xmm6,%%xmm1 \n"
1841 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001842 "movq %%xmm0,(%[dst_rgb24]) \n"
1843 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
1844 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001845 "sub $0x8,%[width] \n"
1846 "jg 1b \n"
1847 : [y_buf]"+r"(y_buf), // %[y_buf]
1848 [u_buf]"+r"(u_buf), // %[u_buf]
1849 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001850 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001851 [width]"+rm"(width) // %[width]
1852 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1853#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001854 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1855 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001856#endif
1857 : "memory", "cc"
1858#if defined(__SSE2__)
1859 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1860#endif
1861 );
1862}
1863
1864void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1865 const uint8* u_buf,
1866 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001867 uint8* dst_raw,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001868 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001869// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001870#ifdef __APPLE__
1871 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001872 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1873 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1874 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1875 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001876#endif
1877
1878 asm volatile (
1879#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001880 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1881 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001882#endif
1883 "sub %[u_buf],%[v_buf] \n"
1884 "pxor %%xmm4,%%xmm4 \n"
1885 ".p2align 4 \n"
1886 "1: \n"
1887 READYUV422
1888 YUVTORGB
1889 "punpcklbw %%xmm1,%%xmm0 \n"
1890 "punpcklbw %%xmm2,%%xmm2 \n"
1891 "movdqa %%xmm0,%%xmm1 \n"
1892 "punpcklwd %%xmm2,%%xmm0 \n"
1893 "punpckhwd %%xmm2,%%xmm1 \n"
1894 "pshufb %%xmm5,%%xmm0 \n"
1895 "pshufb %%xmm6,%%xmm1 \n"
1896 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001897 "movq %%xmm0,(%[dst_raw]) \n"
1898 "movdqu %%xmm1,0x8(%[dst_raw]) \n"
1899 "lea 0x18(%[dst_raw]),%[dst_raw] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001900 "sub $0x8,%[width] \n"
1901 "jg 1b \n"
1902 : [y_buf]"+r"(y_buf), // %[y_buf]
1903 [u_buf]"+r"(u_buf), // %[u_buf]
1904 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001905 [dst_raw]"+r"(dst_raw), // %[dst_raw]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001906 [width]"+rm"(width) // %[width]
1907 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1908#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001909 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1910 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001911#endif
1912 : "memory", "cc"
1913#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001914 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001915#endif
1916 );
1917}
1918
fbarchard@google.come214fe32012-06-04 23:47:11 +00001919void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001920 const uint8* u_buf,
1921 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001922 uint8* dst_argb,
fbarchard@google.comdbcabea2012-10-29 21:20:25 +00001923 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001924 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001925 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001926 "pcmpeqb %%xmm5,%%xmm5 \n"
1927 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001928 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001929 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001930 READYUV422
1931 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001932 "punpcklbw %%xmm1,%%xmm0 \n"
1933 "punpcklbw %%xmm5,%%xmm2 \n"
1934 "movdqa %%xmm0,%%xmm1 \n"
1935 "punpcklwd %%xmm2,%%xmm0 \n"
1936 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001937 "movdqa %%xmm0,(%[dst_argb]) \n"
1938 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
1939 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001940 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001941 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001942 : [y_buf]"+r"(y_buf), // %[y_buf]
1943 [u_buf]"+r"(u_buf), // %[u_buf]
1944 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001945 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001946 [width]"+rm"(width) // %[width]
1947 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001948 : "memory", "cc"
1949#if defined(__SSE2__)
1950 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1951#endif
1952 );
1953}
1954
1955void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1956 const uint8* u_buf,
1957 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001958 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001959 int width) {
1960 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001961 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001962 "pcmpeqb %%xmm5,%%xmm5 \n"
1963 "pxor %%xmm4,%%xmm4 \n"
1964 ".p2align 4 \n"
1965 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001966 READYUV411
1967 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001968 "punpcklbw %%xmm1,%%xmm0 \n"
1969 "punpcklbw %%xmm5,%%xmm2 \n"
1970 "movdqa %%xmm0,%%xmm1 \n"
1971 "punpcklwd %%xmm2,%%xmm0 \n"
1972 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001973 "movdqa %%xmm0,(%[dst_argb]) \n"
1974 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
1975 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001976 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001977 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001978 : [y_buf]"+r"(y_buf), // %[y_buf]
1979 [u_buf]"+r"(u_buf), // %[u_buf]
1980 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001981 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001982 [width]"+rm"(width) // %[width]
1983 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1984 : "memory", "cc"
1985#if defined(__SSE2__)
1986 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1987#endif
1988 );
1989}
1990
1991void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1992 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001993 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001994 int width) {
1995 asm volatile (
1996 "pcmpeqb %%xmm5,%%xmm5 \n"
1997 "pxor %%xmm4,%%xmm4 \n"
1998 ".p2align 4 \n"
1999 "1: \n"
2000 READNV12
2001 YUVTORGB
2002 "punpcklbw %%xmm1,%%xmm0 \n"
2003 "punpcklbw %%xmm5,%%xmm2 \n"
2004 "movdqa %%xmm0,%%xmm1 \n"
2005 "punpcklwd %%xmm2,%%xmm0 \n"
2006 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002007 "movdqa %%xmm0,(%[dst_argb]) \n"
2008 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2009 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002010 "sub $0x8,%[width] \n"
2011 "jg 1b \n"
2012 : [y_buf]"+r"(y_buf), // %[y_buf]
2013 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002014 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002015 [width]"+rm"(width) // %[width]
2016 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2017 : "memory", "cc"
2018#if defined(__SSE2__)
2019 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2020#endif
2021 );
2022}
2023
2024void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002025 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002026 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002027 int width) {
2028 asm volatile (
2029 "pcmpeqb %%xmm5,%%xmm5 \n"
2030 "pxor %%xmm4,%%xmm4 \n"
2031 ".p2align 4 \n"
2032 "1: \n"
2033 READNV12
2034 YVUTORGB
2035 "punpcklbw %%xmm1,%%xmm0 \n"
2036 "punpcklbw %%xmm5,%%xmm2 \n"
2037 "movdqa %%xmm0,%%xmm1 \n"
2038 "punpcklwd %%xmm2,%%xmm0 \n"
2039 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002040 "movdqa %%xmm0,(%[dst_argb]) \n"
2041 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2042 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002043 "sub $0x8,%[width] \n"
2044 "jg 1b \n"
2045 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002046 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2047 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002048 [width]"+rm"(width) // %[width]
2049 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002050 : "memory", "cc"
2051#if defined(__SSE2__)
2052 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2053#endif
2054 );
2055}
2056
2057void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2058 const uint8* u_buf,
2059 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002060 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002061 int width) {
2062 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002063 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002064 "pcmpeqb %%xmm5,%%xmm5 \n"
2065 "pxor %%xmm4,%%xmm4 \n"
2066 ".p2align 4 \n"
2067 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002068 READYUV444
2069 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002070 "punpcklbw %%xmm1,%%xmm0 \n"
2071 "punpcklbw %%xmm5,%%xmm2 \n"
2072 "movdqa %%xmm0,%%xmm1 \n"
2073 "punpcklwd %%xmm2,%%xmm0 \n"
2074 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002075 "movdqu %%xmm0,(%[dst_argb]) \n"
2076 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2077 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002078 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002079 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002080 : [y_buf]"+r"(y_buf), // %[y_buf]
2081 [u_buf]"+r"(u_buf), // %[u_buf]
2082 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002083 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002084 [width]"+rm"(width) // %[width]
2085 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002086 : "memory", "cc"
2087#if defined(__SSE2__)
2088 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2089#endif
2090 );
2091}
2092
2093void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2094 const uint8* u_buf,
2095 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002096 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002097 int width) {
2098 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002099 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002100 "pcmpeqb %%xmm5,%%xmm5 \n"
2101 "pxor %%xmm4,%%xmm4 \n"
2102 ".p2align 4 \n"
2103 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002104 READYUV422
2105 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002106 "punpcklbw %%xmm1,%%xmm0 \n"
2107 "punpcklbw %%xmm5,%%xmm2 \n"
2108 "movdqa %%xmm0,%%xmm1 \n"
2109 "punpcklwd %%xmm2,%%xmm0 \n"
2110 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002111 "movdqu %%xmm0,(%[dst_argb]) \n"
2112 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2113 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002114 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002115 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002116 : [y_buf]"+r"(y_buf), // %[y_buf]
2117 [u_buf]"+r"(u_buf), // %[u_buf]
2118 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002119 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002120 [width]"+rm"(width) // %[width]
2121 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002122 : "memory", "cc"
2123#if defined(__SSE2__)
2124 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2125#endif
2126 );
2127}
2128
2129void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2130 const uint8* u_buf,
2131 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002132 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002133 int width) {
2134 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002135 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002136 "pcmpeqb %%xmm5,%%xmm5 \n"
2137 "pxor %%xmm4,%%xmm4 \n"
2138 ".p2align 4 \n"
2139 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002140 READYUV411
2141 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002142 "punpcklbw %%xmm1,%%xmm0 \n"
2143 "punpcklbw %%xmm5,%%xmm2 \n"
2144 "movdqa %%xmm0,%%xmm1 \n"
2145 "punpcklwd %%xmm2,%%xmm0 \n"
2146 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002147 "movdqu %%xmm0,(%[dst_argb]) \n"
2148 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2149 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002150 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002151 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002152 : [y_buf]"+r"(y_buf), // %[y_buf]
2153 [u_buf]"+r"(u_buf), // %[u_buf]
2154 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002155 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002156 [width]"+rm"(width) // %[width]
2157 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2158 : "memory", "cc"
2159#if defined(__SSE2__)
2160 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2161#endif
2162 );
2163}
2164
2165void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2166 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002167 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002168 int width) {
2169 asm volatile (
2170 "pcmpeqb %%xmm5,%%xmm5 \n"
2171 "pxor %%xmm4,%%xmm4 \n"
2172 ".p2align 4 \n"
2173 "1: \n"
2174 READNV12
2175 YUVTORGB
2176 "punpcklbw %%xmm1,%%xmm0 \n"
2177 "punpcklbw %%xmm5,%%xmm2 \n"
2178 "movdqa %%xmm0,%%xmm1 \n"
2179 "punpcklwd %%xmm2,%%xmm0 \n"
2180 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002181 "movdqu %%xmm0,(%[dst_argb]) \n"
2182 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2183 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002184 "sub $0x8,%[width] \n"
2185 "jg 1b \n"
2186 : [y_buf]"+r"(y_buf), // %[y_buf]
2187 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002188 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002189 [width]"+rm"(width) // %[width]
2190 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2191 : "memory", "cc"
2192#if defined(__SSE2__)
2193 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2194#endif
2195 );
2196}
2197
2198void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002199 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002200 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002201 int width) {
2202 asm volatile (
2203 "pcmpeqb %%xmm5,%%xmm5 \n"
2204 "pxor %%xmm4,%%xmm4 \n"
2205 ".p2align 4 \n"
2206 "1: \n"
2207 READNV12
2208 YVUTORGB
2209 "punpcklbw %%xmm1,%%xmm0 \n"
2210 "punpcklbw %%xmm5,%%xmm2 \n"
2211 "movdqa %%xmm0,%%xmm1 \n"
2212 "punpcklwd %%xmm2,%%xmm0 \n"
2213 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002214 "movdqu %%xmm0,(%[dst_argb]) \n"
2215 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2216 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002217 "sub $0x8,%[width] \n"
2218 "jg 1b \n"
2219 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002220 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2221 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002222 [width]"+rm"(width) // %[width]
2223 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002224 : "memory", "cc"
2225#if defined(__SSE2__)
2226 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2227#endif
2228 );
2229}
2230
2231void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2232 const uint8* u_buf,
2233 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002234 uint8* dst_bgra,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002235 int width) {
2236 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002237 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002238 "pcmpeqb %%xmm5,%%xmm5 \n"
2239 "pxor %%xmm4,%%xmm4 \n"
2240 ".p2align 4 \n"
2241 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002242 READYUV422
2243 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002244 "pcmpeqb %%xmm5,%%xmm5 \n"
2245 "punpcklbw %%xmm0,%%xmm1 \n"
2246 "punpcklbw %%xmm2,%%xmm5 \n"
2247 "movdqa %%xmm5,%%xmm0 \n"
2248 "punpcklwd %%xmm1,%%xmm5 \n"
2249 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002250 "movdqa %%xmm5,(%[dst_bgra]) \n"
2251 "movdqa %%xmm0,0x10(%[dst_bgra]) \n"
2252 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002253 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002254 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002255 : [y_buf]"+r"(y_buf), // %[y_buf]
2256 [u_buf]"+r"(u_buf), // %[u_buf]
2257 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002258 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002259 [width]"+rm"(width) // %[width]
2260 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002261 : "memory", "cc"
2262#if defined(__SSE2__)
2263 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2264#endif
2265 );
2266}
2267
fbarchard@google.come214fe32012-06-04 23:47:11 +00002268void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002269 const uint8* u_buf,
2270 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002271 uint8* dst_abgr,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002272 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002273 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002274 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002275 "pcmpeqb %%xmm5,%%xmm5 \n"
2276 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002277 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002278 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002279 READYUV422
2280 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002281 "punpcklbw %%xmm1,%%xmm2 \n"
2282 "punpcklbw %%xmm5,%%xmm0 \n"
2283 "movdqa %%xmm2,%%xmm1 \n"
2284 "punpcklwd %%xmm0,%%xmm2 \n"
2285 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002286 "movdqa %%xmm2,(%[dst_abgr]) \n"
2287 "movdqa %%xmm1,0x10(%[dst_abgr]) \n"
2288 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002289 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002290 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002291 : [y_buf]"+r"(y_buf), // %[y_buf]
2292 [u_buf]"+r"(u_buf), // %[u_buf]
2293 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002294 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002295 [width]"+rm"(width) // %[width]
2296 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002297 : "memory", "cc"
2298#if defined(__SSE2__)
2299 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2300#endif
2301 );
2302}
2303
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002304void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2305 const uint8* u_buf,
2306 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002307 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002308 int width) {
2309 asm volatile (
2310 "sub %[u_buf],%[v_buf] \n"
2311 "pcmpeqb %%xmm5,%%xmm5 \n"
2312 "pxor %%xmm4,%%xmm4 \n"
2313 ".p2align 4 \n"
2314 "1: \n"
2315 READYUV422
2316 YUVTORGB
2317 "pcmpeqb %%xmm5,%%xmm5 \n"
2318 "punpcklbw %%xmm2,%%xmm1 \n"
2319 "punpcklbw %%xmm0,%%xmm5 \n"
2320 "movdqa %%xmm5,%%xmm0 \n"
2321 "punpcklwd %%xmm1,%%xmm5 \n"
2322 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002323 "movdqa %%xmm5,(%[dst_rgba]) \n"
2324 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2325 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002326 "sub $0x8,%[width] \n"
2327 "jg 1b \n"
2328 : [y_buf]"+r"(y_buf), // %[y_buf]
2329 [u_buf]"+r"(u_buf), // %[u_buf]
2330 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002331 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002332 [width]"+rm"(width) // %[width]
2333 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2334 : "memory", "cc"
2335#if defined(__SSE2__)
2336 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2337#endif
2338 );
2339}
2340
fbarchard@google.come214fe32012-06-04 23:47:11 +00002341void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002342 const uint8* u_buf,
2343 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002344 uint8* dst_bgra,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002345 int width) {
2346 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002347 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002348 "pcmpeqb %%xmm5,%%xmm5 \n"
2349 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002350 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002351 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002352 READYUV422
2353 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002354 "pcmpeqb %%xmm5,%%xmm5 \n"
2355 "punpcklbw %%xmm0,%%xmm1 \n"
2356 "punpcklbw %%xmm2,%%xmm5 \n"
2357 "movdqa %%xmm5,%%xmm0 \n"
2358 "punpcklwd %%xmm1,%%xmm5 \n"
2359 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002360 "movdqu %%xmm5,(%[dst_bgra]) \n"
2361 "movdqu %%xmm0,0x10(%[dst_bgra]) \n"
2362 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002363 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002364 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002365 : [y_buf]"+r"(y_buf), // %[y_buf]
2366 [u_buf]"+r"(u_buf), // %[u_buf]
2367 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002368 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002369 [width]"+rm"(width) // %[width]
2370 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002371 : "memory", "cc"
2372#if defined(__SSE2__)
2373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2374#endif
2375 );
2376}
2377
fbarchard@google.come214fe32012-06-04 23:47:11 +00002378void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002379 const uint8* u_buf,
2380 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002381 uint8* dst_abgr,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002382 int width) {
2383 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002384 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002385 "pcmpeqb %%xmm5,%%xmm5 \n"
2386 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002387 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002388 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002389 READYUV422
2390 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002391 "punpcklbw %%xmm1,%%xmm2 \n"
2392 "punpcklbw %%xmm5,%%xmm0 \n"
2393 "movdqa %%xmm2,%%xmm1 \n"
2394 "punpcklwd %%xmm0,%%xmm2 \n"
2395 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002396 "movdqu %%xmm2,(%[dst_abgr]) \n"
2397 "movdqu %%xmm1,0x10(%[dst_abgr]) \n"
2398 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002399 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002400 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002401 : [y_buf]"+r"(y_buf), // %[y_buf]
2402 [u_buf]"+r"(u_buf), // %[u_buf]
2403 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002404 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002405 [width]"+rm"(width) // %[width]
2406 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002407 : "memory", "cc"
2408#if defined(__SSE2__)
2409 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2410#endif
2411 );
2412}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002413
2414void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2415 const uint8* u_buf,
2416 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002417 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002418 int width) {
2419 asm volatile (
2420 "sub %[u_buf],%[v_buf] \n"
2421 "pcmpeqb %%xmm5,%%xmm5 \n"
2422 "pxor %%xmm4,%%xmm4 \n"
2423 ".p2align 4 \n"
2424 "1: \n"
2425 READYUV422
2426 YUVTORGB
2427 "pcmpeqb %%xmm5,%%xmm5 \n"
2428 "punpcklbw %%xmm2,%%xmm1 \n"
2429 "punpcklbw %%xmm0,%%xmm5 \n"
2430 "movdqa %%xmm5,%%xmm0 \n"
2431 "punpcklwd %%xmm1,%%xmm5 \n"
2432 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002433 "movdqa %%xmm5,(%[dst_rgba]) \n"
2434 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2435 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002436 "sub $0x8,%[width] \n"
2437 "jg 1b \n"
2438 : [y_buf]"+r"(y_buf), // %[y_buf]
2439 [u_buf]"+r"(u_buf), // %[u_buf]
2440 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002441 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002442 [width]"+rm"(width) // %[width]
2443 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2444 : "memory", "cc"
2445#if defined(__SSE2__)
2446 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2447#endif
2448 );
2449}
2450
fbarchard@google.come214fe32012-06-04 23:47:11 +00002451#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002452
2453#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002454void YToARGBRow_SSE2(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002455 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002456 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002457 asm volatile (
fbarchard@google.com30859f72012-11-02 09:51:29 +00002458 "pxor %%xmm5,%%xmm5 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002459 "pcmpeqb %%xmm4,%%xmm4 \n"
2460 "pslld $0x18,%%xmm4 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002461 "mov $0x00100010,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002462 "movd %%eax,%%xmm3 \n"
2463 "pshufd $0x0,%%xmm3,%%xmm3 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002464 "mov $0x004a004a,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002465 "movd %%eax,%%xmm2 \n"
2466 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002467 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002468 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002469 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002470 "movq (%0),%%xmm0 \n"
2471 "lea 0x8(%0),%0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002472 "punpcklbw %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002473 "psubusw %%xmm3,%%xmm0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002474 "pmullw %%xmm2,%%xmm0 \n"
2475 "psrlw $6, %%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002476 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002477
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002478 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002479 "punpcklbw %%xmm0,%%xmm0 \n"
2480 "movdqa %%xmm0,%%xmm1 \n"
2481 "punpcklwd %%xmm0,%%xmm0 \n"
2482 "punpckhwd %%xmm1,%%xmm1 \n"
2483 "por %%xmm4,%%xmm0 \n"
2484 "por %%xmm4,%%xmm1 \n"
2485 "movdqa %%xmm0,(%1) \n"
2486 "movdqa %%xmm1,16(%1) \n"
2487 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002488
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002489 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002490 "jg 1b \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002491 : "+r"(y_buf), // %0
2492 "+r"(dst_argb), // %1
2493 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002494 :
2495 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002496#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002497 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002498#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002499 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002500}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002501#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002502
fbarchard@google.com42831e02012-01-21 02:54:17 +00002503#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002504// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002505CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002506 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2507};
2508
fbarchard@google.com42831e02012-01-21 02:54:17 +00002509void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002510 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002511 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002512 "movdqa %3,%%xmm5 \n"
2513 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002514 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002515 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002516 "movdqa (%0,%2),%%xmm0 \n"
2517 "pshufb %%xmm5,%%xmm0 \n"
2518 "sub $0x10,%2 \n"
2519 "movdqa %%xmm0,(%1) \n"
2520 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002521 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002522 : "+r"(src), // %0
2523 "+r"(dst), // %1
2524 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002525 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002526 : "memory", "cc"
2527#if defined(__SSE2__)
2528 , "xmm0", "xmm5"
2529#endif
2530 );
2531}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002532#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002533
fbarchard@google.com42831e02012-01-21 02:54:17 +00002534#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002535void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002536 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002537 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002538 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002539 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002540 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002541 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002542 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002543 "psllw $0x8,%%xmm0 \n"
2544 "psrlw $0x8,%%xmm1 \n"
2545 "por %%xmm1,%%xmm0 \n"
2546 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2547 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2548 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2549 "sub $0x10,%2 \n"
2550 "movdqu %%xmm0,(%1) \n"
2551 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002552 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002553 : "+r"(src), // %0
2554 "+r"(dst), // %1
2555 "+r"(temp_width) // %2
2556 :
2557 : "memory", "cc"
2558#if defined(__SSE2__)
2559 , "xmm0", "xmm1"
2560#endif
2561 );
2562}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002563#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002564
fbarchard@google.com16a96642012-03-02 22:38:09 +00002565#ifdef HAS_MIRRORROW_UV_SSSE3
2566// Shuffle table for reversing the bytes of UV channels.
2567CONST uvec8 kShuffleMirrorUV = {
2568 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2569};
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002570void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
fbarchard@google.com16a96642012-03-02 22:38:09 +00002571 int width) {
2572 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002573 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002574 "movdqa %4,%%xmm1 \n"
2575 "lea -16(%0,%3,2),%0 \n"
2576 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002577 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002578 "1: \n"
2579 "movdqa (%0),%%xmm0 \n"
2580 "lea -16(%0),%0 \n"
2581 "pshufb %%xmm1,%%xmm0 \n"
2582 "sub $8,%3 \n"
2583 "movlpd %%xmm0,(%1) \n"
2584 "movhpd %%xmm0,(%1,%2) \n"
2585 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002586 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002587 : "+r"(src), // %0
2588 "+r"(dst_u), // %1
2589 "+r"(dst_v), // %2
2590 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002591 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002592 : "memory", "cc"
2593#if defined(__SSE2__)
2594 , "xmm0", "xmm1"
2595#endif
2596 );
2597}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002598#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002599
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002600#ifdef HAS_ARGBMIRRORROW_SSSE3
2601// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002602CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002603 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2604};
2605
2606void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2607 intptr_t temp_width = static_cast<intptr_t>(width);
2608 asm volatile (
2609 "movdqa %3,%%xmm5 \n"
2610 "lea -0x10(%0),%0 \n"
2611 ".p2align 4 \n"
2612 "1: \n"
2613 "movdqa (%0,%2,4),%%xmm0 \n"
2614 "pshufb %%xmm5,%%xmm0 \n"
2615 "sub $0x4,%2 \n"
2616 "movdqa %%xmm0,(%1) \n"
2617 "lea 0x10(%1),%1 \n"
2618 "jg 1b \n"
2619 : "+r"(src), // %0
2620 "+r"(dst), // %1
2621 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002622 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002623 : "memory", "cc"
2624#if defined(__SSE2__)
2625 , "xmm0", "xmm5"
2626#endif
2627 );
2628}
2629#endif // HAS_ARGBMIRRORROW_SSSE3
2630
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002631#ifdef HAS_SPLITUVROW_SSE2
2632void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002633 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002634 "pcmpeqb %%xmm5,%%xmm5 \n"
2635 "psrlw $0x8,%%xmm5 \n"
2636 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002637 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002638 "1: \n"
2639 "movdqa (%0),%%xmm0 \n"
2640 "movdqa 0x10(%0),%%xmm1 \n"
2641 "lea 0x20(%0),%0 \n"
2642 "movdqa %%xmm0,%%xmm2 \n"
2643 "movdqa %%xmm1,%%xmm3 \n"
2644 "pand %%xmm5,%%xmm0 \n"
2645 "pand %%xmm5,%%xmm1 \n"
2646 "packuswb %%xmm1,%%xmm0 \n"
2647 "psrlw $0x8,%%xmm2 \n"
2648 "psrlw $0x8,%%xmm3 \n"
2649 "packuswb %%xmm3,%%xmm2 \n"
2650 "movdqa %%xmm0,(%1) \n"
2651 "movdqa %%xmm2,(%1,%2) \n"
2652 "lea 0x10(%1),%1 \n"
2653 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002654 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002655 : "+r"(src_uv), // %0
2656 "+r"(dst_u), // %1
2657 "+r"(dst_v), // %2
2658 "+r"(pix) // %3
2659 :
2660 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002661#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002662 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002663#endif
2664 );
2665}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002666
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002667void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2668 int pix) {
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002669 asm volatile (
2670 "pcmpeqb %%xmm5,%%xmm5 \n"
2671 "psrlw $0x8,%%xmm5 \n"
2672 "sub %1,%2 \n"
2673 ".p2align 4 \n"
2674 "1: \n"
2675 "movdqu (%0),%%xmm0 \n"
2676 "movdqu 0x10(%0),%%xmm1 \n"
2677 "lea 0x20(%0),%0 \n"
2678 "movdqa %%xmm0,%%xmm2 \n"
2679 "movdqa %%xmm1,%%xmm3 \n"
2680 "pand %%xmm5,%%xmm0 \n"
2681 "pand %%xmm5,%%xmm1 \n"
2682 "packuswb %%xmm1,%%xmm0 \n"
2683 "psrlw $0x8,%%xmm2 \n"
2684 "psrlw $0x8,%%xmm3 \n"
2685 "packuswb %%xmm3,%%xmm2 \n"
2686 "movdqu %%xmm0,(%1) \n"
2687 "movdqu %%xmm2,(%1,%2) \n"
2688 "lea 0x10(%1),%1 \n"
2689 "sub $0x10,%3 \n"
2690 "jg 1b \n"
2691 : "+r"(src_uv), // %0
2692 "+r"(dst_u), // %1
2693 "+r"(dst_v), // %2
2694 "+r"(pix) // %3
2695 :
2696 : "memory", "cc"
2697#if defined(__SSE2__)
2698 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2699#endif
2700 );
2701}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002702#endif // HAS_SPLITUVROW_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002703
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002704#ifdef HAS_MERGEUVROW_SSE2
2705void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2706 int width) {
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002707 asm volatile (
2708 "sub %0,%1 \n"
2709 ".p2align 4 \n"
2710 "1: \n"
2711 "movdqa (%0),%%xmm0 \n"
2712 "movdqa (%0,%1,1),%%xmm1 \n"
2713 "lea 0x10(%0),%0 \n"
2714 "movdqa %%xmm0,%%xmm2 \n"
2715 "punpcklbw %%xmm1,%%xmm0 \n"
2716 "punpckhbw %%xmm1,%%xmm2 \n"
2717 "movdqa %%xmm0,(%2) \n"
2718 "movdqa %%xmm2,0x10(%2) \n"
2719 "lea 0x20(%2),%2 \n"
2720 "sub $0x10,%3 \n"
2721 "jg 1b \n"
2722 : "+r"(src_u), // %0
2723 "+r"(src_v), // %1
2724 "+r"(dst_uv), // %2
2725 "+r"(width) // %3
2726 :
2727 : "memory", "cc"
2728#if defined(__SSE2__)
2729 , "xmm0", "xmm1", "xmm2"
2730#endif
2731 );
2732}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002733
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002734void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2735 uint8* dst_uv, int width) {
fbarchard@google.come0d86482012-10-27 19:07:55 +00002736 asm volatile (
2737 "sub %0,%1 \n"
2738 ".p2align 4 \n"
2739 "1: \n"
2740 "movdqu (%0),%%xmm0 \n"
2741 "movdqu (%0,%1,1),%%xmm1 \n"
2742 "lea 0x10(%0),%0 \n"
2743 "movdqa %%xmm0,%%xmm2 \n"
2744 "punpcklbw %%xmm1,%%xmm0 \n"
2745 "punpckhbw %%xmm1,%%xmm2 \n"
2746 "movdqu %%xmm0,(%2) \n"
2747 "movdqu %%xmm2,0x10(%2) \n"
2748 "lea 0x20(%2),%2 \n"
2749 "sub $0x10,%3 \n"
2750 "jg 1b \n"
2751 : "+r"(src_u), // %0
2752 "+r"(src_v), // %1
2753 "+r"(dst_uv), // %2
2754 "+r"(width) // %3
2755 :
2756 : "memory", "cc"
2757#if defined(__SSE2__)
2758 , "xmm0", "xmm1", "xmm2"
2759#endif
2760 );
2761}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002762#endif // HAS_MERGEUVROW_SSE2
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002763
fbarchard@google.com19932f82012-02-16 22:19:14 +00002764#ifdef HAS_COPYROW_SSE2
2765void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002766 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002767 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002768 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002769 "1: \n"
2770 "movdqa (%0),%%xmm0 \n"
2771 "movdqa 0x10(%0),%%xmm1 \n"
2772 "movdqa %%xmm0,(%0,%1) \n"
2773 "movdqa %%xmm1,0x10(%0,%1) \n"
2774 "lea 0x20(%0),%0 \n"
2775 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002776 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002777 : "+r"(src), // %0
2778 "+r"(dst), // %1
2779 "+r"(count) // %2
2780 :
2781 : "memory", "cc"
2782#if defined(__SSE2__)
2783 , "xmm0", "xmm1"
2784#endif
2785 );
2786}
2787#endif // HAS_COPYROW_SSE2
2788
2789#ifdef HAS_COPYROW_X86
2790void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2791 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002792 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002793 "shr $0x2,%2 \n"
2794 "rep movsl \n"
2795 : "+S"(src), // %0
2796 "+D"(dst), // %1
2797 "+c"(width_tmp) // %2
2798 :
2799 : "memory", "cc"
2800 );
2801}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002802#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002803
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002804#ifdef HAS_SETROW_X86
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002805void SetRow_X86(uint8* dst, uint32 v32, int width) {
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002806 size_t width_tmp = static_cast<size_t>(width);
2807 asm volatile (
2808 "shr $0x2,%1 \n"
2809 "rep stosl \n"
2810 : "+D"(dst), // %0
2811 "+c"(width_tmp) // %1
2812 : "a"(v32) // %2
2813 : "memory", "cc");
2814}
2815
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002816void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002817 int dst_stride, int height) {
2818 for (int y = 0; y < height; ++y) {
2819 size_t width_tmp = static_cast<size_t>(width);
2820 uint32* d = reinterpret_cast<uint32*>(dst);
2821 asm volatile (
2822 "rep stosl \n"
2823 : "+D"(d), // %0
2824 "+c"(width_tmp) // %1
2825 : "a"(v32) // %2
2826 : "memory", "cc");
2827 dst += dst_stride;
2828 }
2829}
2830#endif // HAS_SETROW_X86
2831
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002832#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002833void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002834 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002835 "pcmpeqb %%xmm5,%%xmm5 \n"
2836 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002837 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002838 "1: \n"
2839 "movdqa (%0),%%xmm0 \n"
2840 "movdqa 0x10(%0),%%xmm1 \n"
2841 "lea 0x20(%0),%0 \n"
2842 "pand %%xmm5,%%xmm0 \n"
2843 "pand %%xmm5,%%xmm1 \n"
2844 "packuswb %%xmm1,%%xmm0 \n"
2845 "movdqa %%xmm0,(%1) \n"
2846 "lea 0x10(%1),%1 \n"
2847 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002848 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002849 : "+r"(src_yuy2), // %0
2850 "+r"(dst_y), // %1
2851 "+r"(pix) // %2
2852 :
2853 : "memory", "cc"
2854#if defined(__SSE2__)
2855 , "xmm0", "xmm1", "xmm5"
2856#endif
2857 );
2858}
2859
2860void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002861 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002862 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002863 "pcmpeqb %%xmm5,%%xmm5 \n"
2864 "psrlw $0x8,%%xmm5 \n"
2865 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002866 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002867 "1: \n"
2868 "movdqa (%0),%%xmm0 \n"
2869 "movdqa 0x10(%0),%%xmm1 \n"
2870 "movdqa (%0,%4,1),%%xmm2 \n"
2871 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2872 "lea 0x20(%0),%0 \n"
2873 "pavgb %%xmm2,%%xmm0 \n"
2874 "pavgb %%xmm3,%%xmm1 \n"
2875 "psrlw $0x8,%%xmm0 \n"
2876 "psrlw $0x8,%%xmm1 \n"
2877 "packuswb %%xmm1,%%xmm0 \n"
2878 "movdqa %%xmm0,%%xmm1 \n"
2879 "pand %%xmm5,%%xmm0 \n"
2880 "packuswb %%xmm0,%%xmm0 \n"
2881 "psrlw $0x8,%%xmm1 \n"
2882 "packuswb %%xmm1,%%xmm1 \n"
2883 "movq %%xmm0,(%1) \n"
2884 "movq %%xmm1,(%1,%2) \n"
2885 "lea 0x8(%1),%1 \n"
2886 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002887 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002888 : "+r"(src_yuy2), // %0
2889 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002890 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002891 "+r"(pix) // %3
2892 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2893 : "memory", "cc"
2894#if defined(__SSE2__)
2895 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2896#endif
2897 );
2898}
2899
fbarchard@google.comc704f782012-08-30 19:53:48 +00002900void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2901 uint8* dst_u, uint8* dst_v, int pix) {
2902 asm volatile (
2903 "pcmpeqb %%xmm5,%%xmm5 \n"
2904 "psrlw $0x8,%%xmm5 \n"
2905 "sub %1,%2 \n"
2906 ".p2align 4 \n"
2907 "1: \n"
2908 "movdqa (%0),%%xmm0 \n"
2909 "movdqa 0x10(%0),%%xmm1 \n"
2910 "lea 0x20(%0),%0 \n"
2911 "psrlw $0x8,%%xmm0 \n"
2912 "psrlw $0x8,%%xmm1 \n"
2913 "packuswb %%xmm1,%%xmm0 \n"
2914 "movdqa %%xmm0,%%xmm1 \n"
2915 "pand %%xmm5,%%xmm0 \n"
2916 "packuswb %%xmm0,%%xmm0 \n"
2917 "psrlw $0x8,%%xmm1 \n"
2918 "packuswb %%xmm1,%%xmm1 \n"
2919 "movq %%xmm0,(%1) \n"
2920 "movq %%xmm1,(%1,%2) \n"
2921 "lea 0x8(%1),%1 \n"
2922 "sub $0x10,%3 \n"
2923 "jg 1b \n"
2924 : "+r"(src_yuy2), // %0
2925 "+r"(dst_u), // %1
2926 "+r"(dst_v), // %2
2927 "+r"(pix) // %3
2928 :
2929 : "memory", "cc"
2930#if defined(__SSE2__)
2931 , "xmm0", "xmm1", "xmm5"
2932#endif
2933 );
2934}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002935
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002936void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2937 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002938 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002939 "pcmpeqb %%xmm5,%%xmm5 \n"
2940 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002941 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002942 "1: \n"
2943 "movdqu (%0),%%xmm0 \n"
2944 "movdqu 0x10(%0),%%xmm1 \n"
2945 "lea 0x20(%0),%0 \n"
2946 "pand %%xmm5,%%xmm0 \n"
2947 "pand %%xmm5,%%xmm1 \n"
2948 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002949 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002950 "movdqu %%xmm0,(%1) \n"
2951 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002952 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002953 : "+r"(src_yuy2), // %0
2954 "+r"(dst_y), // %1
2955 "+r"(pix) // %2
2956 :
2957 : "memory", "cc"
2958#if defined(__SSE2__)
2959 , "xmm0", "xmm1", "xmm5"
2960#endif
2961 );
2962}
2963
2964void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2965 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002966 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002967 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002968 "pcmpeqb %%xmm5,%%xmm5 \n"
2969 "psrlw $0x8,%%xmm5 \n"
2970 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002971 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002972 "1: \n"
2973 "movdqu (%0),%%xmm0 \n"
2974 "movdqu 0x10(%0),%%xmm1 \n"
2975 "movdqu (%0,%4,1),%%xmm2 \n"
2976 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2977 "lea 0x20(%0),%0 \n"
2978 "pavgb %%xmm2,%%xmm0 \n"
2979 "pavgb %%xmm3,%%xmm1 \n"
2980 "psrlw $0x8,%%xmm0 \n"
2981 "psrlw $0x8,%%xmm1 \n"
2982 "packuswb %%xmm1,%%xmm0 \n"
2983 "movdqa %%xmm0,%%xmm1 \n"
2984 "pand %%xmm5,%%xmm0 \n"
2985 "packuswb %%xmm0,%%xmm0 \n"
2986 "psrlw $0x8,%%xmm1 \n"
2987 "packuswb %%xmm1,%%xmm1 \n"
2988 "movq %%xmm0,(%1) \n"
2989 "movq %%xmm1,(%1,%2) \n"
2990 "lea 0x8(%1),%1 \n"
2991 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002992 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002993 : "+r"(src_yuy2), // %0
2994 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002995 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002996 "+r"(pix) // %3
2997 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2998 : "memory", "cc"
2999#if defined(__SSE2__)
3000 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3001#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00003002 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003003}
3004
fbarchard@google.comc704f782012-08-30 19:53:48 +00003005void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
3006 uint8* dst_u, uint8* dst_v, int pix) {
3007 asm volatile (
3008 "pcmpeqb %%xmm5,%%xmm5 \n"
3009 "psrlw $0x8,%%xmm5 \n"
3010 "sub %1,%2 \n"
3011 ".p2align 4 \n"
3012 "1: \n"
3013 "movdqu (%0),%%xmm0 \n"
3014 "movdqu 0x10(%0),%%xmm1 \n"
3015 "lea 0x20(%0),%0 \n"
3016 "psrlw $0x8,%%xmm0 \n"
3017 "psrlw $0x8,%%xmm1 \n"
3018 "packuswb %%xmm1,%%xmm0 \n"
3019 "movdqa %%xmm0,%%xmm1 \n"
3020 "pand %%xmm5,%%xmm0 \n"
3021 "packuswb %%xmm0,%%xmm0 \n"
3022 "psrlw $0x8,%%xmm1 \n"
3023 "packuswb %%xmm1,%%xmm1 \n"
3024 "movq %%xmm0,(%1) \n"
3025 "movq %%xmm1,(%1,%2) \n"
3026 "lea 0x8(%1),%1 \n"
3027 "sub $0x10,%3 \n"
3028 "jg 1b \n"
3029 : "+r"(src_yuy2), // %0
3030 "+r"(dst_u), // %1
3031 "+r"(dst_v), // %2
3032 "+r"(pix) // %3
3033 :
3034 : "memory", "cc"
3035#if defined(__SSE2__)
3036 , "xmm0", "xmm1", "xmm5"
3037#endif
3038 );
3039}
3040
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003041void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003042 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003043 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003044 "1: \n"
3045 "movdqa (%0),%%xmm0 \n"
3046 "movdqa 0x10(%0),%%xmm1 \n"
3047 "lea 0x20(%0),%0 \n"
3048 "psrlw $0x8,%%xmm0 \n"
3049 "psrlw $0x8,%%xmm1 \n"
3050 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003051 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003052 "movdqa %%xmm0,(%1) \n"
3053 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003054 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003055 : "+r"(src_uyvy), // %0
3056 "+r"(dst_y), // %1
3057 "+r"(pix) // %2
3058 :
3059 : "memory", "cc"
3060#if defined(__SSE2__)
3061 , "xmm0", "xmm1"
3062#endif
3063 );
3064}
3065
3066void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003067 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003068 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003069 "pcmpeqb %%xmm5,%%xmm5 \n"
3070 "psrlw $0x8,%%xmm5 \n"
3071 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003072 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003073 "1: \n"
3074 "movdqa (%0),%%xmm0 \n"
3075 "movdqa 0x10(%0),%%xmm1 \n"
3076 "movdqa (%0,%4,1),%%xmm2 \n"
3077 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
3078 "lea 0x20(%0),%0 \n"
3079 "pavgb %%xmm2,%%xmm0 \n"
3080 "pavgb %%xmm3,%%xmm1 \n"
3081 "pand %%xmm5,%%xmm0 \n"
3082 "pand %%xmm5,%%xmm1 \n"
3083 "packuswb %%xmm1,%%xmm0 \n"
3084 "movdqa %%xmm0,%%xmm1 \n"
3085 "pand %%xmm5,%%xmm0 \n"
3086 "packuswb %%xmm0,%%xmm0 \n"
3087 "psrlw $0x8,%%xmm1 \n"
3088 "packuswb %%xmm1,%%xmm1 \n"
3089 "movq %%xmm0,(%1) \n"
3090 "movq %%xmm1,(%1,%2) \n"
3091 "lea 0x8(%1),%1 \n"
3092 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003093 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003094 : "+r"(src_uyvy), // %0
3095 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003096 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003097 "+r"(pix) // %3
3098 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3099 : "memory", "cc"
3100#if defined(__SSE2__)
3101 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3102#endif
3103 );
3104}
3105
fbarchard@google.comc704f782012-08-30 19:53:48 +00003106void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3107 uint8* dst_u, uint8* dst_v, int pix) {
3108 asm volatile (
3109 "pcmpeqb %%xmm5,%%xmm5 \n"
3110 "psrlw $0x8,%%xmm5 \n"
3111 "sub %1,%2 \n"
3112 ".p2align 4 \n"
3113 "1: \n"
3114 "movdqa (%0),%%xmm0 \n"
3115 "movdqa 0x10(%0),%%xmm1 \n"
3116 "lea 0x20(%0),%0 \n"
3117 "pand %%xmm5,%%xmm0 \n"
3118 "pand %%xmm5,%%xmm1 \n"
3119 "packuswb %%xmm1,%%xmm0 \n"
3120 "movdqa %%xmm0,%%xmm1 \n"
3121 "pand %%xmm5,%%xmm0 \n"
3122 "packuswb %%xmm0,%%xmm0 \n"
3123 "psrlw $0x8,%%xmm1 \n"
3124 "packuswb %%xmm1,%%xmm1 \n"
3125 "movq %%xmm0,(%1) \n"
3126 "movq %%xmm1,(%1,%2) \n"
3127 "lea 0x8(%1),%1 \n"
3128 "sub $0x10,%3 \n"
3129 "jg 1b \n"
3130 : "+r"(src_uyvy), // %0
3131 "+r"(dst_u), // %1
3132 "+r"(dst_v), // %2
3133 "+r"(pix) // %3
3134 :
3135 : "memory", "cc"
3136#if defined(__SSE2__)
3137 , "xmm0", "xmm1", "xmm5"
3138#endif
3139 );
3140}
3141
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003142void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3143 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003144 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003145 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003146 "1: \n"
3147 "movdqu (%0),%%xmm0 \n"
3148 "movdqu 0x10(%0),%%xmm1 \n"
3149 "lea 0x20(%0),%0 \n"
3150 "psrlw $0x8,%%xmm0 \n"
3151 "psrlw $0x8,%%xmm1 \n"
3152 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003153 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003154 "movdqu %%xmm0,(%1) \n"
3155 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003156 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003157 : "+r"(src_uyvy), // %0
3158 "+r"(dst_y), // %1
3159 "+r"(pix) // %2
3160 :
3161 : "memory", "cc"
3162#if defined(__SSE2__)
3163 , "xmm0", "xmm1"
3164#endif
3165 );
3166}
3167
3168void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003169 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003170 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003171 "pcmpeqb %%xmm5,%%xmm5 \n"
3172 "psrlw $0x8,%%xmm5 \n"
3173 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003174 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003175 "1: \n"
3176 "movdqu (%0),%%xmm0 \n"
3177 "movdqu 0x10(%0),%%xmm1 \n"
3178 "movdqu (%0,%4,1),%%xmm2 \n"
3179 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3180 "lea 0x20(%0),%0 \n"
3181 "pavgb %%xmm2,%%xmm0 \n"
3182 "pavgb %%xmm3,%%xmm1 \n"
3183 "pand %%xmm5,%%xmm0 \n"
3184 "pand %%xmm5,%%xmm1 \n"
3185 "packuswb %%xmm1,%%xmm0 \n"
3186 "movdqa %%xmm0,%%xmm1 \n"
3187 "pand %%xmm5,%%xmm0 \n"
3188 "packuswb %%xmm0,%%xmm0 \n"
3189 "psrlw $0x8,%%xmm1 \n"
3190 "packuswb %%xmm1,%%xmm1 \n"
3191 "movq %%xmm0,(%1) \n"
3192 "movq %%xmm1,(%1,%2) \n"
3193 "lea 0x8(%1),%1 \n"
3194 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003195 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003196 : "+r"(src_uyvy), // %0
3197 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003198 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003199 "+r"(pix) // %3
3200 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3201 : "memory", "cc"
3202#if defined(__SSE2__)
3203 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3204#endif
3205 );
3206}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003207
3208void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3209 uint8* dst_u, uint8* dst_v, int pix) {
3210 asm volatile (
3211 "pcmpeqb %%xmm5,%%xmm5 \n"
3212 "psrlw $0x8,%%xmm5 \n"
3213 "sub %1,%2 \n"
3214 ".p2align 4 \n"
3215 "1: \n"
3216 "movdqu (%0),%%xmm0 \n"
3217 "movdqu 0x10(%0),%%xmm1 \n"
3218 "lea 0x20(%0),%0 \n"
3219 "pand %%xmm5,%%xmm0 \n"
3220 "pand %%xmm5,%%xmm1 \n"
3221 "packuswb %%xmm1,%%xmm0 \n"
3222 "movdqa %%xmm0,%%xmm1 \n"
3223 "pand %%xmm5,%%xmm0 \n"
3224 "packuswb %%xmm0,%%xmm0 \n"
3225 "psrlw $0x8,%%xmm1 \n"
3226 "packuswb %%xmm1,%%xmm1 \n"
3227 "movq %%xmm0,(%1) \n"
3228 "movq %%xmm1,(%1,%2) \n"
3229 "lea 0x8(%1),%1 \n"
3230 "sub $0x10,%3 \n"
3231 "jg 1b \n"
3232 : "+r"(src_uyvy), // %0
3233 "+r"(dst_u), // %1
3234 "+r"(dst_v), // %2
3235 "+r"(pix) // %3
3236 :
3237 : "memory", "cc"
3238#if defined(__SSE2__)
3239 , "xmm0", "xmm1", "xmm5"
3240#endif
3241 );
3242}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003243#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003244
#ifdef HAS_ARGBBLENDROW_SSE2
// Blends src_argb0 over src_argb1 and writes the result to dst_argb.
// The wide loop handles 4 pixels per iteration; leading and trailing pixels
// are handled one at a time.
//
// With a0 = alpha byte of the src_argb0 pixel, every color channel becomes
//   dst = saturate(src0 + ((src1 * (256 - a0)) >> 8))
// and the destination alpha byte is forced to 255.
//
// Control flow:
//   10: single pixels until dst_argb is 16-byte aligned (or width runs out).
//   41: 4 pixels per iteration; sources are read with movdqu, the
//       destination is written with movdqa (aligned by the loop above).
//   91: single pixels for the remainder.
// width <= 0 writes nothing.
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"  // xmm7 = 0x0001 in every word
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"  // xmm6 = 0x00ff in every word
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"  // xmm5 = 0xff00 in every word
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"  // xmm4 = 0xff000000 in every dword (alpha)
    "sub $0x1,%3 \n"
    "je 91f \n"  // exactly one pixel
    "jl 99f \n"  // nothing to do

    // 1 pixel loop until destination pointer is aligned.
  "10: \n"
    "test $0xf,%2 \n"
    "je 19f \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"  // xmm0 = src0 pixel
    "pxor %%xmm4,%%xmm3 \n"  // alpha byte -> 255 - a0
    "movd (%1),%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"  // move the high byte of each word down
    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"  // copy the alpha word over the
    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"  // other word of the same pixel
    "pand %%xmm6,%%xmm2 \n"  // src1 low byte of each word (B, R)
    "paddw %%xmm7,%%xmm3 \n"  // 256 - a0
    "pmullw %%xmm3,%%xmm2 \n"
    "movd (%1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"  // src1 high byte of each word (G, A)
    "por %%xmm4,%%xmm0 \n"  // force result alpha to 255
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"  // scaled B, R back in the low bytes
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"  // scaled G, A stay in the high bytes
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 10b \n"

  "19: \n"
    "add $1-4,%3 \n"  // bias the counter for the 4 pixel loop
    "jl 49f \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "41: \n"
    "movdqu (%0),%%xmm3 \n"
    "lea 0x10(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu (%1),%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqu (%1),%%xmm1 \n"
    "lea 0x10(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%2) \n"  // aligned store
    "lea 0x10(%2),%2 \n"
    "jge 41b \n"

  "49: \n"
    "add $0x3,%3 \n"  // undo the bias; remaining pixels - 1
    "jl 99f \n"

    // 1 pixel loop.
  "91: \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd (%1),%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd (%1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 91b \n"
  "99: \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003366
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha: every 16-bit lane of a pixel receives
// that pixel's alpha byte (source byte 3, 7, 11 or 15) in its low byte and
// zero (index 0x80) in its high byte.
CONST uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blends src_argb0 over src_argb1 and writes the result to dst_argb.
// The wide loops handle 4 pixels per iteration; leading and trailing pixels
// are handled one at a time.
//
// With a0 = alpha byte of the src_argb0 pixel, every color channel becomes
//   dst = saturate(src0 + ((src1 * (256 - a0)) >> 8))
// and the destination alpha byte is forced to 255.
//
// Same as SSE2, but replaces
//    psrlw      xmm3, 8          // alpha
//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
//    pshuflw    xmm3, xmm3,0F5h
// with..
//    pshufb     xmm3, kShuffleAlpha // alpha
//
// Control flow:
//   10: single pixels until dst_argb is 16-byte aligned (or width runs out).
//   40: 4 pixels per iteration, used when both sources are 16-byte aligned
//       at that point (movdqa loads).
//   41: 4 pixels per iteration with movdqu loads otherwise.
//   91: single pixels for the remainder.
// Both wide loops store with movdqa. width <= 0 writes nothing.
// NOTE(review): pshufb takes kShuffleAlpha as a memory operand, which needs
// 16-byte alignment; presumably uvec8 guarantees that - confirm.
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"  // xmm7 = 0x0001 in every word
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"  // xmm6 = 0x00ff in every word
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"  // xmm5 = 0xff00 in every word
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"  // xmm4 = 0xff000000 in every dword (alpha)
    "sub $0x1,%3 \n"
    "je 91f \n"  // exactly one pixel
    "jl 99f \n"  // nothing to do

    // 1 pixel loop until destination pointer is aligned.
  "10: \n"
    "test $0xf,%2 \n"
    "je 19f \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"  // xmm0 = src0 pixel
    "pxor %%xmm4,%%xmm3 \n"  // alpha byte -> 255 - a0
    "movd (%1),%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"  // (255 - a0) in every word of the pixel
    "pand %%xmm6,%%xmm2 \n"  // src1 low byte of each word (B, R)
    "paddw %%xmm7,%%xmm3 \n"  // 256 - a0
    "pmullw %%xmm3,%%xmm2 \n"
    "movd (%1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"  // src1 high byte of each word (G, A)
    "por %%xmm4,%%xmm0 \n"  // force result alpha to 255
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"  // scaled B, R back in the low bytes
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"  // scaled G, A stay in the high bytes
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 10b \n"

  "19: \n"
    "add $1-4,%3 \n"  // bias the counter for the 4 pixel loops
    "jl 49f \n"
    "test $0xf,%0 \n"  // either source unaligned -> movdqu loop
    "jne 41f \n"
    "test $0xf,%1 \n"
    "jne 41f \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "40: \n"
    "movdqa (%0),%%xmm3 \n"
    "lea 0x10(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqa (%1),%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqa (%1),%%xmm1 \n"
    "lea 0x10(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jge 40b \n"
    "jmp 49f \n"

    // 4 pixel unaligned loop.
    ".p2align 2 \n"
  "41: \n"
    "movdqu (%0),%%xmm3 \n"
    "lea 0x10(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu (%1),%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqu (%1),%%xmm1 \n"
    "lea 0x10(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%2) \n"  // destination is aligned even here
    "lea 0x10(%2),%2 \n"
    "jge 41b \n"

  "49: \n"
    "add $0x3,%3 \n"  // undo the bias; remaining pixels - 1
    "jl 99f \n"

    // 1 pixel loop.
  "91: \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd (%1),%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd (%1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 91b \n"
  "99: \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW_SSSE3
3528
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time: scales the color channels of each ARGB pixel
// by that pixel's alpha and keeps the alpha byte unchanged.
//
// Per color byte c with pixel alpha a:
//   dst = ((c * 0x0101) * (a * 0x0101)) >> 24
//
// src_argb and dst_argb must both be 16-byte aligned (movdqa is used for
// every load and store). The loop body always runs at least once and
// consumes 4 pixels per iteration, so width is effectively rounded up to a
// multiple of 4.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"  // dst becomes an offset from src
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"  // xmm4 = 0xff000000 (alpha mask)
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrld $0x8,%%xmm5 \n"  // xmm5 = 0x00ffffff (color mask)

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"  // pixels 0-1: each byte doubled to a word
    "pshufhw $0xff,%%xmm0,%%xmm2 \n"  // broadcast each pixel's alpha word
    "pshuflw $0xff,%%xmm2,%%xmm2 \n"  // across that pixel's 4 lanes
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqa (%0),%%xmm1 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"  // pixels 2-3, same treatment
    "pshufhw $0xff,%%xmm1,%%xmm2 \n"
    "pshuflw $0xff,%%xmm2,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "movdqa (%0),%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "pand %%xmm4,%%xmm2 \n"  // original alpha bytes
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "pand %%xmm5,%%xmm0 \n"  // drop the scaled alpha ...
    "por %%xmm2,%%xmm0 \n"  // ... and restore the original one
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"  // store to dst (src + offset)
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003575
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle tables duplicating alpha. For each pixel the three color lanes
// receive the alpha byte in both halves of the word (alpha * 0x0101) and the
// alpha lane receives zero (index 128). kShuffleAlpha0 covers pixels 0-1
// (alpha at bytes 3 and 7), kShuffleAlpha1 covers pixels 2-3 (bytes 11, 15).
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuate 4 pixels at a time: scales the color channels of each ARGB pixel
// by that pixel's alpha and keeps the alpha byte unchanged.
//
// Per color byte c with pixel alpha a:
//   dst = ((c * 0x0101) * (a * 0x0101)) >> 24
//
// src_argb and dst_argb must both be 16-byte aligned (movdqa is used for
// every load and store). The loop body always runs at least once and
// consumes 4 pixels per iteration, so width is effectively rounded up to a
// multiple of 4.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"  // dst becomes an offset from src
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "pslld $0x18,%%xmm3 \n"  // xmm3 = 0xff000000 (alpha mask)
    "movdqa %3,%%xmm4 \n"  // xmm4 = kShuffleAlpha0
    "movdqa %4,%%xmm5 \n"  // xmm5 = kShuffleAlpha1

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm4,%%xmm0 \n"  // alpha words for pixels 0-1
    "movdqa (%0),%%xmm1 \n"
    "punpcklbw %%xmm1,%%xmm1 \n"  // pixels 0-1: each byte doubled to a word
    "pmulhuw %%xmm1,%%xmm0 \n"
    "movdqa (%0),%%xmm1 \n"
    "pshufb %%xmm5,%%xmm1 \n"  // alpha words for pixels 2-3
    "movdqa (%0),%%xmm2 \n"
    "punpckhbw %%xmm2,%%xmm2 \n"  // pixels 2-3: each byte doubled to a word
    "pmulhuw %%xmm2,%%xmm1 \n"
    "movdqa (%0),%%xmm2 \n"
    "pand %%xmm3,%%xmm2 \n"  // original alpha bytes
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"  // alpha lanes are zero after the multiply
    "por %%xmm2,%%xmm0 \n"  // merge the original alpha back in
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"  // store to dst (src + offset)
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003630
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
//
// For every pixel, the alpha byte indexes fixed_invtbl8 (4-byte entries).
// The low 16 bits of the entry multiply the three color lanes and the high
// 16 bits multiply the alpha lane:
//   dst = saturate_u8(((c * 0x0101) * entry16) >> 16)
// NOTE(review): fixed_invtbl8 is defined elsewhere in the file; presumably
// it holds fixed-point reciprocals of alpha - confirm against its definition.
//
// src_argb and dst_argb must both be 16-byte aligned (movdqa loads/stores).
// The loop body always runs at least once and consumes 4 pixels per
// iteration, so width is effectively rounded up to a multiple of 4.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;  // scratch register for the table index
  asm volatile (
    "sub %0,%1 \n"  // dst becomes an offset from src

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movzb 0x3(%0),%3 \n"  // alpha of pixel 0
    "punpcklbw %%xmm0,%%xmm0 \n"  // pixels 0-1: each byte doubled to a word
    "movd 0x0(%4,%3,4),%%xmm2 \n"  // table entry for pixel 0
    "movzb 0x7(%0),%3 \n"  // alpha of pixel 1
    "movd 0x0(%4,%3,4),%%xmm3 \n"  // table entry for pixel 1
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"  // lanes: lo16, lo16, lo16, hi16
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"  // pixel 1 multipliers into the high half
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqa (%0),%%xmm1 \n"
    "movzb 0xb(%0),%3 \n"  // alpha of pixel 2
    "punpckhbw %%xmm1,%%xmm1 \n"  // pixels 2-3: each byte doubled to a word
    "movd 0x0(%4,%3,4),%%xmm2 \n"
    "movzb 0xf(%0),%3 \n"  // alpha of pixel 3
    "movd 0x0(%4,%3,4),%%xmm3 \n"
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"  // saturate the products to bytes
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"  // store to dst (src + offset)
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width),  // %2
    "+r"(alpha)  // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003680
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale.  0.11 * B + 0.59 * G + 0.30 * R
// The per-pixel weights (14, 76, 38, 0) sum to 128, matching the >> 7 below.
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
//
// Per pixel: gray = (14 * B + 76 * G + 38 * R) >> 7, written to the three
// color bytes; the alpha byte is copied from the source.
//
// src_argb and dst_argb must both be 16-byte aligned (movdqa loads/stores).
// The loop body always runs at least once and consumes 8 pixels per
// iteration, so width is effectively rounded up to a multiple of 8.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"  // xmm4 = gray weights
    "sub %0,%1 \n"  // dst becomes an offset from src

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"  // per pixel: B*14 + G*76, R*38 + A*0
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm0 \n"  // one weighted sum per pixel (8 words)
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"  // 8 gray bytes
    "movdqa (%0),%%xmm2 \n"
    "movdqa 0x10(%0),%%xmm3 \n"
    "psrld $0x18,%%xmm2 \n"  // isolate alpha in each dword
    "psrld $0x18,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm2 \n"  // 8 alpha bytes
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"  // words of (gray, gray)
    "punpcklbw %%xmm2,%%xmm3 \n"  // words of (gray, alpha)
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"  // pixels 0-3: gray, gray, gray, alpha
    "punpckhwd %%xmm3,%%xmm1 \n"  // pixels 4-7
    "sub $0x8,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "movdqa %%xmm1,0x10(%0,%1,1) \n"
    "lea 0x20(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  : "m"(kARGBToGray)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003731
#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
// Each table holds the weights in memory order (B, G, R, A) for one output
// channel, repeated for 4 pixels.
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
//
// Works in place on dst_argb. Each output color is the weighted sum above,
// clamped to 255 by packuswb; the alpha byte is preserved.
//
// dst_argb must be 16-byte aligned (movdqa loads/stores). The loop body
// always runs at least once and consumes 8 pixels per iteration, so width
// is effectively rounded up to a multiple of 8.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %2,%%xmm2 \n"  // xmm2 = weights for new B
    "movdqa %3,%%xmm3 \n"  // xmm3 = weights for new G
    "movdqa %4,%%xmm4 \n"  // xmm4 = weights for new R

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // New B for 8 pixels -> low 8 bytes of xmm0.
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm6 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm6 \n"
    "phaddw %%xmm6,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    // New G for 8 pixels, interleaved with B -> xmm0 = B G B G ...
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm5 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    // New R for 8 pixels -> low 8 bytes of xmm5.
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm5 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    // Original alpha for 8 pixels, interleaved with R -> xmm5 = R A R A ...
    "movdqa (%0),%%xmm6 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "psrld $0x18,%%xmm6 \n"
    "psrld $0x18,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm5 \n"
    // Weave BG and RA words back into BGRA pixels and store in place.
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "punpckhwd %%xmm5,%%xmm1 \n"
    "sub $0x8,%1 \n"
    "movdqa %%xmm0,(%0) \n"
    "movdqa %%xmm1,0x10(%0) \n"
    "lea 0x20(%0),%0 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)  // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
3808
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
//
// Works in place on dst_argb. matrix_argb supplies three rows of four signed
// bytes each, read at offsets 0, 4 and 8; only these 12 bytes are read. Row 0
// produces the new byte 0 (B), row 1 byte 1 (G) and row 2 byte 2 (R); within
// a row the weights apply to the source bytes in memory order (B, G, R, A).
// Each sum is shifted right arithmetically by 7 and clamped to 0..255 by
// packuswb. The alpha byte is preserved.
//
// dst_argb must be 16-byte aligned (movdqa loads/stores). The loop body
// always runs at least once and consumes 8 pixels per iteration, so width
// is effectively rounded up to a multiple of 8.
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  asm volatile (
    "movd (%2),%%xmm2 \n"  // row 0
    "movd 0x4(%2),%%xmm3 \n"  // row 1
    "movd 0x8(%2),%%xmm4 \n"  // row 2
    "pshufd $0x0,%%xmm2,%%xmm2 \n"  // broadcast each row to all 4 dwords
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm6 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"  // row 0 partial sums
    "pmaddubsw %%xmm2,%%xmm6 \n"
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm5 \n"  // row 1 partial sums
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddsw %%xmm6,%%xmm0 \n"  // saturating sum per pixel, row 0
    "phaddsw %%xmm1,%%xmm5 \n"  // saturating sum per pixel, row 1
    "psraw $0x7,%%xmm0 \n"  // arithmetic shift: sums may be negative
    "psraw $0x7,%%xmm5 \n"
    "packuswb %%xmm0,%%xmm0 \n"  // clamp to 0..255
    "packuswb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"  // xmm0 = B G B G ...
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm5 \n"  // row 2 partial sums
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddsw %%xmm1,%%xmm5 \n"
    "psraw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "movdqa (%0),%%xmm6 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "psrld $0x18,%%xmm6 \n"  // isolate the original alpha
    "psrld $0x18,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm6,%%xmm5 \n"  // xmm5 = R A R A ...
    "punpcklwd %%xmm5,%%xmm0 \n"  // pixels 0-3
    "punpckhwd %%xmm5,%%xmm1 \n"  // pixels 4-7
    "sub $0x8,%1 \n"
    "movdqa %%xmm0,(%0) \n"
    "movdqa %%xmm1,0x10(%0) \n"
    "lea 0x20(%0),%0 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)  // %1
  : "r"(matrix_argb)  // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
3872
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
//
// Works in place on dst_argb. For each color byte c:
//   dst = saturate_u8(((c * scale) >> 16) * interval_size + interval_offset)
// using the low 16 bits of scale, interval_size and interval_offset.
// The alpha lane is computed from the upper 16 bits of the three parameters
// instead (zero when they fit in 16 bits) and the original alpha byte is
// then OR-ed back in.
//
// dst_argb must be 16-byte aligned (movdqa loads/stores). The loop body
// always runs at least once and consumes 4 pixels per iteration, so width
// is effectively rounded up to a multiple of 4.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "movd %2,%%xmm2 \n"
    "movd %3,%%xmm3 \n"
    "movd %4,%%xmm4 \n"
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"  // lanes: lo16, lo16, lo16, hi16
    "pshufd $0x44,%%xmm2,%%xmm2 \n"  // repeat for the second pixel
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "pshufd $0x44,%%xmm3,%%xmm3 \n"
    "pshuflw $0x40,%%xmm4,%%xmm4 \n"
    "pshufd $0x44,%%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"  // xmm5 = 0 for byte -> word widening
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "pslld $0x18,%%xmm6 \n"  // xmm6 = 0xff000000 (alpha mask)

    // 4 pixel loop.
    ".p2align 2 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"  // pixels 0-1 as words
    "pmulhuw %%xmm2,%%xmm0 \n"  // (c * scale) >> 16
    "movdqa (%0),%%xmm1 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"  // pixels 2-3 as words
    "pmulhuw %%xmm2,%%xmm1 \n"
    "pmullw %%xmm3,%%xmm0 \n"  // * interval_size
    "movdqa (%0),%%xmm7 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "pand %%xmm6,%%xmm7 \n"  // original alpha bytes
    "paddw %%xmm4,%%xmm0 \n"  // + interval_offset
    "paddw %%xmm4,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"  // merge the original alpha back in
    "sub $0x4,%1 \n"
    "movdqa %%xmm0,(%0) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)  // %1
  : "r"(scale),  // %2
    "r"(interval_size),  // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
3925
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
//
// value packs one multiplier byte per channel, in the same byte order as a
// pixel in memory. For each byte c of a pixel and matching byte v of value:
//   dst = ((c * 0x0101) * (v * 0x0101)) >> 24
// All four channels, alpha included, are scaled.
//
// src_argb and dst_argb must both be 16-byte aligned (movdqa loads/stores).
// The loop body always runs at least once and consumes 4 pixels per
// iteration, so width is effectively rounded up to a multiple of 4.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd %3,%%xmm2 \n"
    "sub %0,%1 \n"  // dst becomes an offset from src
    "punpcklbw %%xmm2,%%xmm2 \n"  // each value byte doubled to a word
    "punpcklqdq %%xmm2,%%xmm2 \n"  // repeat for the second pixel

    // 4 pixel loop.
    ".p2align 2 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"  // pixels 0-1: each byte doubled to a word
    "punpckhbw %%xmm1,%%xmm1 \n"  // pixels 2-3
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"  // store to dst (src + offset)
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  : "r"(value)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2
3964
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
//
// For each byte (all four channels):
//   dst = saturate_u8(((src0 * 0x0101) * src1) >> 16)
//
// src_argb0, src_argb1 and dst_argb must all be 16-byte aligned (movdqa
// loads/stores). The loop body always runs at least once and consumes
// 4 pixels per iteration, so width is effectively rounded up to a multiple
// of 4.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"  // xmm5 = 0 for byte -> word widening
    "sub %0,%1 \n"  // src_argb1 becomes an offset from src_argb0
    "sub %0,%2 \n"  // dst_argb becomes an offset from src_argb0

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"  // src0 pixels 0-1: bytes doubled to words
    "punpckhbw %%xmm1,%%xmm1 \n"  // src0 pixels 2-3
    "punpcklbw %%xmm5,%%xmm2 \n"  // src1 pixels 0-1: zero-extended
    "punpckhbw %%xmm5,%%xmm3 \n"  // src1 pixels 2-3
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
4005
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
//
// Every byte (all four channels) is added with unsigned saturation:
//   dst = min(src0 + src1, 255)
//
// src_argb0, src_argb1 and dst_argb must all be 16-byte aligned (movdqa
// loads/stores). The loop body always runs at least once and consumes
// 4 pixels per iteration, so width is effectively rounded up to a multiple
// of 4.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"  // src_argb1 becomes an offset from src_argb0
    "sub %0,%2 \n"  // dst_argb becomes an offset from src_argb0

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1),%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBADDROW_SSE2
4037
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
//
// Every byte (all four channels) is subtracted with unsigned saturation:
//   dst = max(src0 - src1, 0)
//
// src_argb0, src_argb1 and dst_argb must all be 16-byte aligned (movdqa
// loads/stores). The loop body always runs at least once and consumes
// 4 pixels per iteration, so width is effectively rounded up to a multiple
// of 4.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"  // src_argb1 becomes an offset from src_argb0
    "sub %0,%2 \n"  // dst_argb becomes an offset from src_argb0

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1),%%xmm1 \n"
    "psubusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
4069
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004070#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4071// Creates a table of cumulative sums where each value is a sum of all values
4072// above and to the left of the value, inclusive of the value.
// Builds one row of a cumulative-sum table.
// Each 4-byte pixel of row is widened to four int32 lanes and added to a
// running sum kept in xmm0; the running sum plus the matching 16 bytes of
// previous_cumsum is then written to cumsum. One input pixel therefore
// produces four int32 outputs, and width counts pixels.
// The 4-pixel path is taken only when width >= 4 and cumsum is 16-byte
// aligned; everything else goes through the unaligned 1-pixel loop.
// NOTE(review): the 4-pixel path also reads previous_cumsum with movdqa but
// only cumsum is alignment-tested - presumably both rows share alignment;
// confirm with callers.
4073 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
fbarchard@google.com133adc42012-06-12 05:46:49 +00004074 const int32* previous_cumsum, int width) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004075 asm volatile (
    // previous_cumsum becomes a byte offset from cumsum.
4076 "sub %1,%2 \n"
    // xmm0 = running sum (starts at 0), xmm1 = zero used for unpacking.
4077 "pxor %%xmm0,%%xmm0 \n"
4078 "pxor %%xmm1,%%xmm1 \n"
4079 "sub $0x4,%3 \n"
4080 "jl 49f \n"
4081 "test $0xf,%1 \n"
4082 "jne 49f \n"
4083
4084 // 4 pixel loop.
4085 ".p2align 2 \n"
4086 "40: \n"
    // Load 4 pixels and widen them to int32: xmm2..xmm5 = pixels 0..3.
4087 "movdqu (%0),%%xmm2 \n"
4088 "lea 0x10(%0),%0 \n"
4089 "movdqa %%xmm2,%%xmm4 \n"
4090 "punpcklbw %%xmm1,%%xmm2 \n"
4091 "movdqa %%xmm2,%%xmm3 \n"
4092 "punpcklwd %%xmm1,%%xmm2 \n"
4093 "punpckhwd %%xmm1,%%xmm3 \n"
4094 "punpckhbw %%xmm1,%%xmm4 \n"
4095 "movdqa %%xmm4,%%xmm5 \n"
4096 "punpcklwd %%xmm1,%%xmm4 \n"
4097 "punpckhwd %%xmm1,%%xmm5 \n"
    // For each pixel in turn: add it to the running sum, then form
    // running sum + previous_cumsum entry in the pixel's own register.
4098 "paddd %%xmm2,%%xmm0 \n"
4099 "movdqa (%1,%2,1),%%xmm2 \n"
4100 "paddd %%xmm0,%%xmm2 \n"
4101 "paddd %%xmm3,%%xmm0 \n"
4102 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
4103 "paddd %%xmm0,%%xmm3 \n"
4104 "paddd %%xmm4,%%xmm0 \n"
4105 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
4106 "paddd %%xmm0,%%xmm4 \n"
4107 "paddd %%xmm5,%%xmm0 \n"
4108 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
4109 "paddd %%xmm0,%%xmm5 \n"
4110 "movdqa %%xmm2,(%1) \n"
4111 "movdqa %%xmm3,0x10(%1) \n"
4112 "movdqa %%xmm4,0x20(%1) \n"
4113 "movdqa %%xmm5,0x30(%1) \n"
4114 "lea 0x40(%1),%1 \n"
4115 "sub $0x4,%3 \n"
4116 "jge 40b \n"
4117
4118 "49: \n"
    // width was biased by -4 above; +3 leaves (remaining pixels - 1).
4119 "add $0x3,%3 \n"
4120 "jl 19f \n"
4121
4122 // 1 pixel loop.
4123 ".p2align 2 \n"
4124 "10: \n"
4125 "movd (%0),%%xmm2 \n"
4126 "lea 0x4(%0),%0 \n"
fbarchard@google.comf38aefe2012-06-12 00:29:29 +00004127 "punpcklbw %%xmm1,%%xmm2 \n"
4128 "punpcklwd %%xmm1,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004129 "paddd %%xmm2,%%xmm0 \n"
4130 "movdqu (%1,%2,1),%%xmm2 \n"
4131 "paddd %%xmm0,%%xmm2 \n"
4132 "movdqu %%xmm2,(%1) \n"
4133 "lea 0x10(%1),%1 \n"
4134 "sub $0x1,%3 \n"
4135 "jge 10b \n"
4136
4137 "19: \n"
4138 : "+r"(row), // %0
4139 "+r"(cumsum), // %1
4140 "+r"(previous_cumsum), // %2
4141 "+r"(width) // %3
4142 :
4143 : "memory", "cc"
4144#if defined(__SSE2__)
4145 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4146#endif
4147 );
4148}
4149#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
4150
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00004151#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Converts cumulative sums into count averaged output pixels in dst.
// For every int32 lane i the code evaluates
//   topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// (width is applied as an int32 element offset through the (%0,%4,4) and
// (%1,%4,4) operands), multiplies by a reciprocal of area, converts back to
// integer and saturates down to bytes. Four lanes make one 4-byte output
// pixel.
// The reciprocal comes from rcpss, which is an approximation, so results are
// not an exact integer division by area.
// NOTE(review): the table reads use movdqa/psubd/paddd memory operands, so
// topleft, botleft and the width offset presumably all preserve 16-byte
// alignment - confirm with callers. dst is written unaligned.
4152 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4153 int width, int area, uint8* dst,
4154 int count) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004155 asm volatile (
    // xmm4 = approximate 1.0f / area in all four lanes.
4156 "movd %5,%%xmm4 \n"
4157 "cvtdq2ps %%xmm4,%%xmm4 \n"
4158 "rcpss %%xmm4,%%xmm4 \n"
4159 "pshufd $0x0,%%xmm4,%%xmm4 \n"
4160 "sub $0x4,%3 \n"
4161 "jl 49f \n"
4162
4163 // 4 pixel loop.
4164 ".p2align 2 \n"
4165 "40: \n"
4166 "movdqa (%0),%%xmm0 \n"
4167 "movdqa 0x10(%0),%%xmm1 \n"
4168 "movdqa 0x20(%0),%%xmm2 \n"
4169 "movdqa 0x30(%0),%%xmm3 \n"
4170 "psubd (%0,%4,4),%%xmm0 \n"
4171 "psubd 0x10(%0,%4,4),%%xmm1 \n"
4172 "psubd 0x20(%0,%4,4),%%xmm2 \n"
4173 "psubd 0x30(%0,%4,4),%%xmm3 \n"
4174 "lea 0x40(%0),%0 \n"
4175 "psubd (%1),%%xmm0 \n"
4176 "psubd 0x10(%1),%%xmm1 \n"
4177 "psubd 0x20(%1),%%xmm2 \n"
4178 "psubd 0x30(%1),%%xmm3 \n"
4179 "paddd (%1,%4,4),%%xmm0 \n"
4180 "paddd 0x10(%1,%4,4),%%xmm1 \n"
4181 "paddd 0x20(%1,%4,4),%%xmm2 \n"
4182 "paddd 0x30(%1,%4,4),%%xmm3 \n"
4183 "lea 0x40(%1),%1 \n"
    // Scale the four box sums by 1/area in float, then narrow
    // int32 -> int16 (signed saturate) -> uint8 (unsigned saturate).
4184 "cvtdq2ps %%xmm0,%%xmm0 \n"
4185 "cvtdq2ps %%xmm1,%%xmm1 \n"
4186 "mulps %%xmm4,%%xmm0 \n"
4187 "mulps %%xmm4,%%xmm1 \n"
4188 "cvtdq2ps %%xmm2,%%xmm2 \n"
4189 "cvtdq2ps %%xmm3,%%xmm3 \n"
4190 "mulps %%xmm4,%%xmm2 \n"
4191 "mulps %%xmm4,%%xmm3 \n"
4192 "cvtps2dq %%xmm0,%%xmm0 \n"
4193 "cvtps2dq %%xmm1,%%xmm1 \n"
4194 "cvtps2dq %%xmm2,%%xmm2 \n"
4195 "cvtps2dq %%xmm3,%%xmm3 \n"
4196 "packssdw %%xmm1,%%xmm0 \n"
4197 "packssdw %%xmm3,%%xmm2 \n"
4198 "packuswb %%xmm2,%%xmm0 \n"
4199 "movdqu %%xmm0,(%2) \n"
4200 "lea 0x10(%2),%2 \n"
4201 "sub $0x4,%3 \n"
4202 "jge 40b \n"
4203
4204 "49: \n"
    // count was biased by -4 above; +3 leaves (remaining pixels - 1).
4205 "add $0x3,%3 \n"
4206 "jl 19f \n"
4207
4208 // 1 pixel loop.
4209 ".p2align 2 \n"
4210 "10: \n"
4211 "movdqa (%0),%%xmm0 \n"
4212 "psubd (%0,%4,4),%%xmm0 \n"
4213 "lea 0x10(%0),%0 \n"
4214 "psubd (%1),%%xmm0 \n"
4215 "paddd (%1,%4,4),%%xmm0 \n"
4216 "lea 0x10(%1),%1 \n"
4217 "cvtdq2ps %%xmm0,%%xmm0 \n"
4218 "mulps %%xmm4,%%xmm0 \n"
4219 "cvtps2dq %%xmm0,%%xmm0 \n"
4220 "packssdw %%xmm0,%%xmm0 \n"
4221 "packuswb %%xmm0,%%xmm0 \n"
4222 "movd %%xmm0,(%2) \n"
4223 "lea 0x4(%2),%2 \n"
4224 "sub $0x1,%3 \n"
4225 "jge 10b \n"
4226 "19: \n"
4227 : "+r"(topleft), // %0
4228 "+r"(botleft), // %1
4229 "+r"(dst), // %2
4230 "+rm"(count) // %3
4231 : "r"(static_cast<intptr_t>(width)), // %4
4232 "rm"(area) // %5
4233 : "memory", "cc"
4234#if defined(__SSE2__)
4235 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
4236#endif
4237 );
4238}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00004239#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004240
fbarchard@google.com73444402012-08-09 17:33:29 +00004241#ifdef HAS_ARGBAFFINEROW_SSE2
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004242// TODO(fbarchard): Find 64 bit way to avoid masking.
fbarchard@google.com73444402012-08-09 17:33:29 +00004243// Copy ARGB pixels from source image with slope to a row of destination.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004244// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00004245// an error if movq is used. movd %%xmm0,%1
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004246
// Writes width 4-byte pixels to dst_argb, each copied from src_argb at a
// position that advances by a constant step per output pixel. Positions are
// truncated to integers (cvttps2dq); no filtering is performed.
// src_dudv supplies four floats: [0],[1] are the starting coordinates and
// [2],[3] the per-pixel increments. Lane 0 is multiplied by 4 (bytes per
// pixel) and lane 1 by src_argb_stride, so they act as x and y.
// NOTE(review): coordinates are narrowed with packssdw and the stride is
// placed in a 16-bit lane for pmaddwd, so both presumably have to fit in
// int16. On x86_64 the offset taken from the low half of the 64-bit movd is
// additionally masked with 0x0fffffff. Confirm these limits with callers.
fbarchard@google.comfc7314e2012-09-27 02:17:51 +00004247LIBYUV_API
fbarchard@google.com73444402012-08-09 17:33:29 +00004248void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00004249 uint8* dst_argb, const float* src_dudv, int width) {
fbarchard@google.com73444402012-08-09 17:33:29 +00004250 intptr_t src_argb_stride_temp = src_argb_stride;
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004251 intptr_t temp = 0;
fbarchard@google.com73444402012-08-09 17:33:29 +00004252 asm volatile (
    // xmm2 = starting (x, y); xmm7 = per-pixel (dx, dy).
4253 "movq (%3),%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004254 "movq 0x8(%3),%%xmm7 \n"
    // Build the pmaddwd multiplier in xmm5: low word 4, high word stride,
    // so that (x, y) word pairs become x * 4 + y * stride byte offsets.
fbarchard@google.com73444402012-08-09 17:33:29 +00004255 "shl $0x10,%1 \n"
4256 "add $0x4,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004257 "movd %1,%%xmm5 \n"
4258 "sub $0x4,%4 \n"
4259 "jl 49f \n"
4260
    // 4-pixel setup: xmm2 = positions of pixels 0 and 1, xmm3 = positions of
    // pixels 2 and 3, xmm4 = 4 * (dx, dy), xmm5 = multiplier in every lane.
4261 "pshufd $0x44,%%xmm7,%%xmm7 \n"
4262 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004263 "movdqa %%xmm2,%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004264 "addps %%xmm7,%%xmm0 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004265 "movlhps %%xmm0,%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004266 "movdqa %%xmm7,%%xmm4 \n"
4267 "addps %%xmm4,%%xmm4 \n"
4268 "movdqa %%xmm2,%%xmm3 \n"
4269 "addps %%xmm4,%%xmm3 \n"
4270 "addps %%xmm4,%%xmm4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004271
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004272 // 4 pixel loop.
4273 ".p2align 4 \n"
4274 "40: \n"
    // Truncate the four positions and turn them into four byte offsets.
4275 "cvttps2dq %%xmm2,%%xmm0 \n"
4276 "cvttps2dq %%xmm3,%%xmm1 \n"
4277 "packssdw %%xmm1,%%xmm0 \n"
4278 "pmaddwd %%xmm5,%%xmm0 \n"
    // Extract offsets 0 and 1 into %1 and %5. The 64-bit variant pulls both
    // out with a single 64-bit move and splits them with and/shr.
4279#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004280 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004281 "mov %1,%5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004282 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004283 "shr $32,%5 \n"
4284 "pshufd $0xEE,%%xmm0,%%xmm0 \n"
4285#else
4286 "movd %%xmm0,%1 \n"
4287 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4288 "movd %%xmm0,%5 \n"
4289 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4290#endif
    // Fetch source pixels 0 and 1 and store them as one 8-byte write.
4291 "movd (%0,%1,1),%%xmm1 \n"
4292 "movd (%0,%5,1),%%xmm6 \n"
4293 "punpckldq %%xmm6,%%xmm1 \n"
4294 "addps %%xmm4,%%xmm2 \n"
4295 "movq %%xmm1,(%2) \n"
    // Same again for offsets 2 and 3.
4296#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004297 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004298 "mov %1,%5 \n"
4299 "and $0x0fffffff,%1 \n"
4300 "shr $32,%5 \n"
4301#else
4302 "movd %%xmm0,%1 \n"
4303 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4304 "movd %%xmm0,%5 \n"
4305#endif
fbarchard@google.com73444402012-08-09 17:33:29 +00004306 "movd (%0,%1,1),%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004307 "movd (%0,%5,1),%%xmm6 \n"
4308 "punpckldq %%xmm6,%%xmm0 \n"
4309 "addps %%xmm4,%%xmm3 \n"
4310 "sub $0x4,%4 \n"
    // movq and lea do not modify flags, so jge tests the sub above.
4311 "movq %%xmm0,0x08(%2) \n"
4312 "lea 0x10(%2),%2 \n"
4313 "jge 40b \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004314
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004315 "49: \n"
    // width was biased by -4 above; +3 leaves (remaining pixels - 1).
4316 "add $0x3,%4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004317 "jl 19f \n"
4318
4319 // 1 pixel loop.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004320 ".p2align 4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004321 "10: \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004322 "cvttps2dq %%xmm2,%%xmm0 \n"
4323 "packssdw %%xmm0,%%xmm0 \n"
4324 "pmaddwd %%xmm5,%%xmm0 \n"
4325 "addps %%xmm7,%%xmm2 \n"
4326 "movd %%xmm0,%1 \n"
    // With a 64-bit %1 the move above also carries the next lane in the
    // upper half; the mask discards it.
4327#if defined(__x86_64__)
fbarchard@google.com73444402012-08-09 17:33:29 +00004328 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004329#endif
fbarchard@google.com73444402012-08-09 17:33:29 +00004330 "movd (%0,%1,1),%%xmm0 \n"
4331 "sub $0x1,%4 \n"
4332 "movd %%xmm0,(%2) \n"
4333 "lea 0x4(%2),%2 \n"
4334 "jge 10b \n"
4335 "19: \n"
4336 : "+r"(src_argb), // %0
4337 "+r"(src_argb_stride_temp), // %1
4338 "+r"(dst_argb), // %2
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00004339 "+r"(src_dudv), // %3
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004340 "+rm"(width), // %4
4341 "+r"(temp) // %5
fbarchard@google.com73444402012-08-09 17:33:29 +00004342 :
4343 : "memory", "cc"
4344#if defined(__SSE2__)
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004345 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
fbarchard@google.com73444402012-08-09 17:33:29 +00004346#endif
4347 );
4348}
4349#endif // HAS_ARGBAFFINEROW_SSE2
4350
fbarchard@google.comb5491752012-11-20 09:44:46 +00004351// Bilinear image filtering.
4352// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
// Blends the row at src_argb with the row at src_argb + src_stride and
// writes the result to dst_argb, 16 bytes (4 ARGB pixels) per pass.
// source_y_fraction is halved first. Halved values 0, 0x20, 0x40 and 0x60
// branch to dedicated copy / pavgb loops; any other value uses the
// pmaddubsw loop with weight (128 - f) for the first row and f for the
// second, where f is the halved fraction, followed by a shift right by 7.
// All row accesses use movdqa, so src_argb, src_argb + src_stride and
// dst_argb must be 16-byte aligned.
// NOTE(review): source_y_fraction is presumably 0..255 and dst_width a
// positive multiple of 4 (every loop runs once before testing it) - confirm
// with callers.
4353 void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004354 ptrdiff_t src_stride, int dst_width,
4355 int source_y_fraction) {
4356 asm volatile (
    // dst_argb becomes a byte offset from src_argb.
4357 "sub %1,%0 \n"
    // Halve the fraction, then dispatch on the special cases.
4358 "shr %3 \n"
4359 "cmp $0x0,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004360 "je 100f \n"
4361 "cmp $0x20,%3 \n"
4362 "je 75f \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004363 "cmp $0x40,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004364 "je 50f \n"
4365 "cmp $0x60,%3 \n"
4366 "je 25f \n"
4367
    // xmm5 = byte pairs (128 - f, f) repeated across the register.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004368 "movd %3,%%xmm0 \n"
4369 "neg %3 \n"
4370 "add $0x80,%3 \n"
4371 "movd %3,%%xmm5 \n"
4372 "punpcklbw %%xmm0,%%xmm5 \n"
4373 "punpcklwd %%xmm5,%%xmm5 \n"
4374 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004375
4376 // General purpose row blend.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004377 ".p2align 4 \n"
4378 "1: \n"
4379 "movdqa (%1),%%xmm0 \n"
4380 "movdqa (%1,%4,1),%%xmm2 \n"
4381 "movdqa %%xmm0,%%xmm1 \n"
    // Interleave the two rows byte-wise so pmaddubsw can weight and sum
    // each (first row, second row) pair in one step.
4382 "punpcklbw %%xmm2,%%xmm0 \n"
4383 "punpckhbw %%xmm2,%%xmm1 \n"
4384 "pmaddubsw %%xmm5,%%xmm0 \n"
4385 "pmaddubsw %%xmm5,%%xmm1 \n"
4386 "psrlw $0x7,%%xmm0 \n"
4387 "psrlw $0x7,%%xmm1 \n"
4388 "packuswb %%xmm1,%%xmm0 \n"
4389 "sub $0x4,%2 \n"
4390 "movdqa %%xmm0,(%1,%0,1) \n"
4391 "lea 0x10(%1),%1 \n"
4392 "jg 1b \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004393 "jmp 99f \n"
4394
4395 // Blend 25 / 75.
    // Two pavgb steps against the second row weight it about 3:1.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004396 ".p2align 4 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004397 "25: \n"
4398 "movdqa (%1),%%xmm0 \n"
4399 "movdqa (%1,%4,1),%%xmm1 \n"
4400 "pavgb %%xmm1,%%xmm0 \n"
4401 "pavgb %%xmm1,%%xmm0 \n"
4402 "sub $0x4,%2 \n"
4403 "movdqa %%xmm0,(%1,%0,1) \n"
4404 "lea 0x10(%1),%1 \n"
4405 "jg 25b \n"
4406 "jmp 99f \n"
4407
4408 // Blend 50 / 50.
4409 ".p2align 4 \n"
4410 "50: \n"
4411 "movdqa (%1),%%xmm0 \n"
4412 "movdqa (%1,%4,1),%%xmm1 \n"
4413 "pavgb %%xmm1,%%xmm0 \n"
4414 "sub $0x4,%2 \n"
4415 "movdqa %%xmm0,(%1,%0,1) \n"
4416 "lea 0x10(%1),%1 \n"
4417 "jg 50b \n"
4418 "jmp 99f \n"
4419
4420 // Blend 75 / 25.
    // Same as the 25 / 75 loop with the two rows swapped.
4421 ".p2align 4 \n"
4422 "75: \n"
4423 "movdqa (%1),%%xmm1 \n"
4424 "movdqa (%1,%4,1),%%xmm0 \n"
4425 "pavgb %%xmm1,%%xmm0 \n"
4426 "pavgb %%xmm1,%%xmm0 \n"
4427 "sub $0x4,%2 \n"
4428 "movdqa %%xmm0,(%1,%0,1) \n"
4429 "lea 0x10(%1),%1 \n"
4430 "jg 75b \n"
4431 "jmp 99f \n"
4432
4433 // Blend 100 / 0 - Copy row unchanged.
4434 ".p2align 4 \n"
4435 "100: \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004436 "movdqa (%1),%%xmm0 \n"
4437 "sub $0x4,%2 \n"
4438 "movdqa %%xmm0,(%1,%0,1) \n"
4439 "lea 0x10(%1),%1 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004440 "jg 100b \n"
4441
fbarchard@google.comb5491752012-11-20 09:44:46 +00004442 "99: \n"
fbarchard@google.comaf137b62013-02-05 22:42:56 +00004443 : "+r"(dst_argb), // %0
4444 "+r"(src_argb), // %1
fbarchard@google.comb5491752012-11-20 09:44:46 +00004445 "+r"(dst_width), // %2
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004446 "+r"(source_y_fraction) // %3
4447 : "r"(static_cast<intptr_t>(src_stride)) // %4
4448 : "memory", "cc"
4449#if defined(__SSE2__)
4450 , "xmm0", "xmm1", "xmm2", "xmm5"
4451#endif
4452 );
4453}
4454
fbarchard@google.comaf137b62013-02-05 22:42:56 +00004455// Bilinear image filtering.
4456// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
// SSE2 counterpart of the SSSE3 row blend: combines the row at src_argb with
// the row at src_argb + src_stride into dst_argb, 16 bytes (4 ARGB pixels)
// per pass.
// source_y_fraction is halved first. Halved values 0, 0x20, 0x40 and 0x60
// branch to dedicated copy / pavgb loops. Any other value uses 16-bit
// arithmetic: the rows are widened to words, 2 * (second - first) is
// multiplied by a per-lane factor with pmulhw (high 16 bits kept), and the
// product is added to the first row.
// All row accesses use movdqa, so src_argb, src_argb + src_stride and
// dst_argb must be 16-byte aligned.
// NOTE(review): source_y_fraction is presumably 0..255 and dst_width a
// positive multiple of 4 (every loop runs once before testing it) - confirm
// with callers.
4457 void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
4458 ptrdiff_t src_stride, int dst_width,
4459 int source_y_fraction) {
4460 asm volatile (
    // dst_argb becomes a byte offset from src_argb.
4461 "sub %1,%0 \n"
    // Halve the fraction, then dispatch on the special cases.
4462 "shr %3 \n"
4463 "cmp $0x0,%3 \n"
4464 "je 100f \n"
4465 "cmp $0x20,%3 \n"
4466 "je 75f \n"
4467 "cmp $0x40,%3 \n"
4468 "je 50f \n"
4469 "cmp $0x60,%3 \n"
4470 "je 25f \n"
4471
    // With f the halved fraction, every 16-bit lane of xmm5 ends up as
    // (f << 8) | (128 - f); xmm4 = 0 for widening bytes to words.
4472 "movd %3,%%xmm0 \n"
4473 "neg %3 \n"
4474 "add $0x80,%3 \n"
4475 "movd %3,%%xmm5 \n"
4476 "punpcklbw %%xmm0,%%xmm5 \n"
4477 "punpcklwd %%xmm5,%%xmm5 \n"
4478 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4479 "pxor %%xmm4,%%xmm4 \n"
4480
4481 // General purpose row blend.
4482 ".p2align 4 \n"
4483 "1: \n"
4484 "movdqa (%1),%%xmm0 \n"
4485 "movdqa (%1,%4,1),%%xmm2 \n"
4486 "movdqa %%xmm0,%%xmm1 \n"
4487 "movdqa %%xmm2,%%xmm3 \n"
    // Widen both rows to words: xmm0/xmm1 = first row, xmm2/xmm3 = second.
4488 "punpcklbw %%xmm4,%%xmm2 \n"
4489 "punpckhbw %%xmm4,%%xmm3 \n"
4490 "punpcklbw %%xmm4,%%xmm0 \n"
4491 "punpckhbw %%xmm4,%%xmm1 \n"
    // first + high16(2 * (second - first) * xmm5), roughly
    // first + (second - first) * f / 128.
4492 "psubw %%xmm0,%%xmm2 \n"
4493 "psubw %%xmm1,%%xmm3 \n"
4494 "paddw %%xmm2,%%xmm2 \n"
4495 "paddw %%xmm3,%%xmm3 \n"
4496 "pmulhw %%xmm5,%%xmm2 \n"
4497 "pmulhw %%xmm5,%%xmm3 \n"
4498 "paddw %%xmm2,%%xmm0 \n"
4499 "paddw %%xmm3,%%xmm1 \n"
4500 "packuswb %%xmm1,%%xmm0 \n"
4501 "sub $0x4,%2 \n"
4502 "movdqa %%xmm0,(%1,%0,1) \n"
4503 "lea 0x10(%1),%1 \n"
4504 "jg 1b \n"
4505 "jmp 99f \n"
4506
4507 // Blend 25 / 75.
    // Two pavgb steps against the second row weight it about 3:1.
4508 ".p2align 4 \n"
4509 "25: \n"
4510 "movdqa (%1),%%xmm0 \n"
4511 "movdqa (%1,%4,1),%%xmm1 \n"
4512 "pavgb %%xmm1,%%xmm0 \n"
4513 "pavgb %%xmm1,%%xmm0 \n"
4514 "sub $0x4,%2 \n"
4515 "movdqa %%xmm0,(%1,%0,1) \n"
4516 "lea 0x10(%1),%1 \n"
4517 "jg 25b \n"
4518 "jmp 99f \n"
4519
4520 // Blend 50 / 50.
4521 ".p2align 4 \n"
4522 "50: \n"
4523 "movdqa (%1),%%xmm0 \n"
4524 "movdqa (%1,%4,1),%%xmm1 \n"
4525 "pavgb %%xmm1,%%xmm0 \n"
4526 "sub $0x4,%2 \n"
4527 "movdqa %%xmm0,(%1,%0,1) \n"
4528 "lea 0x10(%1),%1 \n"
4529 "jg 50b \n"
4530 "jmp 99f \n"
4531
4532 // Blend 75 / 25.
    // Same as the 25 / 75 loop with the two rows swapped.
4533 ".p2align 4 \n"
4534 "75: \n"
4535 "movdqa (%1),%%xmm1 \n"
4536 "movdqa (%1,%4,1),%%xmm0 \n"
4537 "pavgb %%xmm1,%%xmm0 \n"
4538 "pavgb %%xmm1,%%xmm0 \n"
4539 "sub $0x4,%2 \n"
4540 "movdqa %%xmm0,(%1,%0,1) \n"
4541 "lea 0x10(%1),%1 \n"
4542 "jg 75b \n"
4543 "jmp 99f \n"
4544
4545 // Blend 100 / 0 - Copy row unchanged.
4546 ".p2align 4 \n"
4547 "100: \n"
4548 "movdqa (%1),%%xmm0 \n"
4549 "sub $0x4,%2 \n"
4550 "movdqa %%xmm0,(%1,%0,1) \n"
4551 "lea 0x10(%1),%1 \n"
4552 "jg 100b \n"
4553
4554 "99: \n"
4555 : "+r"(dst_argb), // %0
4556 "+r"(src_argb), // %1
4557 "+r"(dst_width), // %2
4558 "+r"(source_y_fraction) // %3
4559 : "r"(static_cast<intptr_t>(src_stride)) // %4
4560 : "memory", "cc"
4561#if defined(__SSE2__)
4562 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4563#endif
4564 );
4565}
4566
// Averages the row at src_uv with the row at src_uv + src_uv_stride using
// pavgb (rounded byte average) and stores the result in dst_uv.
// pix is counted in bytes: each pass consumes 16 bytes and subtracts 0x10.
// The loads, the pavgb memory operand and the store all require 16-byte
// alignment. The loop runs once before pix is tested.
// NOTE(review): pix is presumably a positive multiple of 16 - confirm with
// callers.
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004567void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4568 uint8* dst_uv, int pix) {
4569 asm volatile (
    // dst_uv becomes a byte offset from src_uv.
fbarchard@google.com10965432013-03-08 23:22:32 +00004570 "sub %0,%1 \n"
4571 ".p2align 4 \n"
4572 "1: \n"
4573 "movdqa (%0),%%xmm0 \n"
4574 "pavgb (%0,%3),%%xmm0 \n"
4575 "sub $0x10,%2 \n"
4576 "movdqa %%xmm0,(%0,%1) \n"
    // lea leaves the flags alone, so jg still tests the result of the sub.
4577 "lea 0x10(%0),%0 \n"
4578 "jg 1b \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004579 : "+r"(src_uv), // %0
4580 "+r"(dst_uv), // %1
4581 "+r"(pix) // %2
4582 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4583 : "memory", "cc"
4584#if defined(__SSE2__)
4585 , "xmm0"
4586#endif
4587 );
4588}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004589
// Picks one byte per pixel from src_argb and writes it to dst_bayer.
// selector holds four byte indices that are broadcast to every dword of the
// pshufb control, so each 16-byte load (4 pixels) yields 4 selected bytes.
// A pass reads 32 bytes (8 pixels) with aligned loads and writes 8 bytes.
// The loop runs once before pix is tested.
// NOTE(review): pix is presumably a positive multiple of 8 - confirm with
// callers.
// NOTE(review): the "g" constraint on selector also admits an immediate,
// which movd cannot encode; harmless while selector arrives as a run-time
// value, but "rm" looks like the tighter match - confirm.
4590 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4591 uint32 selector, int pix) {
4592 asm volatile (
    // xmm5 = selector replicated into all four dwords.
fbarchard@google.com10965432013-03-08 23:22:32 +00004593 "movd %3,%%xmm5 \n"
4594 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004595 ".p2align 4 \n"
4596 "1: \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00004597 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004598 "movdqa 0x10(%0),%%xmm1 \n"
4599 "lea 0x20(%0),%0 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00004600 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004601 "pshufb %%xmm5,%%xmm1 \n"
    // Low 8 bytes = 4 selected bytes from xmm0 followed by 4 from xmm1.
fbarchard@google.coma3be4702013-03-22 05:20:02 +00004602 "punpckldq %%xmm1,%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004603 "sub $0x8,%2 \n"
4604 "movq %%xmm0,(%1) \n"
4605 "lea 0x8(%1),%1 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00004606 "jg 1b \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004607 : "+r"(src_argb), // %0
4608 "+r"(dst_bayer), // %1
4609 "+r"(pix) // %2
4610 : "g"(selector) // %3
4611 : "memory", "cc"
4612#if defined(__SSE2__)
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004613 , "xmm0", "xmm1", "xmm5"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004614#endif
4615 );
4616}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004617
fbarchard@google.com10965432013-03-08 23:22:32 +00004618// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of src_argb into dst_argb with pshufb, using the
// 16-byte control mask read from shuffler. The same mask is applied to each
// 16-byte group (4 pixels); a pass handles 32 bytes (8 pixels).
// src_argb, dst_argb and shuffler are all accessed with movdqa and must be
// 16-byte aligned. The loop runs once before pix is tested.
// NOTE(review): pix is presumably a positive multiple of 8 - confirm with
// callers.
4619 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4620 const uint8* shuffler, int pix) {
4621 asm volatile (
    // xmm5 = shuffle control, loaded once outside the loop.
4622 "movdqa (%3),%%xmm5 \n"
4623 ".p2align 4 \n"
4624 "1: \n"
4625 "movdqa (%0),%%xmm0 \n"
4626 "movdqa 0x10(%0),%%xmm1 \n"
4627 "lea 0x20(%0),%0 \n"
4628 "pshufb %%xmm5,%%xmm0 \n"
4629 "pshufb %%xmm5,%%xmm1 \n"
4630 "sub $0x8,%2 \n"
4631 "movdqa %%xmm0,(%1) \n"
4632 "movdqa %%xmm1,0x10(%1) \n"
4633 "lea 0x20(%1),%1 \n"
4634 "jg 1b \n"
4635 : "+r"(src_argb), // %0
4636 "+r"(dst_argb), // %1
4637 "+r"(pix) // %2
4638 : "r"(shuffler) // %3
4639 : "memory", "cc"
4640#if defined(__SSE2__)
4641 , "xmm0", "xmm1", "xmm5"
4642#endif
4643 );
4644}
4645
// Variant of the pshufb byte shuffle that reads src_argb and writes dst_argb
// with movdqu, so neither row needs 16-byte alignment. The control mask is
// still read from shuffler with movdqa and must be 16-byte aligned.
// A pass handles 32 bytes (8 pixels); the loop runs once before pix is
// tested.
// NOTE(review): pix is presumably a positive multiple of 8 - confirm with
// callers.
4646 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
4647 const uint8* shuffler, int pix) {
4648 asm volatile (
    // xmm5 = shuffle control, loaded once outside the loop.
4649 "movdqa (%3),%%xmm5 \n"
4650 ".p2align 4 \n"
4651 "1: \n"
4652 "movdqu (%0),%%xmm0 \n"
4653 "movdqu 0x10(%0),%%xmm1 \n"
4654 "lea 0x20(%0),%0 \n"
4655 "pshufb %%xmm5,%%xmm0 \n"
4656 "pshufb %%xmm5,%%xmm1 \n"
4657 "sub $0x8,%2 \n"
4658 "movdqu %%xmm0,(%1) \n"
4659 "movdqu %%xmm1,0x10(%1) \n"
4660 "lea 0x20(%1),%1 \n"
4661 "jg 1b \n"
4662 : "+r"(src_argb), // %0
4663 "+r"(dst_argb), // %1
4664 "+r"(pix) // %2
4665 : "r"(shuffler) // %3
4666 : "memory", "cc"
4667#if defined(__SSE2__)
4668 , "xmm0", "xmm1", "xmm5"
4669#endif
4670 );
4671}
4672
// Interleaves three planar rows into dst_frame with byte order
// Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
// A pass reads 16 bytes from src_y and 8 bytes each from src_u and src_v,
// and writes 32 bytes. src_y and dst_frame use movdqa and must be 16-byte
// aligned; src_u and src_v are read with movq. The loop runs once before
// width is tested.
// NOTE(review): width is presumably a positive multiple of 16 - confirm with
// callers.
fbarchard@google.com9de88672012-10-12 06:23:33 +00004673void I422ToYUY2Row_SSE2(const uint8* src_y,
4674 const uint8* src_u,
4675 const uint8* src_v,
4676 uint8* dst_frame, int width) {
4677 asm volatile (
    // src_v becomes a byte offset from src_u.
fbarchard@google.com10965432013-03-08 23:22:32 +00004678 "sub %1,%2 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00004679 ".p2align 4 \n"
4680 "1: \n"
    // xmm2 = U0 V0 U1 V1 ... (8 U bytes interleaved with 8 V bytes).
4681 "movq (%1),%%xmm2 \n"
4682 "movq (%1,%2,1),%%xmm3 \n"
4683 "lea 0x8(%1),%1 \n"
4684 "punpcklbw %%xmm3,%%xmm2 \n"
4685 "movdqa (%0),%%xmm0 \n"
4686 "lea 0x10(%0),%0 \n"
4687 "movdqa %%xmm0,%%xmm1 \n"
    // Y bytes go first in each pair, chroma second.
4688 "punpcklbw %%xmm2,%%xmm0 \n"
4689 "punpckhbw %%xmm2,%%xmm1 \n"
4690 "movdqa %%xmm0,(%3) \n"
4691 "movdqa %%xmm1,0x10(%3) \n"
4692 "lea 0x20(%3),%3 \n"
4693 "sub $0x10,%4 \n"
4694 "jg 1b \n"
4695 : "+r"(src_y), // %0
4696 "+r"(src_u), // %1
4697 "+r"(src_v), // %2
4698 "+r"(dst_frame), // %3
4699 "+rm"(width) // %4
4700 :
4701 : "memory", "cc"
4702#if defined(__SSE2__)
4703 , "xmm0", "xmm1", "xmm2", "xmm3"
4704#endif
4705 );
4706}
4707
// Interleaves three planar rows into dst_frame with byte order
// U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...
// A pass reads 16 bytes from src_y and 8 bytes each from src_u and src_v,
// and writes 32 bytes. src_y and dst_frame use movdqa and must be 16-byte
// aligned; src_u and src_v are read with movq. The loop runs once before
// width is tested.
// NOTE(review): width is presumably a positive multiple of 16 - confirm with
// callers.
4708 void I422ToUYVYRow_SSE2(const uint8* src_y,
4709 const uint8* src_u,
4710 const uint8* src_v,
4711 uint8* dst_frame, int width) {
4712 asm volatile (
    // src_v becomes a byte offset from src_u.
4713 "sub %1,%2 \n"
4714 ".p2align 4 \n"
4715 "1: \n"
    // xmm2 = U0 V0 U1 V1 ... (8 U bytes interleaved with 8 V bytes).
4716 "movq (%1),%%xmm2 \n"
4717 "movq (%1,%2,1),%%xmm3 \n"
4718 "lea 0x8(%1),%1 \n"
4719 "punpcklbw %%xmm3,%%xmm2 \n"
4720 "movdqa (%0),%%xmm0 \n"
4721 "movdqa %%xmm2,%%xmm1 \n"
4722 "lea 0x10(%0),%0 \n"
    // Chroma bytes go first in each pair, Y second.
4723 "punpcklbw %%xmm0,%%xmm1 \n"
4724 "punpckhbw %%xmm0,%%xmm2 \n"
4725 "movdqa %%xmm1,(%3) \n"
4726 "movdqa %%xmm2,0x10(%3) \n"
4727 "lea 0x20(%3),%3 \n"
4728 "sub $0x10,%4 \n"
4729 "jg 1b \n"
4730 : "+r"(src_y), // %0
4731 "+r"(src_u), // %1
4732 "+r"(src_v), // %2
4733 "+r"(dst_frame), // %3
4734 "+rm"(width) // %4
4735 :
4736 : "memory", "cc"
4737#if defined(__SSE2__)
4738 , "xmm0", "xmm1", "xmm2", "xmm3"
4739#endif
4740 );
4741}
4742
fbarchard@google.com2d11d432012-02-16 02:50:39 +00004743#endif // defined(__x86_64__) || defined(__i386__)
4744
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004745#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00004746} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004747} // namespace libyuv
4748#endif