blob: db2e5f5cc80ff9aae3c09e0df4faf1b482b42184 [file] [log] [blame]
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.com83a63e62013-02-27 00:20:29 +000021#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
fbarchard@google.com714050a2012-02-17 22:59:56 +000023// GCC 4.2 on OSX has link error when passing static or const to inline.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000025#ifdef __APPLE__
26#define CONST
27#else
28#define CONST static const
29#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
fbarchard@google.com4de0c432012-10-11 01:25:46 +000072// Constants for RGBA.
73CONST vec8 kRGBAToY = {
74 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
75};
76
77CONST vec8 kRGBAToU = {
78 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
79};
80
81CONST vec8 kRGBAToV = {
82 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
83};
84
fbarchard@google.com714050a2012-02-17 22:59:56 +000085CONST uvec8 kAddY16 = {
86 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000087};
fbarchard@google.com2430e042011-11-11 21:57:06 +000088
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000089CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
92};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000093
fbarchard@google.comba1f5262012-01-12 19:22:41 +000094// Shuffle table for converting RGB24 to ARGB.
95CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000096 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
97};
98
99// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000100CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000101 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
102};
103
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000104// Shuffle table for converting ARGB to RGB24.
105CONST uvec8 kShuffleMaskARGBToRGB24 = {
106 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
107};
108
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000109// Shuffle table for converting ARGB to RAW.
110CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000111 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000112};
113
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000114// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000115CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
116 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
117};
118
119// Shuffle table for converting ARGB to RAW.
120CONST uvec8 kShuffleMaskARGBToRAW_0 = {
121 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
122};
123
fbarchard@google.comb6149762011-11-07 21:58:52 +0000124void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000125 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000126 "pcmpeqb %%xmm5,%%xmm5 \n"
127 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000128 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000129 "1: \n"
130 "movq (%0),%%xmm0 \n"
131 "lea 0x8(%0),%0 \n"
132 "punpcklbw %%xmm0,%%xmm0 \n"
133 "movdqa %%xmm0,%%xmm1 \n"
134 "punpcklwd %%xmm0,%%xmm0 \n"
135 "punpckhwd %%xmm1,%%xmm1 \n"
136 "por %%xmm5,%%xmm0 \n"
137 "por %%xmm5,%%xmm1 \n"
138 "movdqa %%xmm0,(%1) \n"
139 "movdqa %%xmm1,0x10(%1) \n"
140 "lea 0x20(%1),%1 \n"
141 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000142 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000143 : "+r"(src_y), // %0
144 "+r"(dst_argb), // %1
145 "+r"(pix) // %2
146 :
147 : "memory", "cc"
148#if defined(__SSE2__)
149 , "xmm0", "xmm1", "xmm5"
150#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000151 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000152}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000153
fbarchard@google.com00b69a22012-11-02 06:03:28 +0000154void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
155 int pix) {
156 asm volatile (
157 "pcmpeqb %%xmm5,%%xmm5 \n"
158 "pslld $0x18,%%xmm5 \n"
159 ".p2align 4 \n"
160 "1: \n"
161 "movq (%0),%%xmm0 \n"
162 "lea 0x8(%0),%0 \n"
163 "punpcklbw %%xmm0,%%xmm0 \n"
164 "movdqa %%xmm0,%%xmm1 \n"
165 "punpcklwd %%xmm0,%%xmm0 \n"
166 "punpckhwd %%xmm1,%%xmm1 \n"
167 "por %%xmm5,%%xmm0 \n"
168 "por %%xmm5,%%xmm1 \n"
169 "movdqu %%xmm0,(%1) \n"
170 "movdqu %%xmm1,0x10(%1) \n"
171 "lea 0x20(%1),%1 \n"
172 "sub $0x8,%2 \n"
173 "jg 1b \n"
174 : "+r"(src_y), // %0
175 "+r"(dst_argb), // %1
176 "+r"(pix) // %2
177 :
178 : "memory", "cc"
179#if defined(__SSE2__)
180 , "xmm0", "xmm1", "xmm5"
181#endif
182 );
183}
184
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000185void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000186 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000187 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
188 "pslld $0x18,%%xmm5 \n"
189 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000190 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000191 "1: \n"
192 "movdqu (%0),%%xmm0 \n"
193 "movdqu 0x10(%0),%%xmm1 \n"
194 "movdqu 0x20(%0),%%xmm3 \n"
195 "lea 0x30(%0),%0 \n"
196 "movdqa %%xmm3,%%xmm2 \n"
197 "palignr $0x8,%%xmm1,%%xmm2 \n"
198 "pshufb %%xmm4,%%xmm2 \n"
199 "por %%xmm5,%%xmm2 \n"
200 "palignr $0xc,%%xmm0,%%xmm1 \n"
201 "pshufb %%xmm4,%%xmm0 \n"
202 "movdqa %%xmm2,0x20(%1) \n"
203 "por %%xmm5,%%xmm0 \n"
204 "pshufb %%xmm4,%%xmm1 \n"
205 "movdqa %%xmm0,(%1) \n"
206 "por %%xmm5,%%xmm1 \n"
207 "palignr $0x4,%%xmm3,%%xmm3 \n"
208 "pshufb %%xmm4,%%xmm3 \n"
209 "movdqa %%xmm1,0x10(%1) \n"
210 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000211 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000212 "movdqa %%xmm3,0x30(%1) \n"
213 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000214 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000215 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000216 "+r"(dst_argb), // %1
217 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000218 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000219 : "memory", "cc"
220#if defined(__SSE2__)
221 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
222#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000223 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000224}
225
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000226void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000227 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000228 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
229 "pslld $0x18,%%xmm5 \n"
230 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000231 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000232 "1: \n"
233 "movdqu (%0),%%xmm0 \n"
234 "movdqu 0x10(%0),%%xmm1 \n"
235 "movdqu 0x20(%0),%%xmm3 \n"
236 "lea 0x30(%0),%0 \n"
237 "movdqa %%xmm3,%%xmm2 \n"
238 "palignr $0x8,%%xmm1,%%xmm2 \n"
239 "pshufb %%xmm4,%%xmm2 \n"
240 "por %%xmm5,%%xmm2 \n"
241 "palignr $0xc,%%xmm0,%%xmm1 \n"
242 "pshufb %%xmm4,%%xmm0 \n"
243 "movdqa %%xmm2,0x20(%1) \n"
244 "por %%xmm5,%%xmm0 \n"
245 "pshufb %%xmm4,%%xmm1 \n"
246 "movdqa %%xmm0,(%1) \n"
247 "por %%xmm5,%%xmm1 \n"
248 "palignr $0x4,%%xmm3,%%xmm3 \n"
249 "pshufb %%xmm4,%%xmm3 \n"
250 "movdqa %%xmm1,0x10(%1) \n"
251 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000252 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000253 "movdqa %%xmm3,0x30(%1) \n"
254 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000255 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000256 : "+r"(src_raw), // %0
257 "+r"(dst_argb), // %1
258 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000259 : "m"(kShuffleMaskRAWToARGB) // %3
260 : "memory", "cc"
261#if defined(__SSE2__)
262 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
263#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000264 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000265}
266
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000267void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000268 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000269 "mov $0x1080108,%%eax \n"
270 "movd %%eax,%%xmm5 \n"
271 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000272 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000273 "movd %%eax,%%xmm6 \n"
274 "pshufd $0x0,%%xmm6,%%xmm6 \n"
275 "pcmpeqb %%xmm3,%%xmm3 \n"
276 "psllw $0xb,%%xmm3 \n"
277 "pcmpeqb %%xmm4,%%xmm4 \n"
278 "psllw $0xa,%%xmm4 \n"
279 "psrlw $0x5,%%xmm4 \n"
280 "pcmpeqb %%xmm7,%%xmm7 \n"
281 "psllw $0x8,%%xmm7 \n"
282 "sub %0,%1 \n"
283 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000284 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000285 "1: \n"
286 "movdqu (%0),%%xmm0 \n"
287 "movdqa %%xmm0,%%xmm1 \n"
288 "movdqa %%xmm0,%%xmm2 \n"
289 "pand %%xmm3,%%xmm1 \n"
290 "psllw $0xb,%%xmm2 \n"
291 "pmulhuw %%xmm5,%%xmm1 \n"
292 "pmulhuw %%xmm5,%%xmm2 \n"
293 "psllw $0x8,%%xmm1 \n"
294 "por %%xmm2,%%xmm1 \n"
295 "pand %%xmm4,%%xmm0 \n"
296 "pmulhuw %%xmm6,%%xmm0 \n"
297 "por %%xmm7,%%xmm0 \n"
298 "movdqa %%xmm1,%%xmm2 \n"
299 "punpcklbw %%xmm0,%%xmm1 \n"
300 "punpckhbw %%xmm0,%%xmm2 \n"
301 "movdqa %%xmm1,(%1,%0,2) \n"
302 "movdqa %%xmm2,0x10(%1,%0,2) \n"
303 "lea 0x10(%0),%0 \n"
304 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000305 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000306 : "+r"(src), // %0
307 "+r"(dst), // %1
308 "+r"(pix) // %2
309 :
310 : "memory", "cc", "eax"
311#if defined(__SSE2__)
312 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
313#endif
314 );
315}
316
317void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000318 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000319 "mov $0x1080108,%%eax \n"
320 "movd %%eax,%%xmm5 \n"
321 "pshufd $0x0,%%xmm5,%%xmm5 \n"
322 "mov $0x42004200,%%eax \n"
323 "movd %%eax,%%xmm6 \n"
324 "pshufd $0x0,%%xmm6,%%xmm6 \n"
325 "pcmpeqb %%xmm3,%%xmm3 \n"
326 "psllw $0xb,%%xmm3 \n"
327 "movdqa %%xmm3,%%xmm4 \n"
328 "psrlw $0x6,%%xmm4 \n"
329 "pcmpeqb %%xmm7,%%xmm7 \n"
330 "psllw $0x8,%%xmm7 \n"
331 "sub %0,%1 \n"
332 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000333 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000334 "1: \n"
335 "movdqu (%0),%%xmm0 \n"
336 "movdqa %%xmm0,%%xmm1 \n"
337 "movdqa %%xmm0,%%xmm2 \n"
338 "psllw $0x1,%%xmm1 \n"
339 "psllw $0xb,%%xmm2 \n"
340 "pand %%xmm3,%%xmm1 \n"
341 "pmulhuw %%xmm5,%%xmm2 \n"
342 "pmulhuw %%xmm5,%%xmm1 \n"
343 "psllw $0x8,%%xmm1 \n"
344 "por %%xmm2,%%xmm1 \n"
345 "movdqa %%xmm0,%%xmm2 \n"
346 "pand %%xmm4,%%xmm0 \n"
347 "psraw $0x8,%%xmm2 \n"
348 "pmulhuw %%xmm6,%%xmm0 \n"
349 "pand %%xmm7,%%xmm2 \n"
350 "por %%xmm2,%%xmm0 \n"
351 "movdqa %%xmm1,%%xmm2 \n"
352 "punpcklbw %%xmm0,%%xmm1 \n"
353 "punpckhbw %%xmm0,%%xmm2 \n"
354 "movdqa %%xmm1,(%1,%0,2) \n"
355 "movdqa %%xmm2,0x10(%1,%0,2) \n"
356 "lea 0x10(%0),%0 \n"
357 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000358 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000359 : "+r"(src), // %0
360 "+r"(dst), // %1
361 "+r"(pix) // %2
362 :
363 : "memory", "cc", "eax"
364#if defined(__SSE2__)
365 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
366#endif
367 );
368}
369
370void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000371 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000372 "mov $0xf0f0f0f,%%eax \n"
373 "movd %%eax,%%xmm4 \n"
374 "pshufd $0x0,%%xmm4,%%xmm4 \n"
375 "movdqa %%xmm4,%%xmm5 \n"
376 "pslld $0x4,%%xmm5 \n"
377 "sub %0,%1 \n"
378 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000379 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000380 "1: \n"
381 "movdqu (%0),%%xmm0 \n"
382 "movdqa %%xmm0,%%xmm2 \n"
383 "pand %%xmm4,%%xmm0 \n"
384 "pand %%xmm5,%%xmm2 \n"
385 "movdqa %%xmm0,%%xmm1 \n"
386 "movdqa %%xmm2,%%xmm3 \n"
387 "psllw $0x4,%%xmm1 \n"
388 "psrlw $0x4,%%xmm3 \n"
389 "por %%xmm1,%%xmm0 \n"
390 "por %%xmm3,%%xmm2 \n"
391 "movdqa %%xmm0,%%xmm1 \n"
392 "punpcklbw %%xmm2,%%xmm0 \n"
393 "punpckhbw %%xmm2,%%xmm1 \n"
394 "movdqa %%xmm0,(%1,%0,2) \n"
395 "movdqa %%xmm1,0x10(%1,%0,2) \n"
396 "lea 0x10(%0),%0 \n"
397 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000398 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000399 : "+r"(src), // %0
400 "+r"(dst), // %1
401 "+r"(pix) // %2
402 :
403 : "memory", "cc", "eax"
404#if defined(__SSE2__)
405 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
406#endif
407 );
408}
409
410void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000411 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000412 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000413 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000414 "1: \n"
415 "movdqa (%0),%%xmm0 \n"
416 "movdqa 0x10(%0),%%xmm1 \n"
417 "movdqa 0x20(%0),%%xmm2 \n"
418 "movdqa 0x30(%0),%%xmm3 \n"
419 "lea 0x40(%0),%0 \n"
420 "pshufb %%xmm6,%%xmm0 \n"
421 "pshufb %%xmm6,%%xmm1 \n"
422 "pshufb %%xmm6,%%xmm2 \n"
423 "pshufb %%xmm6,%%xmm3 \n"
424 "movdqa %%xmm1,%%xmm4 \n"
425 "psrldq $0x4,%%xmm1 \n"
426 "pslldq $0xc,%%xmm4 \n"
427 "movdqa %%xmm2,%%xmm5 \n"
428 "por %%xmm4,%%xmm0 \n"
429 "pslldq $0x8,%%xmm5 \n"
430 "movdqa %%xmm0,(%1) \n"
431 "por %%xmm5,%%xmm1 \n"
432 "psrldq $0x8,%%xmm2 \n"
433 "pslldq $0x4,%%xmm3 \n"
434 "por %%xmm3,%%xmm2 \n"
435 "movdqa %%xmm1,0x10(%1) \n"
436 "movdqa %%xmm2,0x20(%1) \n"
437 "lea 0x30(%1),%1 \n"
438 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000439 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000440 : "+r"(src), // %0
441 "+r"(dst), // %1
442 "+r"(pix) // %2
443 : "m"(kShuffleMaskARGBToRGB24) // %3
444 : "memory", "cc"
445#if defined(__SSE2__)
446 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
447#endif
448 );
449}
450
451void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000452 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000453 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000454 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000455 "1: \n"
456 "movdqa (%0),%%xmm0 \n"
457 "movdqa 0x10(%0),%%xmm1 \n"
458 "movdqa 0x20(%0),%%xmm2 \n"
459 "movdqa 0x30(%0),%%xmm3 \n"
460 "lea 0x40(%0),%0 \n"
461 "pshufb %%xmm6,%%xmm0 \n"
462 "pshufb %%xmm6,%%xmm1 \n"
463 "pshufb %%xmm6,%%xmm2 \n"
464 "pshufb %%xmm6,%%xmm3 \n"
465 "movdqa %%xmm1,%%xmm4 \n"
466 "psrldq $0x4,%%xmm1 \n"
467 "pslldq $0xc,%%xmm4 \n"
468 "movdqa %%xmm2,%%xmm5 \n"
469 "por %%xmm4,%%xmm0 \n"
470 "pslldq $0x8,%%xmm5 \n"
471 "movdqa %%xmm0,(%1) \n"
472 "por %%xmm5,%%xmm1 \n"
473 "psrldq $0x8,%%xmm2 \n"
474 "pslldq $0x4,%%xmm3 \n"
475 "por %%xmm3,%%xmm2 \n"
476 "movdqa %%xmm1,0x10(%1) \n"
477 "movdqa %%xmm2,0x20(%1) \n"
478 "lea 0x30(%1),%1 \n"
479 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000480 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000481 : "+r"(src), // %0
482 "+r"(dst), // %1
483 "+r"(pix) // %2
484 : "m"(kShuffleMaskARGBToRAW) // %3
485 : "memory", "cc"
486#if defined(__SSE2__)
487 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
488#endif
489 );
490}
491
fbarchard@google.comdbcabea2012-10-29 21:20:25 +0000492void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000493 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000494 "pcmpeqb %%xmm3,%%xmm3 \n"
495 "psrld $0x1b,%%xmm3 \n"
496 "pcmpeqb %%xmm4,%%xmm4 \n"
497 "psrld $0x1a,%%xmm4 \n"
498 "pslld $0x5,%%xmm4 \n"
499 "pcmpeqb %%xmm5,%%xmm5 \n"
500 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000501 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000502 "1: \n"
503 "movdqa (%0),%%xmm0 \n"
504 "movdqa %%xmm0,%%xmm1 \n"
505 "movdqa %%xmm0,%%xmm2 \n"
506 "pslld $0x8,%%xmm0 \n"
507 "psrld $0x3,%%xmm1 \n"
508 "psrld $0x5,%%xmm2 \n"
509 "psrad $0x10,%%xmm0 \n"
510 "pand %%xmm3,%%xmm1 \n"
511 "pand %%xmm4,%%xmm2 \n"
512 "pand %%xmm5,%%xmm0 \n"
513 "por %%xmm2,%%xmm1 \n"
514 "por %%xmm1,%%xmm0 \n"
515 "packssdw %%xmm0,%%xmm0 \n"
516 "lea 0x10(%0),%0 \n"
517 "movq %%xmm0,(%1) \n"
518 "lea 0x8(%1),%1 \n"
519 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000520 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000521 : "+r"(src), // %0
522 "+r"(dst), // %1
523 "+r"(pix) // %2
524 :
525 : "memory", "cc"
526#if defined(__SSE2__)
527 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
528#endif
529 );
530}
531
532void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000533 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000534 "pcmpeqb %%xmm4,%%xmm4 \n"
535 "psrld $0x1b,%%xmm4 \n"
536 "movdqa %%xmm4,%%xmm5 \n"
537 "pslld $0x5,%%xmm5 \n"
538 "movdqa %%xmm4,%%xmm6 \n"
539 "pslld $0xa,%%xmm6 \n"
540 "pcmpeqb %%xmm7,%%xmm7 \n"
541 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000542 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000543 "1: \n"
544 "movdqa (%0),%%xmm0 \n"
545 "movdqa %%xmm0,%%xmm1 \n"
546 "movdqa %%xmm0,%%xmm2 \n"
547 "movdqa %%xmm0,%%xmm3 \n"
548 "psrad $0x10,%%xmm0 \n"
549 "psrld $0x3,%%xmm1 \n"
550 "psrld $0x6,%%xmm2 \n"
551 "psrld $0x9,%%xmm3 \n"
552 "pand %%xmm7,%%xmm0 \n"
553 "pand %%xmm4,%%xmm1 \n"
554 "pand %%xmm5,%%xmm2 \n"
555 "pand %%xmm6,%%xmm3 \n"
556 "por %%xmm1,%%xmm0 \n"
557 "por %%xmm3,%%xmm2 \n"
558 "por %%xmm2,%%xmm0 \n"
559 "packssdw %%xmm0,%%xmm0 \n"
560 "lea 0x10(%0),%0 \n"
561 "movq %%xmm0,(%1) \n"
562 "lea 0x8(%1),%1 \n"
563 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000564 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000565 : "+r"(src), // %0
566 "+r"(dst), // %1
567 "+r"(pix) // %2
568 :
569 : "memory", "cc"
570#if defined(__SSE2__)
571 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
572#endif
573 );
574}
575
576void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000577 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000578 "pcmpeqb %%xmm4,%%xmm4 \n"
579 "psllw $0xc,%%xmm4 \n"
580 "movdqa %%xmm4,%%xmm3 \n"
581 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000582 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000583 "1: \n"
584 "movdqa (%0),%%xmm0 \n"
585 "movdqa %%xmm0,%%xmm1 \n"
586 "pand %%xmm3,%%xmm0 \n"
587 "pand %%xmm4,%%xmm1 \n"
588 "psrlq $0x4,%%xmm0 \n"
589 "psrlq $0x8,%%xmm1 \n"
590 "por %%xmm1,%%xmm0 \n"
591 "packuswb %%xmm0,%%xmm0 \n"
592 "lea 0x10(%0),%0 \n"
593 "movq %%xmm0,(%1) \n"
594 "lea 0x8(%1),%1 \n"
595 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000596 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000597 : "+r"(src), // %0
598 "+r"(dst), // %1
599 "+r"(pix) // %2
600 :
601 : "memory", "cc"
602#if defined(__SSE2__)
603 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
604#endif
605 );
606}
607
fbarchard@google.comb6149762011-11-07 21:58:52 +0000608void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000609 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000610 "movdqa %4,%%xmm5 \n"
611 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000612 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000613 "1: \n"
614 "movdqa (%0),%%xmm0 \n"
615 "movdqa 0x10(%0),%%xmm1 \n"
616 "movdqa 0x20(%0),%%xmm2 \n"
617 "movdqa 0x30(%0),%%xmm3 \n"
618 "pmaddubsw %%xmm4,%%xmm0 \n"
619 "pmaddubsw %%xmm4,%%xmm1 \n"
620 "pmaddubsw %%xmm4,%%xmm2 \n"
621 "pmaddubsw %%xmm4,%%xmm3 \n"
622 "lea 0x40(%0),%0 \n"
623 "phaddw %%xmm1,%%xmm0 \n"
624 "phaddw %%xmm3,%%xmm2 \n"
625 "psrlw $0x7,%%xmm0 \n"
626 "psrlw $0x7,%%xmm2 \n"
627 "packuswb %%xmm2,%%xmm0 \n"
628 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000629 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000630 "movdqa %%xmm0,(%1) \n"
631 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000632 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000633 : "+r"(src_argb), // %0
634 "+r"(dst_y), // %1
635 "+r"(pix) // %2
636 : "m"(kARGBToY), // %3
637 "m"(kAddY16) // %4
638 : "memory", "cc"
639#if defined(__SSE2__)
640 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
641#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000642 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000643}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000644
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000645void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
646 asm volatile (
647 "movdqa %3,%%xmm4 \n"
648 ".p2align 4 \n"
649 "1: \n"
650 "movdqa (%0),%%xmm0 \n"
651 "movdqa 0x10(%0),%%xmm1 \n"
652 "movdqa 0x20(%0),%%xmm2 \n"
653 "movdqa 0x30(%0),%%xmm3 \n"
654 "pmaddubsw %%xmm4,%%xmm0 \n"
655 "pmaddubsw %%xmm4,%%xmm1 \n"
656 "pmaddubsw %%xmm4,%%xmm2 \n"
657 "pmaddubsw %%xmm4,%%xmm3 \n"
658 "lea 0x40(%0),%0 \n"
659 "phaddw %%xmm1,%%xmm0 \n"
660 "phaddw %%xmm3,%%xmm2 \n"
661 "psrlw $0x7,%%xmm0 \n"
662 "psrlw $0x7,%%xmm2 \n"
663 "packuswb %%xmm2,%%xmm0 \n"
664 "sub $0x10,%2 \n"
665 "movdqa %%xmm0,(%1) \n"
666 "lea 0x10(%1),%1 \n"
667 "jg 1b \n"
668 : "+r"(src_argb), // %0
669 "+r"(dst_y), // %1
670 "+r"(pix) // %2
671 : "m"(kARGBToY) // %3
672 : "memory", "cc"
673#if defined(__SSE2__)
674 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
675#endif
676 );
677}
678
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000679void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000680 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000681 "movdqa %4,%%xmm5 \n"
682 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000683 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000684 "1: \n"
685 "movdqu (%0),%%xmm0 \n"
686 "movdqu 0x10(%0),%%xmm1 \n"
687 "movdqu 0x20(%0),%%xmm2 \n"
688 "movdqu 0x30(%0),%%xmm3 \n"
689 "pmaddubsw %%xmm4,%%xmm0 \n"
690 "pmaddubsw %%xmm4,%%xmm1 \n"
691 "pmaddubsw %%xmm4,%%xmm2 \n"
692 "pmaddubsw %%xmm4,%%xmm3 \n"
693 "lea 0x40(%0),%0 \n"
694 "phaddw %%xmm1,%%xmm0 \n"
695 "phaddw %%xmm3,%%xmm2 \n"
696 "psrlw $0x7,%%xmm0 \n"
697 "psrlw $0x7,%%xmm2 \n"
698 "packuswb %%xmm2,%%xmm0 \n"
699 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000700 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000701 "movdqu %%xmm0,(%1) \n"
702 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000703 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000704 : "+r"(src_argb), // %0
705 "+r"(dst_y), // %1
706 "+r"(pix) // %2
707 : "m"(kARGBToY), // %3
708 "m"(kAddY16) // %4
709 : "memory", "cc"
710#if defined(__SSE2__)
711 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
712#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000713 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000714}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000715
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000716void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
717 asm volatile (
718 "movdqa %3,%%xmm4 \n"
719 ".p2align 4 \n"
720 "1: \n"
721 "movdqu (%0),%%xmm0 \n"
722 "movdqu 0x10(%0),%%xmm1 \n"
723 "movdqu 0x20(%0),%%xmm2 \n"
724 "movdqu 0x30(%0),%%xmm3 \n"
725 "pmaddubsw %%xmm4,%%xmm0 \n"
726 "pmaddubsw %%xmm4,%%xmm1 \n"
727 "pmaddubsw %%xmm4,%%xmm2 \n"
728 "pmaddubsw %%xmm4,%%xmm3 \n"
729 "lea 0x40(%0),%0 \n"
730 "phaddw %%xmm1,%%xmm0 \n"
731 "phaddw %%xmm3,%%xmm2 \n"
732 "psrlw $0x7,%%xmm0 \n"
733 "psrlw $0x7,%%xmm2 \n"
734 "packuswb %%xmm2,%%xmm0 \n"
735 "sub $0x10,%2 \n"
736 "movdqu %%xmm0,(%1) \n"
737 "lea 0x10(%1),%1 \n"
738 "jg 1b \n"
739 : "+r"(src_argb), // %0
740 "+r"(dst_y), // %1
741 "+r"(pix) // %2
742 : "m"(kARGBToY) // %3
743 : "memory", "cc"
744#if defined(__SSE2__)
745 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
746#endif
747 );
748}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000749// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000750// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
751// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
752// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000753// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000754void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
755 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000756 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000757 "movdqa %0,%%xmm4 \n"
758 "movdqa %1,%%xmm3 \n"
759 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000760 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000761 : "m"(kARGBToU), // %0
762 "m"(kARGBToV), // %1
763 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000764 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000765 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000766 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000767 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000768 "1: \n"
769 "movdqa (%0),%%xmm0 \n"
770 "movdqa 0x10(%0),%%xmm1 \n"
771 "movdqa 0x20(%0),%%xmm2 \n"
772 "movdqa 0x30(%0),%%xmm6 \n"
773 "pavgb (%0,%4,1),%%xmm0 \n"
774 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
775 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
776 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
777 "lea 0x40(%0),%0 \n"
778 "movdqa %%xmm0,%%xmm7 \n"
779 "shufps $0x88,%%xmm1,%%xmm0 \n"
780 "shufps $0xdd,%%xmm1,%%xmm7 \n"
781 "pavgb %%xmm7,%%xmm0 \n"
782 "movdqa %%xmm2,%%xmm7 \n"
783 "shufps $0x88,%%xmm6,%%xmm2 \n"
784 "shufps $0xdd,%%xmm6,%%xmm7 \n"
785 "pavgb %%xmm7,%%xmm2 \n"
786 "movdqa %%xmm0,%%xmm1 \n"
787 "movdqa %%xmm2,%%xmm6 \n"
788 "pmaddubsw %%xmm4,%%xmm0 \n"
789 "pmaddubsw %%xmm4,%%xmm2 \n"
790 "pmaddubsw %%xmm3,%%xmm1 \n"
791 "pmaddubsw %%xmm3,%%xmm6 \n"
792 "phaddw %%xmm2,%%xmm0 \n"
793 "phaddw %%xmm6,%%xmm1 \n"
794 "psraw $0x8,%%xmm0 \n"
795 "psraw $0x8,%%xmm1 \n"
796 "packsswb %%xmm1,%%xmm0 \n"
797 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000798 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000799 "movlps %%xmm0,(%1) \n"
800 "movhps %%xmm0,(%1,%2,1) \n"
801 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000802 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000803 : "+r"(src_argb0), // %0
804 "+r"(dst_u), // %1
805 "+r"(dst_v), // %2
806 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000807 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000808 : "memory", "cc"
809#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000810 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000811#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000812 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000813}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000814
815void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
816 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000817 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000818 "movdqa %0,%%xmm4 \n"
819 "movdqa %1,%%xmm3 \n"
820 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000821 :
822 : "m"(kARGBToU), // %0
823 "m"(kARGBToV), // %1
824 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000825 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000826 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000827 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000828 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000829 "1: \n"
830 "movdqu (%0),%%xmm0 \n"
831 "movdqu 0x10(%0),%%xmm1 \n"
832 "movdqu 0x20(%0),%%xmm2 \n"
833 "movdqu 0x30(%0),%%xmm6 \n"
834 "movdqu (%0,%4,1),%%xmm7 \n"
835 "pavgb %%xmm7,%%xmm0 \n"
836 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
837 "pavgb %%xmm7,%%xmm1 \n"
838 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
839 "pavgb %%xmm7,%%xmm2 \n"
840 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
841 "pavgb %%xmm7,%%xmm6 \n"
842 "lea 0x40(%0),%0 \n"
843 "movdqa %%xmm0,%%xmm7 \n"
844 "shufps $0x88,%%xmm1,%%xmm0 \n"
845 "shufps $0xdd,%%xmm1,%%xmm7 \n"
846 "pavgb %%xmm7,%%xmm0 \n"
847 "movdqa %%xmm2,%%xmm7 \n"
848 "shufps $0x88,%%xmm6,%%xmm2 \n"
849 "shufps $0xdd,%%xmm6,%%xmm7 \n"
850 "pavgb %%xmm7,%%xmm2 \n"
851 "movdqa %%xmm0,%%xmm1 \n"
852 "movdqa %%xmm2,%%xmm6 \n"
853 "pmaddubsw %%xmm4,%%xmm0 \n"
854 "pmaddubsw %%xmm4,%%xmm2 \n"
855 "pmaddubsw %%xmm3,%%xmm1 \n"
856 "pmaddubsw %%xmm3,%%xmm6 \n"
857 "phaddw %%xmm2,%%xmm0 \n"
858 "phaddw %%xmm6,%%xmm1 \n"
859 "psraw $0x8,%%xmm0 \n"
860 "psraw $0x8,%%xmm1 \n"
861 "packsswb %%xmm1,%%xmm0 \n"
862 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000863 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000864 "movlps %%xmm0,(%1) \n"
865 "movhps %%xmm0,(%1,%2,1) \n"
866 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000867 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000868 : "+r"(src_argb0), // %0
869 "+r"(dst_u), // %1
870 "+r"(dst_v), // %2
871 "+rm"(width) // %3
872 : "r"(static_cast<intptr_t>(src_stride_argb))
873 : "memory", "cc"
874#if defined(__SSE2__)
875 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
876#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000877 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000878}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000879
fbarchard@google.com762c0502013-02-04 18:47:21 +0000880void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
881 int width) {
882 asm volatile (
883 "movdqa %0,%%xmm4 \n"
884 "movdqa %1,%%xmm3 \n"
885 "movdqa %2,%%xmm5 \n"
886 :
887 : "m"(kARGBToU), // %0
888 "m"(kARGBToV), // %1
889 "m"(kAddUV128) // %2
890 );
891 asm volatile (
892 "sub %1,%2 \n"
893 ".p2align 4 \n"
894 "1: \n"
895 "movdqa (%0),%%xmm0 \n"
896 "movdqa 0x10(%0),%%xmm1 \n"
897 "movdqa 0x20(%0),%%xmm2 \n"
898 "movdqa 0x30(%0),%%xmm6 \n"
899 "pmaddubsw %%xmm4,%%xmm0 \n"
900 "pmaddubsw %%xmm4,%%xmm1 \n"
901 "pmaddubsw %%xmm4,%%xmm2 \n"
902 "pmaddubsw %%xmm4,%%xmm6 \n"
903 "phaddw %%xmm1,%%xmm0 \n"
904 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000905 "psraw $0x8,%%xmm0 \n"
906 "psraw $0x8,%%xmm2 \n"
907 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000908 "paddb %%xmm5,%%xmm0 \n"
909 "sub $0x10,%3 \n"
910 "movdqa %%xmm0,(%1) \n"
911 "movdqa (%0),%%xmm0 \n"
912 "movdqa 0x10(%0),%%xmm1 \n"
913 "movdqa 0x20(%0),%%xmm2 \n"
914 "movdqa 0x30(%0),%%xmm6 \n"
915 "pmaddubsw %%xmm3,%%xmm0 \n"
916 "pmaddubsw %%xmm3,%%xmm1 \n"
917 "pmaddubsw %%xmm3,%%xmm2 \n"
918 "pmaddubsw %%xmm3,%%xmm6 \n"
919 "phaddw %%xmm1,%%xmm0 \n"
920 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000921 "psraw $0x8,%%xmm0 \n"
922 "psraw $0x8,%%xmm2 \n"
923 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000924 "paddb %%xmm5,%%xmm0 \n"
925 "lea 0x40(%0),%0 \n"
926 "movdqa %%xmm0,(%1,%2,1) \n"
927 "lea 0x10(%1),%1 \n"
928 "jg 1b \n"
929 : "+r"(src_argb), // %0
930 "+r"(dst_u), // %1
931 "+r"(dst_v), // %2
932 "+rm"(width) // %3
933 :
934 : "memory", "cc"
935#if defined(__SSE2__)
936 , "xmm0", "xmm1", "xmm2", "xmm6"
937#endif
938 );
939}
940
941void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
942 uint8* dst_v, int width) {
943 asm volatile (
944 "movdqa %0,%%xmm4 \n"
945 "movdqa %1,%%xmm3 \n"
946 "movdqa %2,%%xmm5 \n"
947 :
948 : "m"(kARGBToU), // %0
949 "m"(kARGBToV), // %1
950 "m"(kAddUV128) // %2
951 );
952 asm volatile (
953 "sub %1,%2 \n"
954 ".p2align 4 \n"
955 "1: \n"
956 "movdqu (%0),%%xmm0 \n"
957 "movdqu 0x10(%0),%%xmm1 \n"
958 "movdqu 0x20(%0),%%xmm2 \n"
959 "movdqu 0x30(%0),%%xmm6 \n"
960 "pmaddubsw %%xmm4,%%xmm0 \n"
961 "pmaddubsw %%xmm4,%%xmm1 \n"
962 "pmaddubsw %%xmm4,%%xmm2 \n"
963 "pmaddubsw %%xmm4,%%xmm6 \n"
964 "phaddw %%xmm1,%%xmm0 \n"
965 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000966 "psraw $0x8,%%xmm0 \n"
967 "psraw $0x8,%%xmm2 \n"
968 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000969 "paddb %%xmm5,%%xmm0 \n"
970 "sub $0x10,%3 \n"
971 "movdqu %%xmm0,(%1) \n"
972 "movdqu (%0),%%xmm0 \n"
973 "movdqu 0x10(%0),%%xmm1 \n"
974 "movdqu 0x20(%0),%%xmm2 \n"
975 "movdqu 0x30(%0),%%xmm6 \n"
976 "pmaddubsw %%xmm3,%%xmm0 \n"
977 "pmaddubsw %%xmm3,%%xmm1 \n"
978 "pmaddubsw %%xmm3,%%xmm2 \n"
979 "pmaddubsw %%xmm3,%%xmm6 \n"
980 "phaddw %%xmm1,%%xmm0 \n"
981 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000982 "psraw $0x8,%%xmm0 \n"
983 "psraw $0x8,%%xmm2 \n"
984 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000985 "paddb %%xmm5,%%xmm0 \n"
986 "lea 0x40(%0),%0 \n"
987 "movdqu %%xmm0,(%1,%2,1) \n"
988 "lea 0x10(%1),%1 \n"
989 "jg 1b \n"
990 : "+r"(src_argb), // %0
991 "+r"(dst_u), // %1
992 "+r"(dst_v), // %2
993 "+rm"(width) // %3
994 :
995 : "memory", "cc"
996#if defined(__SSE2__)
997 , "xmm0", "xmm1", "xmm2", "xmm6"
998#endif
999 );
1000}
1001
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001002void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1003 uint8* dst_u, uint8* dst_v, int width) {
1004 asm volatile (
1005 "movdqa %0,%%xmm4 \n"
1006 "movdqa %1,%%xmm3 \n"
1007 "movdqa %2,%%xmm5 \n"
1008 :
1009 : "m"(kARGBToU), // %0
1010 "m"(kARGBToV), // %1
1011 "m"(kAddUV128) // %2
1012 );
1013 asm volatile (
1014 "sub %1,%2 \n"
1015 ".p2align 4 \n"
1016 "1: \n"
1017 "movdqa (%0),%%xmm0 \n"
1018 "movdqa 0x10(%0),%%xmm1 \n"
1019 "movdqa 0x20(%0),%%xmm2 \n"
1020 "movdqa 0x30(%0),%%xmm6 \n"
1021 "lea 0x40(%0),%0 \n"
1022 "movdqa %%xmm0,%%xmm7 \n"
1023 "shufps $0x88,%%xmm1,%%xmm0 \n"
1024 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1025 "pavgb %%xmm7,%%xmm0 \n"
1026 "movdqa %%xmm2,%%xmm7 \n"
1027 "shufps $0x88,%%xmm6,%%xmm2 \n"
1028 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1029 "pavgb %%xmm7,%%xmm2 \n"
1030 "movdqa %%xmm0,%%xmm1 \n"
1031 "movdqa %%xmm2,%%xmm6 \n"
1032 "pmaddubsw %%xmm4,%%xmm0 \n"
1033 "pmaddubsw %%xmm4,%%xmm2 \n"
1034 "pmaddubsw %%xmm3,%%xmm1 \n"
1035 "pmaddubsw %%xmm3,%%xmm6 \n"
1036 "phaddw %%xmm2,%%xmm0 \n"
1037 "phaddw %%xmm6,%%xmm1 \n"
1038 "psraw $0x8,%%xmm0 \n"
1039 "psraw $0x8,%%xmm1 \n"
1040 "packsswb %%xmm1,%%xmm0 \n"
1041 "paddb %%xmm5,%%xmm0 \n"
1042 "sub $0x10,%3 \n"
1043 "movlps %%xmm0,(%1) \n"
1044 "movhps %%xmm0,(%1,%2,1) \n"
1045 "lea 0x8(%1),%1 \n"
1046 "jg 1b \n"
1047 : "+r"(src_argb0), // %0
1048 "+r"(dst_u), // %1
1049 "+r"(dst_v), // %2
1050 "+rm"(width) // %3
1051 :
1052 : "memory", "cc"
1053#if defined(__SSE2__)
1054 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1055#endif
1056 );
1057}
1058
1059void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
1060 uint8* dst_u, uint8* dst_v, int width) {
1061 asm volatile (
1062 "movdqa %0,%%xmm4 \n"
1063 "movdqa %1,%%xmm3 \n"
1064 "movdqa %2,%%xmm5 \n"
1065 :
1066 : "m"(kARGBToU), // %0
1067 "m"(kARGBToV), // %1
1068 "m"(kAddUV128) // %2
1069 );
1070 asm volatile (
1071 "sub %1,%2 \n"
1072 ".p2align 4 \n"
1073 "1: \n"
1074 "movdqu (%0),%%xmm0 \n"
1075 "movdqu 0x10(%0),%%xmm1 \n"
1076 "movdqu 0x20(%0),%%xmm2 \n"
1077 "movdqu 0x30(%0),%%xmm6 \n"
1078 "lea 0x40(%0),%0 \n"
1079 "movdqa %%xmm0,%%xmm7 \n"
1080 "shufps $0x88,%%xmm1,%%xmm0 \n"
1081 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1082 "pavgb %%xmm7,%%xmm0 \n"
1083 "movdqa %%xmm2,%%xmm7 \n"
1084 "shufps $0x88,%%xmm6,%%xmm2 \n"
1085 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1086 "pavgb %%xmm7,%%xmm2 \n"
1087 "movdqa %%xmm0,%%xmm1 \n"
1088 "movdqa %%xmm2,%%xmm6 \n"
1089 "pmaddubsw %%xmm4,%%xmm0 \n"
1090 "pmaddubsw %%xmm4,%%xmm2 \n"
1091 "pmaddubsw %%xmm3,%%xmm1 \n"
1092 "pmaddubsw %%xmm3,%%xmm6 \n"
1093 "phaddw %%xmm2,%%xmm0 \n"
1094 "phaddw %%xmm6,%%xmm1 \n"
1095 "psraw $0x8,%%xmm0 \n"
1096 "psraw $0x8,%%xmm1 \n"
1097 "packsswb %%xmm1,%%xmm0 \n"
1098 "paddb %%xmm5,%%xmm0 \n"
1099 "sub $0x10,%3 \n"
1100 "movlps %%xmm0,(%1) \n"
1101 "movhps %%xmm0,(%1,%2,1) \n"
1102 "lea 0x8(%1),%1 \n"
1103 "jg 1b \n"
1104 : "+r"(src_argb0), // %0
1105 "+r"(dst_u), // %1
1106 "+r"(dst_v), // %2
1107 "+rm"(width) // %3
1108 :
1109 : "memory", "cc"
1110#if defined(__SSE2__)
1111 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1112#endif
1113 );
1114}
1115
fbarchard@google.com714050a2012-02-17 22:59:56 +00001116void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001117 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001118 "movdqa %4,%%xmm5 \n"
1119 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001120 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001121 "1: \n"
1122 "movdqa (%0),%%xmm0 \n"
1123 "movdqa 0x10(%0),%%xmm1 \n"
1124 "movdqa 0x20(%0),%%xmm2 \n"
1125 "movdqa 0x30(%0),%%xmm3 \n"
1126 "pmaddubsw %%xmm4,%%xmm0 \n"
1127 "pmaddubsw %%xmm4,%%xmm1 \n"
1128 "pmaddubsw %%xmm4,%%xmm2 \n"
1129 "pmaddubsw %%xmm4,%%xmm3 \n"
1130 "lea 0x40(%0),%0 \n"
1131 "phaddw %%xmm1,%%xmm0 \n"
1132 "phaddw %%xmm3,%%xmm2 \n"
1133 "psrlw $0x7,%%xmm0 \n"
1134 "psrlw $0x7,%%xmm2 \n"
1135 "packuswb %%xmm2,%%xmm0 \n"
1136 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001137 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001138 "movdqa %%xmm0,(%1) \n"
1139 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001140 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001141 : "+r"(src_bgra), // %0
1142 "+r"(dst_y), // %1
1143 "+r"(pix) // %2
1144 : "m"(kBGRAToY), // %3
1145 "m"(kAddY16) // %4
1146 : "memory", "cc"
1147#if defined(__SSE2__)
1148 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001149#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +00001150 );
1151}
1152
1153void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001154 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001155 "movdqa %4,%%xmm5 \n"
1156 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001157 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001158 "1: \n"
1159 "movdqu (%0),%%xmm0 \n"
1160 "movdqu 0x10(%0),%%xmm1 \n"
1161 "movdqu 0x20(%0),%%xmm2 \n"
1162 "movdqu 0x30(%0),%%xmm3 \n"
1163 "pmaddubsw %%xmm4,%%xmm0 \n"
1164 "pmaddubsw %%xmm4,%%xmm1 \n"
1165 "pmaddubsw %%xmm4,%%xmm2 \n"
1166 "pmaddubsw %%xmm4,%%xmm3 \n"
1167 "lea 0x40(%0),%0 \n"
1168 "phaddw %%xmm1,%%xmm0 \n"
1169 "phaddw %%xmm3,%%xmm2 \n"
1170 "psrlw $0x7,%%xmm0 \n"
1171 "psrlw $0x7,%%xmm2 \n"
1172 "packuswb %%xmm2,%%xmm0 \n"
1173 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001174 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001175 "movdqu %%xmm0,(%1) \n"
1176 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001177 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001178 : "+r"(src_bgra), // %0
1179 "+r"(dst_y), // %1
1180 "+r"(pix) // %2
1181 : "m"(kBGRAToY), // %3
1182 "m"(kAddY16) // %4
1183 : "memory", "cc"
1184#if defined(__SSE2__)
1185 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1186#endif
1187 );
1188}
1189
1190void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1191 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001192 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001193 "movdqa %0,%%xmm4 \n"
1194 "movdqa %1,%%xmm3 \n"
1195 "movdqa %2,%%xmm5 \n"
1196 :
1197 : "m"(kBGRAToU), // %0
1198 "m"(kBGRAToV), // %1
1199 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001200 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001201 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001202 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001203 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001204 "1: \n"
1205 "movdqa (%0),%%xmm0 \n"
1206 "movdqa 0x10(%0),%%xmm1 \n"
1207 "movdqa 0x20(%0),%%xmm2 \n"
1208 "movdqa 0x30(%0),%%xmm6 \n"
1209 "pavgb (%0,%4,1),%%xmm0 \n"
1210 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1211 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1212 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1213 "lea 0x40(%0),%0 \n"
1214 "movdqa %%xmm0,%%xmm7 \n"
1215 "shufps $0x88,%%xmm1,%%xmm0 \n"
1216 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1217 "pavgb %%xmm7,%%xmm0 \n"
1218 "movdqa %%xmm2,%%xmm7 \n"
1219 "shufps $0x88,%%xmm6,%%xmm2 \n"
1220 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1221 "pavgb %%xmm7,%%xmm2 \n"
1222 "movdqa %%xmm0,%%xmm1 \n"
1223 "movdqa %%xmm2,%%xmm6 \n"
1224 "pmaddubsw %%xmm4,%%xmm0 \n"
1225 "pmaddubsw %%xmm4,%%xmm2 \n"
1226 "pmaddubsw %%xmm3,%%xmm1 \n"
1227 "pmaddubsw %%xmm3,%%xmm6 \n"
1228 "phaddw %%xmm2,%%xmm0 \n"
1229 "phaddw %%xmm6,%%xmm1 \n"
1230 "psraw $0x8,%%xmm0 \n"
1231 "psraw $0x8,%%xmm1 \n"
1232 "packsswb %%xmm1,%%xmm0 \n"
1233 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001234 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001235 "movlps %%xmm0,(%1) \n"
1236 "movhps %%xmm0,(%1,%2,1) \n"
1237 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001238 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001239 : "+r"(src_bgra0), // %0
1240 "+r"(dst_u), // %1
1241 "+r"(dst_v), // %2
1242 "+rm"(width) // %3
1243 : "r"(static_cast<intptr_t>(src_stride_bgra))
1244 : "memory", "cc"
1245#if defined(__SSE2__)
1246 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1247#endif
1248 );
1249}
1250
1251void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1252 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001253 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001254 "movdqa %0,%%xmm4 \n"
1255 "movdqa %1,%%xmm3 \n"
1256 "movdqa %2,%%xmm5 \n"
1257 :
1258 : "m"(kBGRAToU), // %0
1259 "m"(kBGRAToV), // %1
1260 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001261 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001262 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001263 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001264 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001265 "1: \n"
1266 "movdqu (%0),%%xmm0 \n"
1267 "movdqu 0x10(%0),%%xmm1 \n"
1268 "movdqu 0x20(%0),%%xmm2 \n"
1269 "movdqu 0x30(%0),%%xmm6 \n"
1270 "movdqu (%0,%4,1),%%xmm7 \n"
1271 "pavgb %%xmm7,%%xmm0 \n"
1272 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1273 "pavgb %%xmm7,%%xmm1 \n"
1274 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1275 "pavgb %%xmm7,%%xmm2 \n"
1276 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1277 "pavgb %%xmm7,%%xmm6 \n"
1278 "lea 0x40(%0),%0 \n"
1279 "movdqa %%xmm0,%%xmm7 \n"
1280 "shufps $0x88,%%xmm1,%%xmm0 \n"
1281 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1282 "pavgb %%xmm7,%%xmm0 \n"
1283 "movdqa %%xmm2,%%xmm7 \n"
1284 "shufps $0x88,%%xmm6,%%xmm2 \n"
1285 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1286 "pavgb %%xmm7,%%xmm2 \n"
1287 "movdqa %%xmm0,%%xmm1 \n"
1288 "movdqa %%xmm2,%%xmm6 \n"
1289 "pmaddubsw %%xmm4,%%xmm0 \n"
1290 "pmaddubsw %%xmm4,%%xmm2 \n"
1291 "pmaddubsw %%xmm3,%%xmm1 \n"
1292 "pmaddubsw %%xmm3,%%xmm6 \n"
1293 "phaddw %%xmm2,%%xmm0 \n"
1294 "phaddw %%xmm6,%%xmm1 \n"
1295 "psraw $0x8,%%xmm0 \n"
1296 "psraw $0x8,%%xmm1 \n"
1297 "packsswb %%xmm1,%%xmm0 \n"
1298 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001299 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001300 "movlps %%xmm0,(%1) \n"
1301 "movhps %%xmm0,(%1,%2,1) \n"
1302 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001303 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001304 : "+r"(src_bgra0), // %0
1305 "+r"(dst_u), // %1
1306 "+r"(dst_v), // %2
1307 "+rm"(width) // %3
1308 : "r"(static_cast<intptr_t>(src_stride_bgra))
1309 : "memory", "cc"
1310#if defined(__SSE2__)
1311 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1312#endif
1313 );
1314}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001315
1316void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001317 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001318 "movdqa %4,%%xmm5 \n"
1319 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001320 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001321 "1: \n"
1322 "movdqa (%0),%%xmm0 \n"
1323 "movdqa 0x10(%0),%%xmm1 \n"
1324 "movdqa 0x20(%0),%%xmm2 \n"
1325 "movdqa 0x30(%0),%%xmm3 \n"
1326 "pmaddubsw %%xmm4,%%xmm0 \n"
1327 "pmaddubsw %%xmm4,%%xmm1 \n"
1328 "pmaddubsw %%xmm4,%%xmm2 \n"
1329 "pmaddubsw %%xmm4,%%xmm3 \n"
1330 "lea 0x40(%0),%0 \n"
1331 "phaddw %%xmm1,%%xmm0 \n"
1332 "phaddw %%xmm3,%%xmm2 \n"
1333 "psrlw $0x7,%%xmm0 \n"
1334 "psrlw $0x7,%%xmm2 \n"
1335 "packuswb %%xmm2,%%xmm0 \n"
1336 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001337 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001338 "movdqa %%xmm0,(%1) \n"
1339 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001340 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001341 : "+r"(src_abgr), // %0
1342 "+r"(dst_y), // %1
1343 "+r"(pix) // %2
1344 : "m"(kABGRToY), // %3
1345 "m"(kAddY16) // %4
1346 : "memory", "cc"
1347#if defined(__SSE2__)
1348 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1349#endif
1350 );
1351}
1352
1353void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001354 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001355 "movdqa %4,%%xmm5 \n"
1356 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001357 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001358 "1: \n"
1359 "movdqu (%0),%%xmm0 \n"
1360 "movdqu 0x10(%0),%%xmm1 \n"
1361 "movdqu 0x20(%0),%%xmm2 \n"
1362 "movdqu 0x30(%0),%%xmm3 \n"
1363 "pmaddubsw %%xmm4,%%xmm0 \n"
1364 "pmaddubsw %%xmm4,%%xmm1 \n"
1365 "pmaddubsw %%xmm4,%%xmm2 \n"
1366 "pmaddubsw %%xmm4,%%xmm3 \n"
1367 "lea 0x40(%0),%0 \n"
1368 "phaddw %%xmm1,%%xmm0 \n"
1369 "phaddw %%xmm3,%%xmm2 \n"
1370 "psrlw $0x7,%%xmm0 \n"
1371 "psrlw $0x7,%%xmm2 \n"
1372 "packuswb %%xmm2,%%xmm0 \n"
1373 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001374 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001375 "movdqu %%xmm0,(%1) \n"
1376 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001377 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001378 : "+r"(src_abgr), // %0
1379 "+r"(dst_y), // %1
1380 "+r"(pix) // %2
1381 : "m"(kABGRToY), // %3
1382 "m"(kAddY16) // %4
1383 : "memory", "cc"
1384#if defined(__SSE2__)
1385 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1386#endif
1387 );
1388}
1389
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001390void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1391 asm volatile (
1392 "movdqa %4,%%xmm5 \n"
1393 "movdqa %3,%%xmm4 \n"
1394 ".p2align 4 \n"
1395 "1: \n"
1396 "movdqa (%0),%%xmm0 \n"
1397 "movdqa 0x10(%0),%%xmm1 \n"
1398 "movdqa 0x20(%0),%%xmm2 \n"
1399 "movdqa 0x30(%0),%%xmm3 \n"
1400 "pmaddubsw %%xmm4,%%xmm0 \n"
1401 "pmaddubsw %%xmm4,%%xmm1 \n"
1402 "pmaddubsw %%xmm4,%%xmm2 \n"
1403 "pmaddubsw %%xmm4,%%xmm3 \n"
1404 "lea 0x40(%0),%0 \n"
1405 "phaddw %%xmm1,%%xmm0 \n"
1406 "phaddw %%xmm3,%%xmm2 \n"
1407 "psrlw $0x7,%%xmm0 \n"
1408 "psrlw $0x7,%%xmm2 \n"
1409 "packuswb %%xmm2,%%xmm0 \n"
1410 "paddb %%xmm5,%%xmm0 \n"
1411 "sub $0x10,%2 \n"
1412 "movdqa %%xmm0,(%1) \n"
1413 "lea 0x10(%1),%1 \n"
1414 "jg 1b \n"
1415 : "+r"(src_rgba), // %0
1416 "+r"(dst_y), // %1
1417 "+r"(pix) // %2
1418 : "m"(kRGBAToY), // %3
1419 "m"(kAddY16) // %4
1420 : "memory", "cc"
1421#if defined(__SSE2__)
1422 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1423#endif
1424 );
1425}
1426
1427void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1428 asm volatile (
1429 "movdqa %4,%%xmm5 \n"
1430 "movdqa %3,%%xmm4 \n"
1431 ".p2align 4 \n"
1432 "1: \n"
1433 "movdqu (%0),%%xmm0 \n"
1434 "movdqu 0x10(%0),%%xmm1 \n"
1435 "movdqu 0x20(%0),%%xmm2 \n"
1436 "movdqu 0x30(%0),%%xmm3 \n"
1437 "pmaddubsw %%xmm4,%%xmm0 \n"
1438 "pmaddubsw %%xmm4,%%xmm1 \n"
1439 "pmaddubsw %%xmm4,%%xmm2 \n"
1440 "pmaddubsw %%xmm4,%%xmm3 \n"
1441 "lea 0x40(%0),%0 \n"
1442 "phaddw %%xmm1,%%xmm0 \n"
1443 "phaddw %%xmm3,%%xmm2 \n"
1444 "psrlw $0x7,%%xmm0 \n"
1445 "psrlw $0x7,%%xmm2 \n"
1446 "packuswb %%xmm2,%%xmm0 \n"
1447 "paddb %%xmm5,%%xmm0 \n"
1448 "sub $0x10,%2 \n"
1449 "movdqu %%xmm0,(%1) \n"
1450 "lea 0x10(%1),%1 \n"
1451 "jg 1b \n"
1452 : "+r"(src_rgba), // %0
1453 "+r"(dst_y), // %1
1454 "+r"(pix) // %2
1455 : "m"(kRGBAToY), // %3
1456 "m"(kAddY16) // %4
1457 : "memory", "cc"
1458#if defined(__SSE2__)
1459 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1460#endif
1461 );
1462}
1463
fbarchard@google.com714050a2012-02-17 22:59:56 +00001464void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1465 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001466 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001467 "movdqa %0,%%xmm4 \n"
1468 "movdqa %1,%%xmm3 \n"
1469 "movdqa %2,%%xmm5 \n"
1470 :
1471 : "m"(kABGRToU), // %0
1472 "m"(kABGRToV), // %1
1473 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001474 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001475 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001476 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001477 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001478 "1: \n"
1479 "movdqa (%0),%%xmm0 \n"
1480 "movdqa 0x10(%0),%%xmm1 \n"
1481 "movdqa 0x20(%0),%%xmm2 \n"
1482 "movdqa 0x30(%0),%%xmm6 \n"
1483 "pavgb (%0,%4,1),%%xmm0 \n"
1484 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1485 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1486 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1487 "lea 0x40(%0),%0 \n"
1488 "movdqa %%xmm0,%%xmm7 \n"
1489 "shufps $0x88,%%xmm1,%%xmm0 \n"
1490 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1491 "pavgb %%xmm7,%%xmm0 \n"
1492 "movdqa %%xmm2,%%xmm7 \n"
1493 "shufps $0x88,%%xmm6,%%xmm2 \n"
1494 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1495 "pavgb %%xmm7,%%xmm2 \n"
1496 "movdqa %%xmm0,%%xmm1 \n"
1497 "movdqa %%xmm2,%%xmm6 \n"
1498 "pmaddubsw %%xmm4,%%xmm0 \n"
1499 "pmaddubsw %%xmm4,%%xmm2 \n"
1500 "pmaddubsw %%xmm3,%%xmm1 \n"
1501 "pmaddubsw %%xmm3,%%xmm6 \n"
1502 "phaddw %%xmm2,%%xmm0 \n"
1503 "phaddw %%xmm6,%%xmm1 \n"
1504 "psraw $0x8,%%xmm0 \n"
1505 "psraw $0x8,%%xmm1 \n"
1506 "packsswb %%xmm1,%%xmm0 \n"
1507 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001508 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001509 "movlps %%xmm0,(%1) \n"
1510 "movhps %%xmm0,(%1,%2,1) \n"
1511 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001512 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001513 : "+r"(src_abgr0), // %0
1514 "+r"(dst_u), // %1
1515 "+r"(dst_v), // %2
1516 "+rm"(width) // %3
1517 : "r"(static_cast<intptr_t>(src_stride_abgr))
1518 : "memory", "cc"
1519#if defined(__SSE2__)
1520 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1521#endif
1522 );
1523}
1524
1525void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1526 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001527 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001528 "movdqa %0,%%xmm4 \n"
1529 "movdqa %1,%%xmm3 \n"
1530 "movdqa %2,%%xmm5 \n"
1531 :
1532 : "m"(kABGRToU), // %0
1533 "m"(kABGRToV), // %1
1534 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001535 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001536 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001537 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001538 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001539 "1: \n"
1540 "movdqu (%0),%%xmm0 \n"
1541 "movdqu 0x10(%0),%%xmm1 \n"
1542 "movdqu 0x20(%0),%%xmm2 \n"
1543 "movdqu 0x30(%0),%%xmm6 \n"
1544 "movdqu (%0,%4,1),%%xmm7 \n"
1545 "pavgb %%xmm7,%%xmm0 \n"
1546 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1547 "pavgb %%xmm7,%%xmm1 \n"
1548 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1549 "pavgb %%xmm7,%%xmm2 \n"
1550 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1551 "pavgb %%xmm7,%%xmm6 \n"
1552 "lea 0x40(%0),%0 \n"
1553 "movdqa %%xmm0,%%xmm7 \n"
1554 "shufps $0x88,%%xmm1,%%xmm0 \n"
1555 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1556 "pavgb %%xmm7,%%xmm0 \n"
1557 "movdqa %%xmm2,%%xmm7 \n"
1558 "shufps $0x88,%%xmm6,%%xmm2 \n"
1559 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1560 "pavgb %%xmm7,%%xmm2 \n"
1561 "movdqa %%xmm0,%%xmm1 \n"
1562 "movdqa %%xmm2,%%xmm6 \n"
1563 "pmaddubsw %%xmm4,%%xmm0 \n"
1564 "pmaddubsw %%xmm4,%%xmm2 \n"
1565 "pmaddubsw %%xmm3,%%xmm1 \n"
1566 "pmaddubsw %%xmm3,%%xmm6 \n"
1567 "phaddw %%xmm2,%%xmm0 \n"
1568 "phaddw %%xmm6,%%xmm1 \n"
1569 "psraw $0x8,%%xmm0 \n"
1570 "psraw $0x8,%%xmm1 \n"
1571 "packsswb %%xmm1,%%xmm0 \n"
1572 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001573 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001574 "movlps %%xmm0,(%1) \n"
1575 "movhps %%xmm0,(%1,%2,1) \n"
1576 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001577 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001578 : "+r"(src_abgr0), // %0
1579 "+r"(dst_u), // %1
1580 "+r"(dst_v), // %2
1581 "+rm"(width) // %3
1582 : "r"(static_cast<intptr_t>(src_stride_abgr))
1583 : "memory", "cc"
1584#if defined(__SSE2__)
1585 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1586#endif
1587 );
1588}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001589
1590void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1591 uint8* dst_u, uint8* dst_v, int width) {
1592 asm volatile (
1593 "movdqa %0,%%xmm4 \n"
1594 "movdqa %1,%%xmm3 \n"
1595 "movdqa %2,%%xmm5 \n"
1596 :
1597 : "m"(kRGBAToU), // %0
1598 "m"(kRGBAToV), // %1
1599 "m"(kAddUV128) // %2
1600 );
1601 asm volatile (
1602 "sub %1,%2 \n"
1603 ".p2align 4 \n"
1604 "1: \n"
1605 "movdqa (%0),%%xmm0 \n"
1606 "movdqa 0x10(%0),%%xmm1 \n"
1607 "movdqa 0x20(%0),%%xmm2 \n"
1608 "movdqa 0x30(%0),%%xmm6 \n"
1609 "pavgb (%0,%4,1),%%xmm0 \n"
1610 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1611 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1612 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1613 "lea 0x40(%0),%0 \n"
1614 "movdqa %%xmm0,%%xmm7 \n"
1615 "shufps $0x88,%%xmm1,%%xmm0 \n"
1616 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1617 "pavgb %%xmm7,%%xmm0 \n"
1618 "movdqa %%xmm2,%%xmm7 \n"
1619 "shufps $0x88,%%xmm6,%%xmm2 \n"
1620 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1621 "pavgb %%xmm7,%%xmm2 \n"
1622 "movdqa %%xmm0,%%xmm1 \n"
1623 "movdqa %%xmm2,%%xmm6 \n"
1624 "pmaddubsw %%xmm4,%%xmm0 \n"
1625 "pmaddubsw %%xmm4,%%xmm2 \n"
1626 "pmaddubsw %%xmm3,%%xmm1 \n"
1627 "pmaddubsw %%xmm3,%%xmm6 \n"
1628 "phaddw %%xmm2,%%xmm0 \n"
1629 "phaddw %%xmm6,%%xmm1 \n"
1630 "psraw $0x8,%%xmm0 \n"
1631 "psraw $0x8,%%xmm1 \n"
1632 "packsswb %%xmm1,%%xmm0 \n"
1633 "paddb %%xmm5,%%xmm0 \n"
1634 "sub $0x10,%3 \n"
1635 "movlps %%xmm0,(%1) \n"
1636 "movhps %%xmm0,(%1,%2,1) \n"
1637 "lea 0x8(%1),%1 \n"
1638 "jg 1b \n"
1639 : "+r"(src_rgba0), // %0
1640 "+r"(dst_u), // %1
1641 "+r"(dst_v), // %2
1642 "+rm"(width) // %3
1643 : "r"(static_cast<intptr_t>(src_stride_rgba))
1644 : "memory", "cc"
1645#if defined(__SSE2__)
1646 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1647#endif
1648 );
1649}
1650
1651void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1652 uint8* dst_u, uint8* dst_v, int width) {
1653 asm volatile (
1654 "movdqa %0,%%xmm4 \n"
1655 "movdqa %1,%%xmm3 \n"
1656 "movdqa %2,%%xmm5 \n"
1657 :
1658 : "m"(kRGBAToU), // %0
1659 "m"(kRGBAToV), // %1
1660 "m"(kAddUV128) // %2
1661 );
1662 asm volatile (
1663 "sub %1,%2 \n"
1664 ".p2align 4 \n"
1665 "1: \n"
1666 "movdqu (%0),%%xmm0 \n"
1667 "movdqu 0x10(%0),%%xmm1 \n"
1668 "movdqu 0x20(%0),%%xmm2 \n"
1669 "movdqu 0x30(%0),%%xmm6 \n"
1670 "movdqu (%0,%4,1),%%xmm7 \n"
1671 "pavgb %%xmm7,%%xmm0 \n"
1672 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1673 "pavgb %%xmm7,%%xmm1 \n"
1674 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1675 "pavgb %%xmm7,%%xmm2 \n"
1676 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1677 "pavgb %%xmm7,%%xmm6 \n"
1678 "lea 0x40(%0),%0 \n"
1679 "movdqa %%xmm0,%%xmm7 \n"
1680 "shufps $0x88,%%xmm1,%%xmm0 \n"
1681 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1682 "pavgb %%xmm7,%%xmm0 \n"
1683 "movdqa %%xmm2,%%xmm7 \n"
1684 "shufps $0x88,%%xmm6,%%xmm2 \n"
1685 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1686 "pavgb %%xmm7,%%xmm2 \n"
1687 "movdqa %%xmm0,%%xmm1 \n"
1688 "movdqa %%xmm2,%%xmm6 \n"
1689 "pmaddubsw %%xmm4,%%xmm0 \n"
1690 "pmaddubsw %%xmm4,%%xmm2 \n"
1691 "pmaddubsw %%xmm3,%%xmm1 \n"
1692 "pmaddubsw %%xmm3,%%xmm6 \n"
1693 "phaddw %%xmm2,%%xmm0 \n"
1694 "phaddw %%xmm6,%%xmm1 \n"
1695 "psraw $0x8,%%xmm0 \n"
1696 "psraw $0x8,%%xmm1 \n"
1697 "packsswb %%xmm1,%%xmm0 \n"
1698 "paddb %%xmm5,%%xmm0 \n"
1699 "sub $0x10,%3 \n"
1700 "movlps %%xmm0,(%1) \n"
1701 "movhps %%xmm0,(%1,%2,1) \n"
1702 "lea 0x8(%1),%1 \n"
1703 "jg 1b \n"
1704 : "+r"(src_rgba0), // %0
1705 "+r"(dst_u), // %1
1706 "+r"(dst_v), // %2
1707 "+rm"(width) // %3
1708 : "r"(static_cast<intptr_t>(src_stride_rgba))
1709 : "memory", "cc"
1710#if defined(__SSE2__)
1711 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1712#endif
1713 );
1714}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001715#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001716
fbarchard@google.come214fe32012-06-04 23:47:11 +00001717#ifdef HAS_I422TOARGBROW_SSSE3
// YUV-to-RGB coefficients in fixed point, scaled by 64 (see the formulas in
// the per-macro comments). Every macro that expands to a negative literal or
// a compound expression is parenthesized so it stays a single operand when
// used inside a larger expression (e.g. -BG or BG * 2).
#define UB 127 /* min(127, static_cast<int>(2.018 * 64)): clamped to int8 max */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V coefficients of each channel, each multiplied by 128.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001732
// Table of YUV-to-RGB conversion constants, laid out as consecutive 16-byte
// vectors. The trailing number on each member is its byte offset within the
// struct; the YUVTORGB asm addresses members by these offsets relative to
// %[kYuvConstants] (e.g. 16(%[kYuvConstants])), so member order and size must
// not change. kUVTo* hold coefficients interleaved as U,V pairs; kVUTo* hold
// the same coefficients in V,U order.
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001733struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001734 vec8 kUVToB; // 0
1735 vec8 kUVToG; // 16
1736 vec8 kUVToR; // 32
1737 vec16 kUVBiasB; // 48
1738 vec16 kUVBiasG; // 64
1739 vec16 kUVBiasR; // 80
1740 vec16 kYSub16; // 96
1741 vec16 kYToRgb; // 112
1742 vec8 kVUToB; // 128
1743 vec8 kVUToG; // 144
1744 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001745} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001746 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1747 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1748 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1749 { BB, BB, BB, BB, BB, BB, BB, BB },
1750 { BG, BG, BG, BG, BG, BG, BG, BG },
1751 { BR, BR, BR, BR, BR, BR, BR, BR },
1752 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001753 { YG, YG, YG, YG, YG, YG, YG, YG },
1754 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1755 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1756 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001757};
1758
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001759
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001760// Read 8 UV from 444
// Loads 8 bytes at u_buf into xmm0 and 8 bytes at u_buf + v_buf into xmm1
// (v_buf is used as an index relative to u_buf), advances u_buf by 8, and
// interleaves the bytes so xmm0 holds 8 U,V pairs.
// NOTE(review): presumably the caller has already subtracted u_buf from
// v_buf before the loop - confirm at the use sites.
1761#define READYUV444 \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001762 "movq (%[u_buf]),%%xmm0 \n" \
1763 "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
1764 "lea 0x8(%[u_buf]),%[u_buf] \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001765 "punpcklbw %%xmm1,%%xmm0 \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001766
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 U bytes and 4 V bytes, interleaves them into 4 UV pairs, then
// duplicates each pair (punpcklwd with itself) so xmm0 holds 8 UV pairs.
// Advances %[u_buf] by 4. %[v_buf] is used as an index register added to
// %[u_buf], so it must hold the byte distance from the U data to the V data.
// Clobbers xmm1.
#define READYUV422                                                             \
    "movd       (%[u_buf]),%%xmm0              \n"                             \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x4(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"
// Read 2 UV from 411, upsample to 8 UV.
// Interleaves U and V bytes into UV pairs, then duplicates twice (punpcklwd,
// punpckldq) so each of the first two UV pairs is repeated 4 times in xmm0.
// Advances %[u_buf] by 2. %[v_buf] is used as an index register added to
// %[u_buf], so it must hold the byte distance from the U data to the V data.
// Clobbers xmm1.
// NOTE(review): each movd loads 4 bytes although only the first 2 are
// consumed and the pointer advances by 2, so the final iteration reads 2 bytes
// beyond the last chroma sample used - confirm the buffers tolerate that.
#define READYUV411                                                             \
    "movd       (%[u_buf]),%%xmm0              \n"                             \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x2(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
    "punpckldq  %%xmm0,%%xmm0                  \n"
// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 bytes of already-interleaved chroma (4 two-byte pairs) from
// %[uv_buf], duplicates each pair so xmm0 holds 8 pairs, and advances
// %[uv_buf] by 8.
#define READNV12                                                               \
    "movq       (%[uv_buf]),%%xmm0             \n"                             \
    "lea        0x8(%[uv_buf]),%[uv_buf]       \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"
// Convert 8 pixels: 8 UV and 8 Y.
// On entry: xmm0 holds 8 interleaved U,V byte pairs, xmm4 must be zero (it is
// used to widen Y bytes to 16 bits) and %[kYuvConstants] points at the start
// of the constant table.
// On exit: the low 8 bytes of xmm0, xmm1 and xmm2 hold the results computed
// with the table entries at offsets 0, 16 and 32 (kUVToB, kUVToG, kUVToR)
// respectively, packed to bytes with unsigned saturation. %[y_buf] is
// advanced by 8. Clobbers xmm3.
// Per channel: ((U,V dot coefficients) - bias + (Y - 16) * YG) >> 6.
#define YUVTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                  \n"                             \
    "movdqa     %%xmm0,%%xmm2                  \n"                             \
    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n"                             \
    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n"                             \
    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n"                             \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
    "movq       (%[y_buf]),%%xmm3              \n"                             \
    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
    "paddsw     %%xmm3,%%xmm0                  \n"                             \
    "paddsw     %%xmm3,%%xmm1                  \n"                             \
    "paddsw     %%xmm3,%%xmm2                  \n"                             \
    "psraw      $0x6,%%xmm0                    \n"                             \
    "psraw      $0x6,%%xmm1                    \n"                             \
    "psraw      $0x6,%%xmm2                    \n"                             \
    "packuswb   %%xmm0,%%xmm0                  \n"                             \
    "packuswb   %%xmm1,%%xmm1                  \n"                             \
    "packuswb   %%xmm2,%%xmm2                  \n"
// Convert 8 pixels: 8 VU and 8 Y.
// Identical to the UV variant except that the multiply-add coefficients are
// taken from table offsets 128, 144 and 160 (kVUToB, kVUToG, kVUToR), whose
// entries are stored in V,U order.
// On entry: xmm0 holds 8 interleaved V,U byte pairs, xmm4 must be zero and
// %[kYuvConstants] points at the start of the constant table.
// On exit: the low 8 bytes of xmm0, xmm1 and xmm2 hold the packed results for
// the B, G and R coefficient sets respectively. %[y_buf] is advanced by 8.
// Clobbers xmm3.
#define YVUTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                  \n"                             \
    "movdqa     %%xmm0,%%xmm2                  \n"                             \
    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n"                             \
    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n"                             \
    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n"                             \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
    "movq       (%[y_buf]),%%xmm3              \n"                             \
    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
    "paddsw     %%xmm3,%%xmm0                  \n"                             \
    "paddsw     %%xmm3,%%xmm1                  \n"                             \
    "paddsw     %%xmm3,%%xmm2                  \n"                             \
    "psraw      $0x6,%%xmm0                    \n"                             \
    "psraw      $0x6,%%xmm1                    \n"                             \
    "psraw      $0x6,%%xmm2                    \n"                             \
    "packuswb   %%xmm0,%%xmm0                  \n"                             \
    "packuswb   %%xmm1,%%xmm1                  \n"                             \
    "packuswb   %%xmm2,%%xmm2                  \n"
1840void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001841 const uint8* u_buf,
1842 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001843 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001844 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001845 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001846 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001847 "pcmpeqb %%xmm5,%%xmm5 \n"
1848 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001849 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001850 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001851 READYUV444
1852 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001853 "punpcklbw %%xmm1,%%xmm0 \n"
1854 "punpcklbw %%xmm5,%%xmm2 \n"
1855 "movdqa %%xmm0,%%xmm1 \n"
1856 "punpcklwd %%xmm2,%%xmm0 \n"
1857 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001858 "movdqa %%xmm0,(%[dst_argb]) \n"
1859 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
1860 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001861 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001862 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001863 : [y_buf]"+r"(y_buf), // %[y_buf]
1864 [u_buf]"+r"(u_buf), // %[u_buf]
1865 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001866 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001867 [width]"+rm"(width) // %[width]
1868 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001869 : "memory", "cc"
1870#if defined(__SSE2__)
1871 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1872#endif
1873 );
1874}
1875
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001876void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1877 const uint8* u_buf,
1878 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001879 uint8* dst_rgb24,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001880 int width) {
1881// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
1882#ifdef __APPLE__
1883 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001884 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1885 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1886 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1887 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001888#endif
1889
1890 asm volatile (
1891#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001892 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1893 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001894#endif
1895 "sub %[u_buf],%[v_buf] \n"
1896 "pxor %%xmm4,%%xmm4 \n"
1897 ".p2align 4 \n"
1898 "1: \n"
1899 READYUV422
1900 YUVTORGB
1901 "punpcklbw %%xmm1,%%xmm0 \n"
1902 "punpcklbw %%xmm2,%%xmm2 \n"
1903 "movdqa %%xmm0,%%xmm1 \n"
1904 "punpcklwd %%xmm2,%%xmm0 \n"
1905 "punpckhwd %%xmm2,%%xmm1 \n"
1906 "pshufb %%xmm5,%%xmm0 \n"
1907 "pshufb %%xmm6,%%xmm1 \n"
1908 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001909 "movq %%xmm0,(%[dst_rgb24]) \n"
1910 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
1911 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001912 "sub $0x8,%[width] \n"
1913 "jg 1b \n"
1914 : [y_buf]"+r"(y_buf), // %[y_buf]
1915 [u_buf]"+r"(u_buf), // %[u_buf]
1916 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001917 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001918 [width]"+rm"(width) // %[width]
1919 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1920#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001921 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1922 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001923#endif
1924 : "memory", "cc"
1925#if defined(__SSE2__)
1926 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1927#endif
1928 );
1929}
1930
1931void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1932 const uint8* u_buf,
1933 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001934 uint8* dst_raw,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001935 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001936// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001937#ifdef __APPLE__
1938 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001939 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1940 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1941 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1942 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001943#endif
1944
1945 asm volatile (
1946#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001947 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1948 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001949#endif
1950 "sub %[u_buf],%[v_buf] \n"
1951 "pxor %%xmm4,%%xmm4 \n"
1952 ".p2align 4 \n"
1953 "1: \n"
1954 READYUV422
1955 YUVTORGB
1956 "punpcklbw %%xmm1,%%xmm0 \n"
1957 "punpcklbw %%xmm2,%%xmm2 \n"
1958 "movdqa %%xmm0,%%xmm1 \n"
1959 "punpcklwd %%xmm2,%%xmm0 \n"
1960 "punpckhwd %%xmm2,%%xmm1 \n"
1961 "pshufb %%xmm5,%%xmm0 \n"
1962 "pshufb %%xmm6,%%xmm1 \n"
1963 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001964 "movq %%xmm0,(%[dst_raw]) \n"
1965 "movdqu %%xmm1,0x8(%[dst_raw]) \n"
1966 "lea 0x18(%[dst_raw]),%[dst_raw] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001967 "sub $0x8,%[width] \n"
1968 "jg 1b \n"
1969 : [y_buf]"+r"(y_buf), // %[y_buf]
1970 [u_buf]"+r"(u_buf), // %[u_buf]
1971 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001972 [dst_raw]"+r"(dst_raw), // %[dst_raw]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001973 [width]"+rm"(width) // %[width]
1974 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1975#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001976 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1977 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001978#endif
1979 : "memory", "cc"
1980#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001981 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001982#endif
1983 );
1984}
1985
fbarchard@google.come214fe32012-06-04 23:47:11 +00001986void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001987 const uint8* u_buf,
1988 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001989 uint8* dst_argb,
fbarchard@google.comdbcabea2012-10-29 21:20:25 +00001990 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001991 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001992 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001993 "pcmpeqb %%xmm5,%%xmm5 \n"
1994 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001995 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001996 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001997 READYUV422
1998 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001999 "punpcklbw %%xmm1,%%xmm0 \n"
2000 "punpcklbw %%xmm5,%%xmm2 \n"
2001 "movdqa %%xmm0,%%xmm1 \n"
2002 "punpcklwd %%xmm2,%%xmm0 \n"
2003 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002004 "movdqa %%xmm0,(%[dst_argb]) \n"
2005 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2006 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002007 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002008 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002009 : [y_buf]"+r"(y_buf), // %[y_buf]
2010 [u_buf]"+r"(u_buf), // %[u_buf]
2011 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002012 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002013 [width]"+rm"(width) // %[width]
2014 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002015 : "memory", "cc"
2016#if defined(__SSE2__)
2017 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2018#endif
2019 );
2020}
2021
2022void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
2023 const uint8* u_buf,
2024 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002025 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002026 int width) {
2027 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002028 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002029 "pcmpeqb %%xmm5,%%xmm5 \n"
2030 "pxor %%xmm4,%%xmm4 \n"
2031 ".p2align 4 \n"
2032 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002033 READYUV411
2034 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002035 "punpcklbw %%xmm1,%%xmm0 \n"
2036 "punpcklbw %%xmm5,%%xmm2 \n"
2037 "movdqa %%xmm0,%%xmm1 \n"
2038 "punpcklwd %%xmm2,%%xmm0 \n"
2039 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002040 "movdqa %%xmm0,(%[dst_argb]) \n"
2041 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2042 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002043 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002044 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002045 : [y_buf]"+r"(y_buf), // %[y_buf]
2046 [u_buf]"+r"(u_buf), // %[u_buf]
2047 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002048 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002049 [width]"+rm"(width) // %[width]
2050 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2051 : "memory", "cc"
2052#if defined(__SSE2__)
2053 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2054#endif
2055 );
2056}
2057
2058void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
2059 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002060 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002061 int width) {
2062 asm volatile (
2063 "pcmpeqb %%xmm5,%%xmm5 \n"
2064 "pxor %%xmm4,%%xmm4 \n"
2065 ".p2align 4 \n"
2066 "1: \n"
2067 READNV12
2068 YUVTORGB
2069 "punpcklbw %%xmm1,%%xmm0 \n"
2070 "punpcklbw %%xmm5,%%xmm2 \n"
2071 "movdqa %%xmm0,%%xmm1 \n"
2072 "punpcklwd %%xmm2,%%xmm0 \n"
2073 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002074 "movdqa %%xmm0,(%[dst_argb]) \n"
2075 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2076 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002077 "sub $0x8,%[width] \n"
2078 "jg 1b \n"
2079 : [y_buf]"+r"(y_buf), // %[y_buf]
2080 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002081 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002082 [width]"+rm"(width) // %[width]
2083 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2084 : "memory", "cc"
2085#if defined(__SSE2__)
2086 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2087#endif
2088 );
2089}
2090
2091void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002092 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002093 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002094 int width) {
2095 asm volatile (
2096 "pcmpeqb %%xmm5,%%xmm5 \n"
2097 "pxor %%xmm4,%%xmm4 \n"
2098 ".p2align 4 \n"
2099 "1: \n"
2100 READNV12
2101 YVUTORGB
2102 "punpcklbw %%xmm1,%%xmm0 \n"
2103 "punpcklbw %%xmm5,%%xmm2 \n"
2104 "movdqa %%xmm0,%%xmm1 \n"
2105 "punpcklwd %%xmm2,%%xmm0 \n"
2106 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002107 "movdqa %%xmm0,(%[dst_argb]) \n"
2108 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2109 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002110 "sub $0x8,%[width] \n"
2111 "jg 1b \n"
2112 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002113 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2114 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002115 [width]"+rm"(width) // %[width]
2116 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002117 : "memory", "cc"
2118#if defined(__SSE2__)
2119 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2120#endif
2121 );
2122}
2123
2124void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2125 const uint8* u_buf,
2126 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002127 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002128 int width) {
2129 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002130 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002131 "pcmpeqb %%xmm5,%%xmm5 \n"
2132 "pxor %%xmm4,%%xmm4 \n"
2133 ".p2align 4 \n"
2134 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002135 READYUV444
2136 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002137 "punpcklbw %%xmm1,%%xmm0 \n"
2138 "punpcklbw %%xmm5,%%xmm2 \n"
2139 "movdqa %%xmm0,%%xmm1 \n"
2140 "punpcklwd %%xmm2,%%xmm0 \n"
2141 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002142 "movdqu %%xmm0,(%[dst_argb]) \n"
2143 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2144 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002145 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002146 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002147 : [y_buf]"+r"(y_buf), // %[y_buf]
2148 [u_buf]"+r"(u_buf), // %[u_buf]
2149 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002150 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002151 [width]"+rm"(width) // %[width]
2152 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002153 : "memory", "cc"
2154#if defined(__SSE2__)
2155 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2156#endif
2157 );
2158}
2159
2160void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2161 const uint8* u_buf,
2162 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002163 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002164 int width) {
2165 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002166 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002167 "pcmpeqb %%xmm5,%%xmm5 \n"
2168 "pxor %%xmm4,%%xmm4 \n"
2169 ".p2align 4 \n"
2170 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002171 READYUV422
2172 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002173 "punpcklbw %%xmm1,%%xmm0 \n"
2174 "punpcklbw %%xmm5,%%xmm2 \n"
2175 "movdqa %%xmm0,%%xmm1 \n"
2176 "punpcklwd %%xmm2,%%xmm0 \n"
2177 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002178 "movdqu %%xmm0,(%[dst_argb]) \n"
2179 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2180 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002181 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002182 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002183 : [y_buf]"+r"(y_buf), // %[y_buf]
2184 [u_buf]"+r"(u_buf), // %[u_buf]
2185 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002186 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002187 [width]"+rm"(width) // %[width]
2188 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002189 : "memory", "cc"
2190#if defined(__SSE2__)
2191 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2192#endif
2193 );
2194}
2195
2196void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2197 const uint8* u_buf,
2198 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002199 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002200 int width) {
2201 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002202 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002203 "pcmpeqb %%xmm5,%%xmm5 \n"
2204 "pxor %%xmm4,%%xmm4 \n"
2205 ".p2align 4 \n"
2206 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002207 READYUV411
2208 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002209 "punpcklbw %%xmm1,%%xmm0 \n"
2210 "punpcklbw %%xmm5,%%xmm2 \n"
2211 "movdqa %%xmm0,%%xmm1 \n"
2212 "punpcklwd %%xmm2,%%xmm0 \n"
2213 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002214 "movdqu %%xmm0,(%[dst_argb]) \n"
2215 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2216 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002217 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002218 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002219 : [y_buf]"+r"(y_buf), // %[y_buf]
2220 [u_buf]"+r"(u_buf), // %[u_buf]
2221 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002222 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002223 [width]"+rm"(width) // %[width]
2224 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2225 : "memory", "cc"
2226#if defined(__SSE2__)
2227 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2228#endif
2229 );
2230}
2231
2232void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2233 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002234 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002235 int width) {
2236 asm volatile (
2237 "pcmpeqb %%xmm5,%%xmm5 \n"
2238 "pxor %%xmm4,%%xmm4 \n"
2239 ".p2align 4 \n"
2240 "1: \n"
2241 READNV12
2242 YUVTORGB
2243 "punpcklbw %%xmm1,%%xmm0 \n"
2244 "punpcklbw %%xmm5,%%xmm2 \n"
2245 "movdqa %%xmm0,%%xmm1 \n"
2246 "punpcklwd %%xmm2,%%xmm0 \n"
2247 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002248 "movdqu %%xmm0,(%[dst_argb]) \n"
2249 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2250 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002251 "sub $0x8,%[width] \n"
2252 "jg 1b \n"
2253 : [y_buf]"+r"(y_buf), // %[y_buf]
2254 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002255 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002256 [width]"+rm"(width) // %[width]
2257 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2258 : "memory", "cc"
2259#if defined(__SSE2__)
2260 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2261#endif
2262 );
2263}
2264
2265void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002266 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002267 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002268 int width) {
2269 asm volatile (
2270 "pcmpeqb %%xmm5,%%xmm5 \n"
2271 "pxor %%xmm4,%%xmm4 \n"
2272 ".p2align 4 \n"
2273 "1: \n"
2274 READNV12
2275 YVUTORGB
2276 "punpcklbw %%xmm1,%%xmm0 \n"
2277 "punpcklbw %%xmm5,%%xmm2 \n"
2278 "movdqa %%xmm0,%%xmm1 \n"
2279 "punpcklwd %%xmm2,%%xmm0 \n"
2280 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002281 "movdqu %%xmm0,(%[dst_argb]) \n"
2282 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2283 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002284 "sub $0x8,%[width] \n"
2285 "jg 1b \n"
2286 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002287 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2288 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002289 [width]"+rm"(width) // %[width]
2290 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002291 : "memory", "cc"
2292#if defined(__SSE2__)
2293 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2294#endif
2295 );
2296}
2297
2298void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2299 const uint8* u_buf,
2300 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002301 uint8* dst_bgra,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002302 int width) {
2303 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002304 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002305 "pcmpeqb %%xmm5,%%xmm5 \n"
2306 "pxor %%xmm4,%%xmm4 \n"
2307 ".p2align 4 \n"
2308 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002309 READYUV422
2310 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002311 "pcmpeqb %%xmm5,%%xmm5 \n"
2312 "punpcklbw %%xmm0,%%xmm1 \n"
2313 "punpcklbw %%xmm2,%%xmm5 \n"
2314 "movdqa %%xmm5,%%xmm0 \n"
2315 "punpcklwd %%xmm1,%%xmm5 \n"
2316 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002317 "movdqa %%xmm5,(%[dst_bgra]) \n"
2318 "movdqa %%xmm0,0x10(%[dst_bgra]) \n"
2319 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002320 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002321 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002322 : [y_buf]"+r"(y_buf), // %[y_buf]
2323 [u_buf]"+r"(u_buf), // %[u_buf]
2324 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002325 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002326 [width]"+rm"(width) // %[width]
2327 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002328 : "memory", "cc"
2329#if defined(__SSE2__)
2330 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2331#endif
2332 );
2333}
2334
fbarchard@google.come214fe32012-06-04 23:47:11 +00002335void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002336 const uint8* u_buf,
2337 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002338 uint8* dst_abgr,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002339 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002340 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002341 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002342 "pcmpeqb %%xmm5,%%xmm5 \n"
2343 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002344 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002345 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002346 READYUV422
2347 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002348 "punpcklbw %%xmm1,%%xmm2 \n"
2349 "punpcklbw %%xmm5,%%xmm0 \n"
2350 "movdqa %%xmm2,%%xmm1 \n"
2351 "punpcklwd %%xmm0,%%xmm2 \n"
2352 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002353 "movdqa %%xmm2,(%[dst_abgr]) \n"
2354 "movdqa %%xmm1,0x10(%[dst_abgr]) \n"
2355 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002356 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002357 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002358 : [y_buf]"+r"(y_buf), // %[y_buf]
2359 [u_buf]"+r"(u_buf), // %[u_buf]
2360 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002361 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002362 [width]"+rm"(width) // %[width]
2363 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002364 : "memory", "cc"
2365#if defined(__SSE2__)
2366 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2367#endif
2368 );
2369}
2370
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002371void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2372 const uint8* u_buf,
2373 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002374 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002375 int width) {
2376 asm volatile (
2377 "sub %[u_buf],%[v_buf] \n"
2378 "pcmpeqb %%xmm5,%%xmm5 \n"
2379 "pxor %%xmm4,%%xmm4 \n"
2380 ".p2align 4 \n"
2381 "1: \n"
2382 READYUV422
2383 YUVTORGB
2384 "pcmpeqb %%xmm5,%%xmm5 \n"
2385 "punpcklbw %%xmm2,%%xmm1 \n"
2386 "punpcklbw %%xmm0,%%xmm5 \n"
2387 "movdqa %%xmm5,%%xmm0 \n"
2388 "punpcklwd %%xmm1,%%xmm5 \n"
2389 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002390 "movdqa %%xmm5,(%[dst_rgba]) \n"
2391 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2392 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002393 "sub $0x8,%[width] \n"
2394 "jg 1b \n"
2395 : [y_buf]"+r"(y_buf), // %[y_buf]
2396 [u_buf]"+r"(u_buf), // %[u_buf]
2397 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002398 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002399 [width]"+rm"(width) // %[width]
2400 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2401 : "memory", "cc"
2402#if defined(__SSE2__)
2403 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2404#endif
2405 );
2406}
2407
fbarchard@google.come214fe32012-06-04 23:47:11 +00002408void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002409 const uint8* u_buf,
2410 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002411 uint8* dst_bgra,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002412 int width) {
2413 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002414 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002415 "pcmpeqb %%xmm5,%%xmm5 \n"
2416 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002417 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002418 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002419 READYUV422
2420 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002421 "pcmpeqb %%xmm5,%%xmm5 \n"
2422 "punpcklbw %%xmm0,%%xmm1 \n"
2423 "punpcklbw %%xmm2,%%xmm5 \n"
2424 "movdqa %%xmm5,%%xmm0 \n"
2425 "punpcklwd %%xmm1,%%xmm5 \n"
2426 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002427 "movdqu %%xmm5,(%[dst_bgra]) \n"
2428 "movdqu %%xmm0,0x10(%[dst_bgra]) \n"
2429 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002430 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002431 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002432 : [y_buf]"+r"(y_buf), // %[y_buf]
2433 [u_buf]"+r"(u_buf), // %[u_buf]
2434 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002435 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002436 [width]"+rm"(width) // %[width]
2437 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002438 : "memory", "cc"
2439#if defined(__SSE2__)
2440 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2441#endif
2442 );
2443}
2444
fbarchard@google.come214fe32012-06-04 23:47:11 +00002445void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002446 const uint8* u_buf,
2447 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002448 uint8* dst_abgr,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002449 int width) {
2450 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002451 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002452 "pcmpeqb %%xmm5,%%xmm5 \n"
2453 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002454 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002455 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002456 READYUV422
2457 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002458 "punpcklbw %%xmm1,%%xmm2 \n"
2459 "punpcklbw %%xmm5,%%xmm0 \n"
2460 "movdqa %%xmm2,%%xmm1 \n"
2461 "punpcklwd %%xmm0,%%xmm2 \n"
2462 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002463 "movdqu %%xmm2,(%[dst_abgr]) \n"
2464 "movdqu %%xmm1,0x10(%[dst_abgr]) \n"
2465 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002466 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002467 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002468 : [y_buf]"+r"(y_buf), // %[y_buf]
2469 [u_buf]"+r"(u_buf), // %[u_buf]
2470 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002471 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002472 [width]"+rm"(width) // %[width]
2473 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002474 : "memory", "cc"
2475#if defined(__SSE2__)
2476 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2477#endif
2478 );
2479}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002480
2481void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2482 const uint8* u_buf,
2483 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002484 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002485 int width) {
2486 asm volatile (
2487 "sub %[u_buf],%[v_buf] \n"
2488 "pcmpeqb %%xmm5,%%xmm5 \n"
2489 "pxor %%xmm4,%%xmm4 \n"
2490 ".p2align 4 \n"
2491 "1: \n"
2492 READYUV422
2493 YUVTORGB
2494 "pcmpeqb %%xmm5,%%xmm5 \n"
2495 "punpcklbw %%xmm2,%%xmm1 \n"
2496 "punpcklbw %%xmm0,%%xmm5 \n"
2497 "movdqa %%xmm5,%%xmm0 \n"
2498 "punpcklwd %%xmm1,%%xmm5 \n"
2499 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002500 "movdqa %%xmm5,(%[dst_rgba]) \n"
2501 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2502 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002503 "sub $0x8,%[width] \n"
2504 "jg 1b \n"
2505 : [y_buf]"+r"(y_buf), // %[y_buf]
2506 [u_buf]"+r"(u_buf), // %[u_buf]
2507 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002508 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002509 [width]"+rm"(width) // %[width]
2510 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2511 : "memory", "cc"
2512#if defined(__SSE2__)
2513 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2514#endif
2515 );
2516}
2517
fbarchard@google.come214fe32012-06-04 23:47:11 +00002518#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002519
2520#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002521void YToARGBRow_SSE2(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002522 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002523 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002524 asm volatile (
fbarchard@google.com30859f72012-11-02 09:51:29 +00002525 "pxor %%xmm5,%%xmm5 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002526 "pcmpeqb %%xmm4,%%xmm4 \n"
2527 "pslld $0x18,%%xmm4 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002528 "mov $0x00100010,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002529 "movd %%eax,%%xmm3 \n"
2530 "pshufd $0x0,%%xmm3,%%xmm3 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002531 "mov $0x004a004a,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002532 "movd %%eax,%%xmm2 \n"
2533 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002534 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002535 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002536 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002537 "movq (%0),%%xmm0 \n"
2538 "lea 0x8(%0),%0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002539 "punpcklbw %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002540 "psubusw %%xmm3,%%xmm0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002541 "pmullw %%xmm2,%%xmm0 \n"
2542 "psrlw $6, %%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002543 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002544
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002545 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002546 "punpcklbw %%xmm0,%%xmm0 \n"
2547 "movdqa %%xmm0,%%xmm1 \n"
2548 "punpcklwd %%xmm0,%%xmm0 \n"
2549 "punpckhwd %%xmm1,%%xmm1 \n"
2550 "por %%xmm4,%%xmm0 \n"
2551 "por %%xmm4,%%xmm1 \n"
2552 "movdqa %%xmm0,(%1) \n"
2553 "movdqa %%xmm1,16(%1) \n"
2554 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002555
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002556 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002557 "jg 1b \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002558 : "+r"(y_buf), // %0
2559 "+r"(dst_argb), // %1
2560 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002561 :
2562 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002563#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002564 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002565#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002566 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002567}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002568#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002569
fbarchard@google.com42831e02012-01-21 02:54:17 +00002570#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002571// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002572CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002573 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2574};
2575
fbarchard@google.com42831e02012-01-21 02:54:17 +00002576void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002577 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002578 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002579 "movdqa %3,%%xmm5 \n"
2580 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002581 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002582 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002583 "movdqa (%0,%2),%%xmm0 \n"
2584 "pshufb %%xmm5,%%xmm0 \n"
2585 "sub $0x10,%2 \n"
2586 "movdqa %%xmm0,(%1) \n"
2587 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002588 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002589 : "+r"(src), // %0
2590 "+r"(dst), // %1
2591 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002592 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002593 : "memory", "cc"
2594#if defined(__SSE2__)
2595 , "xmm0", "xmm5"
2596#endif
2597 );
2598}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002599#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002600
fbarchard@google.com42831e02012-01-21 02:54:17 +00002601#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002602void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002603 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002604 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002605 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002606 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002607 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002608 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002609 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002610 "psllw $0x8,%%xmm0 \n"
2611 "psrlw $0x8,%%xmm1 \n"
2612 "por %%xmm1,%%xmm0 \n"
2613 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2614 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2615 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2616 "sub $0x10,%2 \n"
2617 "movdqu %%xmm0,(%1) \n"
2618 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002619 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002620 : "+r"(src), // %0
2621 "+r"(dst), // %1
2622 "+r"(temp_width) // %2
2623 :
2624 : "memory", "cc"
2625#if defined(__SSE2__)
2626 , "xmm0", "xmm1"
2627#endif
2628 );
2629}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002630#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002631
fbarchard@google.com16a96642012-03-02 22:38:09 +00002632#ifdef HAS_MIRRORROW_UV_SSSE3
2633// Shuffle table for reversing the bytes of UV channels.
2634CONST uvec8 kShuffleMirrorUV = {
2635 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2636};
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002637void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
fbarchard@google.com16a96642012-03-02 22:38:09 +00002638 int width) {
2639 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002640 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002641 "movdqa %4,%%xmm1 \n"
2642 "lea -16(%0,%3,2),%0 \n"
2643 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002644 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002645 "1: \n"
2646 "movdqa (%0),%%xmm0 \n"
2647 "lea -16(%0),%0 \n"
2648 "pshufb %%xmm1,%%xmm0 \n"
2649 "sub $8,%3 \n"
2650 "movlpd %%xmm0,(%1) \n"
2651 "movhpd %%xmm0,(%1,%2) \n"
2652 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002653 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002654 : "+r"(src), // %0
2655 "+r"(dst_u), // %1
2656 "+r"(dst_v), // %2
2657 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002658 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002659 : "memory", "cc"
2660#if defined(__SSE2__)
2661 , "xmm0", "xmm1"
2662#endif
2663 );
2664}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002665#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002666
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002667#ifdef HAS_ARGBMIRRORROW_SSSE3
2668// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002669CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002670 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2671};
2672
2673void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2674 intptr_t temp_width = static_cast<intptr_t>(width);
2675 asm volatile (
2676 "movdqa %3,%%xmm5 \n"
2677 "lea -0x10(%0),%0 \n"
2678 ".p2align 4 \n"
2679 "1: \n"
2680 "movdqa (%0,%2,4),%%xmm0 \n"
2681 "pshufb %%xmm5,%%xmm0 \n"
2682 "sub $0x4,%2 \n"
2683 "movdqa %%xmm0,(%1) \n"
2684 "lea 0x10(%1),%1 \n"
2685 "jg 1b \n"
2686 : "+r"(src), // %0
2687 "+r"(dst), // %1
2688 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002689 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002690 : "memory", "cc"
2691#if defined(__SSE2__)
2692 , "xmm0", "xmm5"
2693#endif
2694 );
2695}
2696#endif // HAS_ARGBMIRRORROW_SSSE3
2697
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002698#ifdef HAS_SPLITUVROW_SSE2
2699void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002700 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002701 "pcmpeqb %%xmm5,%%xmm5 \n"
2702 "psrlw $0x8,%%xmm5 \n"
2703 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002704 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002705 "1: \n"
2706 "movdqa (%0),%%xmm0 \n"
2707 "movdqa 0x10(%0),%%xmm1 \n"
2708 "lea 0x20(%0),%0 \n"
2709 "movdqa %%xmm0,%%xmm2 \n"
2710 "movdqa %%xmm1,%%xmm3 \n"
2711 "pand %%xmm5,%%xmm0 \n"
2712 "pand %%xmm5,%%xmm1 \n"
2713 "packuswb %%xmm1,%%xmm0 \n"
2714 "psrlw $0x8,%%xmm2 \n"
2715 "psrlw $0x8,%%xmm3 \n"
2716 "packuswb %%xmm3,%%xmm2 \n"
2717 "movdqa %%xmm0,(%1) \n"
2718 "movdqa %%xmm2,(%1,%2) \n"
2719 "lea 0x10(%1),%1 \n"
2720 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002721 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002722 : "+r"(src_uv), // %0
2723 "+r"(dst_u), // %1
2724 "+r"(dst_v), // %2
2725 "+r"(pix) // %3
2726 :
2727 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002728#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002729 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002730#endif
2731 );
2732}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002733
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002734void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2735 int pix) {
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002736 asm volatile (
2737 "pcmpeqb %%xmm5,%%xmm5 \n"
2738 "psrlw $0x8,%%xmm5 \n"
2739 "sub %1,%2 \n"
2740 ".p2align 4 \n"
2741 "1: \n"
2742 "movdqu (%0),%%xmm0 \n"
2743 "movdqu 0x10(%0),%%xmm1 \n"
2744 "lea 0x20(%0),%0 \n"
2745 "movdqa %%xmm0,%%xmm2 \n"
2746 "movdqa %%xmm1,%%xmm3 \n"
2747 "pand %%xmm5,%%xmm0 \n"
2748 "pand %%xmm5,%%xmm1 \n"
2749 "packuswb %%xmm1,%%xmm0 \n"
2750 "psrlw $0x8,%%xmm2 \n"
2751 "psrlw $0x8,%%xmm3 \n"
2752 "packuswb %%xmm3,%%xmm2 \n"
2753 "movdqu %%xmm0,(%1) \n"
2754 "movdqu %%xmm2,(%1,%2) \n"
2755 "lea 0x10(%1),%1 \n"
2756 "sub $0x10,%3 \n"
2757 "jg 1b \n"
2758 : "+r"(src_uv), // %0
2759 "+r"(dst_u), // %1
2760 "+r"(dst_v), // %2
2761 "+r"(pix) // %3
2762 :
2763 : "memory", "cc"
2764#if defined(__SSE2__)
2765 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2766#endif
2767 );
2768}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002769#endif // HAS_SPLITUVROW_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002770
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002771#ifdef HAS_MERGEUVROW_SSE2
2772void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2773 int width) {
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002774 asm volatile (
2775 "sub %0,%1 \n"
2776 ".p2align 4 \n"
2777 "1: \n"
2778 "movdqa (%0),%%xmm0 \n"
2779 "movdqa (%0,%1,1),%%xmm1 \n"
2780 "lea 0x10(%0),%0 \n"
2781 "movdqa %%xmm0,%%xmm2 \n"
2782 "punpcklbw %%xmm1,%%xmm0 \n"
2783 "punpckhbw %%xmm1,%%xmm2 \n"
2784 "movdqa %%xmm0,(%2) \n"
2785 "movdqa %%xmm2,0x10(%2) \n"
2786 "lea 0x20(%2),%2 \n"
2787 "sub $0x10,%3 \n"
2788 "jg 1b \n"
2789 : "+r"(src_u), // %0
2790 "+r"(src_v), // %1
2791 "+r"(dst_uv), // %2
2792 "+r"(width) // %3
2793 :
2794 : "memory", "cc"
2795#if defined(__SSE2__)
2796 , "xmm0", "xmm1", "xmm2"
2797#endif
2798 );
2799}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002800
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002801void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2802 uint8* dst_uv, int width) {
fbarchard@google.come0d86482012-10-27 19:07:55 +00002803 asm volatile (
2804 "sub %0,%1 \n"
2805 ".p2align 4 \n"
2806 "1: \n"
2807 "movdqu (%0),%%xmm0 \n"
2808 "movdqu (%0,%1,1),%%xmm1 \n"
2809 "lea 0x10(%0),%0 \n"
2810 "movdqa %%xmm0,%%xmm2 \n"
2811 "punpcklbw %%xmm1,%%xmm0 \n"
2812 "punpckhbw %%xmm1,%%xmm2 \n"
2813 "movdqu %%xmm0,(%2) \n"
2814 "movdqu %%xmm2,0x10(%2) \n"
2815 "lea 0x20(%2),%2 \n"
2816 "sub $0x10,%3 \n"
2817 "jg 1b \n"
2818 : "+r"(src_u), // %0
2819 "+r"(src_v), // %1
2820 "+r"(dst_uv), // %2
2821 "+r"(width) // %3
2822 :
2823 : "memory", "cc"
2824#if defined(__SSE2__)
2825 , "xmm0", "xmm1", "xmm2"
2826#endif
2827 );
2828}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002829#endif // HAS_MERGEUVROW_SSE2
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002830
fbarchard@google.com19932f82012-02-16 22:19:14 +00002831#ifdef HAS_COPYROW_SSE2
2832void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002833 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002834 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002835 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002836 "1: \n"
2837 "movdqa (%0),%%xmm0 \n"
2838 "movdqa 0x10(%0),%%xmm1 \n"
2839 "movdqa %%xmm0,(%0,%1) \n"
2840 "movdqa %%xmm1,0x10(%0,%1) \n"
2841 "lea 0x20(%0),%0 \n"
2842 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002843 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002844 : "+r"(src), // %0
2845 "+r"(dst), // %1
2846 "+r"(count) // %2
2847 :
2848 : "memory", "cc"
2849#if defined(__SSE2__)
2850 , "xmm0", "xmm1"
2851#endif
2852 );
2853}
2854#endif // HAS_COPYROW_SSE2
2855
2856#ifdef HAS_COPYROW_X86
2857void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2858 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002859 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002860 "shr $0x2,%2 \n"
2861 "rep movsl \n"
2862 : "+S"(src), // %0
2863 "+D"(dst), // %1
2864 "+c"(width_tmp) // %2
2865 :
2866 : "memory", "cc"
2867 );
2868}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002869#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002870
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002871#ifdef HAS_SETROW_X86
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002872void SetRow_X86(uint8* dst, uint32 v32, int width) {
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002873 size_t width_tmp = static_cast<size_t>(width);
2874 asm volatile (
2875 "shr $0x2,%1 \n"
2876 "rep stosl \n"
2877 : "+D"(dst), // %0
2878 "+c"(width_tmp) // %1
2879 : "a"(v32) // %2
2880 : "memory", "cc");
2881}
2882
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002883void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002884 int dst_stride, int height) {
2885 for (int y = 0; y < height; ++y) {
2886 size_t width_tmp = static_cast<size_t>(width);
2887 uint32* d = reinterpret_cast<uint32*>(dst);
2888 asm volatile (
2889 "rep stosl \n"
2890 : "+D"(d), // %0
2891 "+c"(width_tmp) // %1
2892 : "a"(v32) // %2
2893 : "memory", "cc");
2894 dst += dst_stride;
2895 }
2896}
2897#endif // HAS_SETROW_X86
2898
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002899#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002900void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002901 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002902 "pcmpeqb %%xmm5,%%xmm5 \n"
2903 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002904 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002905 "1: \n"
2906 "movdqa (%0),%%xmm0 \n"
2907 "movdqa 0x10(%0),%%xmm1 \n"
2908 "lea 0x20(%0),%0 \n"
2909 "pand %%xmm5,%%xmm0 \n"
2910 "pand %%xmm5,%%xmm1 \n"
2911 "packuswb %%xmm1,%%xmm0 \n"
2912 "movdqa %%xmm0,(%1) \n"
2913 "lea 0x10(%1),%1 \n"
2914 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002915 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002916 : "+r"(src_yuy2), // %0
2917 "+r"(dst_y), // %1
2918 "+r"(pix) // %2
2919 :
2920 : "memory", "cc"
2921#if defined(__SSE2__)
2922 , "xmm0", "xmm1", "xmm5"
2923#endif
2924 );
2925}
2926
2927void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002928 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002929 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002930 "pcmpeqb %%xmm5,%%xmm5 \n"
2931 "psrlw $0x8,%%xmm5 \n"
2932 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002933 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002934 "1: \n"
2935 "movdqa (%0),%%xmm0 \n"
2936 "movdqa 0x10(%0),%%xmm1 \n"
2937 "movdqa (%0,%4,1),%%xmm2 \n"
2938 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2939 "lea 0x20(%0),%0 \n"
2940 "pavgb %%xmm2,%%xmm0 \n"
2941 "pavgb %%xmm3,%%xmm1 \n"
2942 "psrlw $0x8,%%xmm0 \n"
2943 "psrlw $0x8,%%xmm1 \n"
2944 "packuswb %%xmm1,%%xmm0 \n"
2945 "movdqa %%xmm0,%%xmm1 \n"
2946 "pand %%xmm5,%%xmm0 \n"
2947 "packuswb %%xmm0,%%xmm0 \n"
2948 "psrlw $0x8,%%xmm1 \n"
2949 "packuswb %%xmm1,%%xmm1 \n"
2950 "movq %%xmm0,(%1) \n"
2951 "movq %%xmm1,(%1,%2) \n"
2952 "lea 0x8(%1),%1 \n"
2953 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002954 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002955 : "+r"(src_yuy2), // %0
2956 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002957 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002958 "+r"(pix) // %3
2959 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2960 : "memory", "cc"
2961#if defined(__SSE2__)
2962 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2963#endif
2964 );
2965}
2966
fbarchard@google.comc704f782012-08-30 19:53:48 +00002967void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2968 uint8* dst_u, uint8* dst_v, int pix) {
2969 asm volatile (
2970 "pcmpeqb %%xmm5,%%xmm5 \n"
2971 "psrlw $0x8,%%xmm5 \n"
2972 "sub %1,%2 \n"
2973 ".p2align 4 \n"
2974 "1: \n"
2975 "movdqa (%0),%%xmm0 \n"
2976 "movdqa 0x10(%0),%%xmm1 \n"
2977 "lea 0x20(%0),%0 \n"
2978 "psrlw $0x8,%%xmm0 \n"
2979 "psrlw $0x8,%%xmm1 \n"
2980 "packuswb %%xmm1,%%xmm0 \n"
2981 "movdqa %%xmm0,%%xmm1 \n"
2982 "pand %%xmm5,%%xmm0 \n"
2983 "packuswb %%xmm0,%%xmm0 \n"
2984 "psrlw $0x8,%%xmm1 \n"
2985 "packuswb %%xmm1,%%xmm1 \n"
2986 "movq %%xmm0,(%1) \n"
2987 "movq %%xmm1,(%1,%2) \n"
2988 "lea 0x8(%1),%1 \n"
2989 "sub $0x10,%3 \n"
2990 "jg 1b \n"
2991 : "+r"(src_yuy2), // %0
2992 "+r"(dst_u), // %1
2993 "+r"(dst_v), // %2
2994 "+r"(pix) // %3
2995 :
2996 : "memory", "cc"
2997#if defined(__SSE2__)
2998 , "xmm0", "xmm1", "xmm5"
2999#endif
3000 );
3001}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00003002
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003003void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
3004 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003005 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003006 "pcmpeqb %%xmm5,%%xmm5 \n"
3007 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003008 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003009 "1: \n"
3010 "movdqu (%0),%%xmm0 \n"
3011 "movdqu 0x10(%0),%%xmm1 \n"
3012 "lea 0x20(%0),%0 \n"
3013 "pand %%xmm5,%%xmm0 \n"
3014 "pand %%xmm5,%%xmm1 \n"
3015 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003016 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003017 "movdqu %%xmm0,(%1) \n"
3018 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003019 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003020 : "+r"(src_yuy2), // %0
3021 "+r"(dst_y), // %1
3022 "+r"(pix) // %2
3023 :
3024 : "memory", "cc"
3025#if defined(__SSE2__)
3026 , "xmm0", "xmm1", "xmm5"
3027#endif
3028 );
3029}
3030
3031void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
3032 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00003033 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003034 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003035 "pcmpeqb %%xmm5,%%xmm5 \n"
3036 "psrlw $0x8,%%xmm5 \n"
3037 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003038 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003039 "1: \n"
3040 "movdqu (%0),%%xmm0 \n"
3041 "movdqu 0x10(%0),%%xmm1 \n"
3042 "movdqu (%0,%4,1),%%xmm2 \n"
3043 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3044 "lea 0x20(%0),%0 \n"
3045 "pavgb %%xmm2,%%xmm0 \n"
3046 "pavgb %%xmm3,%%xmm1 \n"
3047 "psrlw $0x8,%%xmm0 \n"
3048 "psrlw $0x8,%%xmm1 \n"
3049 "packuswb %%xmm1,%%xmm0 \n"
3050 "movdqa %%xmm0,%%xmm1 \n"
3051 "pand %%xmm5,%%xmm0 \n"
3052 "packuswb %%xmm0,%%xmm0 \n"
3053 "psrlw $0x8,%%xmm1 \n"
3054 "packuswb %%xmm1,%%xmm1 \n"
3055 "movq %%xmm0,(%1) \n"
3056 "movq %%xmm1,(%1,%2) \n"
3057 "lea 0x8(%1),%1 \n"
3058 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003059 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003060 : "+r"(src_yuy2), // %0
3061 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003062 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003063 "+r"(pix) // %3
3064 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
3065 : "memory", "cc"
3066#if defined(__SSE2__)
3067 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3068#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00003069 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003070}
3071
fbarchard@google.comc704f782012-08-30 19:53:48 +00003072void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
3073 uint8* dst_u, uint8* dst_v, int pix) {
3074 asm volatile (
3075 "pcmpeqb %%xmm5,%%xmm5 \n"
3076 "psrlw $0x8,%%xmm5 \n"
3077 "sub %1,%2 \n"
3078 ".p2align 4 \n"
3079 "1: \n"
3080 "movdqu (%0),%%xmm0 \n"
3081 "movdqu 0x10(%0),%%xmm1 \n"
3082 "lea 0x20(%0),%0 \n"
3083 "psrlw $0x8,%%xmm0 \n"
3084 "psrlw $0x8,%%xmm1 \n"
3085 "packuswb %%xmm1,%%xmm0 \n"
3086 "movdqa %%xmm0,%%xmm1 \n"
3087 "pand %%xmm5,%%xmm0 \n"
3088 "packuswb %%xmm0,%%xmm0 \n"
3089 "psrlw $0x8,%%xmm1 \n"
3090 "packuswb %%xmm1,%%xmm1 \n"
3091 "movq %%xmm0,(%1) \n"
3092 "movq %%xmm1,(%1,%2) \n"
3093 "lea 0x8(%1),%1 \n"
3094 "sub $0x10,%3 \n"
3095 "jg 1b \n"
3096 : "+r"(src_yuy2), // %0
3097 "+r"(dst_u), // %1
3098 "+r"(dst_v), // %2
3099 "+r"(pix) // %3
3100 :
3101 : "memory", "cc"
3102#if defined(__SSE2__)
3103 , "xmm0", "xmm1", "xmm5"
3104#endif
3105 );
3106}
3107
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003108void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003109 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003110 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003111 "1: \n"
3112 "movdqa (%0),%%xmm0 \n"
3113 "movdqa 0x10(%0),%%xmm1 \n"
3114 "lea 0x20(%0),%0 \n"
3115 "psrlw $0x8,%%xmm0 \n"
3116 "psrlw $0x8,%%xmm1 \n"
3117 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003118 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003119 "movdqa %%xmm0,(%1) \n"
3120 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003121 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003122 : "+r"(src_uyvy), // %0
3123 "+r"(dst_y), // %1
3124 "+r"(pix) // %2
3125 :
3126 : "memory", "cc"
3127#if defined(__SSE2__)
3128 , "xmm0", "xmm1"
3129#endif
3130 );
3131}
3132
3133void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003134 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003135 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003136 "pcmpeqb %%xmm5,%%xmm5 \n"
3137 "psrlw $0x8,%%xmm5 \n"
3138 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003139 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003140 "1: \n"
3141 "movdqa (%0),%%xmm0 \n"
3142 "movdqa 0x10(%0),%%xmm1 \n"
3143 "movdqa (%0,%4,1),%%xmm2 \n"
3144 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
3145 "lea 0x20(%0),%0 \n"
3146 "pavgb %%xmm2,%%xmm0 \n"
3147 "pavgb %%xmm3,%%xmm1 \n"
3148 "pand %%xmm5,%%xmm0 \n"
3149 "pand %%xmm5,%%xmm1 \n"
3150 "packuswb %%xmm1,%%xmm0 \n"
3151 "movdqa %%xmm0,%%xmm1 \n"
3152 "pand %%xmm5,%%xmm0 \n"
3153 "packuswb %%xmm0,%%xmm0 \n"
3154 "psrlw $0x8,%%xmm1 \n"
3155 "packuswb %%xmm1,%%xmm1 \n"
3156 "movq %%xmm0,(%1) \n"
3157 "movq %%xmm1,(%1,%2) \n"
3158 "lea 0x8(%1),%1 \n"
3159 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003160 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003161 : "+r"(src_uyvy), // %0
3162 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003163 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003164 "+r"(pix) // %3
3165 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3166 : "memory", "cc"
3167#if defined(__SSE2__)
3168 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3169#endif
3170 );
3171}
3172
fbarchard@google.comc704f782012-08-30 19:53:48 +00003173void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3174 uint8* dst_u, uint8* dst_v, int pix) {
3175 asm volatile (
3176 "pcmpeqb %%xmm5,%%xmm5 \n"
3177 "psrlw $0x8,%%xmm5 \n"
3178 "sub %1,%2 \n"
3179 ".p2align 4 \n"
3180 "1: \n"
3181 "movdqa (%0),%%xmm0 \n"
3182 "movdqa 0x10(%0),%%xmm1 \n"
3183 "lea 0x20(%0),%0 \n"
3184 "pand %%xmm5,%%xmm0 \n"
3185 "pand %%xmm5,%%xmm1 \n"
3186 "packuswb %%xmm1,%%xmm0 \n"
3187 "movdqa %%xmm0,%%xmm1 \n"
3188 "pand %%xmm5,%%xmm0 \n"
3189 "packuswb %%xmm0,%%xmm0 \n"
3190 "psrlw $0x8,%%xmm1 \n"
3191 "packuswb %%xmm1,%%xmm1 \n"
3192 "movq %%xmm0,(%1) \n"
3193 "movq %%xmm1,(%1,%2) \n"
3194 "lea 0x8(%1),%1 \n"
3195 "sub $0x10,%3 \n"
3196 "jg 1b \n"
3197 : "+r"(src_uyvy), // %0
3198 "+r"(dst_u), // %1
3199 "+r"(dst_v), // %2
3200 "+r"(pix) // %3
3201 :
3202 : "memory", "cc"
3203#if defined(__SSE2__)
3204 , "xmm0", "xmm1", "xmm5"
3205#endif
3206 );
3207}
3208
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003209void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3210 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003211 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003212 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003213 "1: \n"
3214 "movdqu (%0),%%xmm0 \n"
3215 "movdqu 0x10(%0),%%xmm1 \n"
3216 "lea 0x20(%0),%0 \n"
3217 "psrlw $0x8,%%xmm0 \n"
3218 "psrlw $0x8,%%xmm1 \n"
3219 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003220 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003221 "movdqu %%xmm0,(%1) \n"
3222 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003223 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003224 : "+r"(src_uyvy), // %0
3225 "+r"(dst_y), // %1
3226 "+r"(pix) // %2
3227 :
3228 : "memory", "cc"
3229#if defined(__SSE2__)
3230 , "xmm0", "xmm1"
3231#endif
3232 );
3233}
3234
// Produces one row of U and one row of V from two vertically adjacent rows
// of packed UYVY, averaging the two rows byte-wise (pavgb) before splitting.
//   src_uyvy:    first source row; the second row is read at
//                src_uyvy + stride_uyvy. Loads are unaligned (movdqu).
//   stride_uyvy: byte offset from the first row to the second.
//   dst_u/dst_v: destinations; 8 bytes are written to each per pass (movq).
//   pix:         pixel count, reduced by 16 per pass. The loop body always
//                runs at least once, so callers presumably pass a positive
//                multiple of 16 - confirm.
3235void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003236 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003237 asm volatile (
// xmm5 = 0x00ff in every 16-bit lane (mask for the even-numbered bytes).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003238 "pcmpeqb %%xmm5,%%xmm5 \n"
3239 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer needs advancing.
3240 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003241 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003242 "1: \n"
3243 "movdqu (%0),%%xmm0 \n"
3244 "movdqu 0x10(%0),%%xmm1 \n"
3245 "movdqu (%0,%4,1),%%xmm2 \n"
3246 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3247 "lea 0x20(%0),%0 \n"
// Rounded byte average of the two rows.
3248 "pavgb %%xmm2,%%xmm0 \n"
3249 "pavgb %%xmm3,%%xmm1 \n"
// Keep the even-numbered bytes (the chroma samples) and pack them into 16
// interleaved bytes.
3250 "pand %%xmm5,%%xmm0 \n"
3251 "pand %%xmm5,%%xmm1 \n"
3252 "packuswb %%xmm1,%%xmm0 \n"
// Split the interleaved chroma: even bytes go to xmm0 (stored to dst_u),
// odd bytes go to xmm1 (stored to dst_v).
3253 "movdqa %%xmm0,%%xmm1 \n"
3254 "pand %%xmm5,%%xmm0 \n"
3255 "packuswb %%xmm0,%%xmm0 \n"
3256 "psrlw $0x8,%%xmm1 \n"
3257 "packuswb %%xmm1,%%xmm1 \n"
3258 "movq %%xmm0,(%1) \n"
3259 "movq %%xmm1,(%1,%2) \n"
3260 "lea 0x8(%1),%1 \n"
3261 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003262 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003263 : "+r"(src_uyvy), // %0
3264 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003265 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003266 "+r"(pix) // %3
3267 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3268 : "memory", "cc"
3269#if defined(__SSE2__)
3270 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3271#endif
3272 );
3273}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003274
// Splits the chroma of a single packed UYVY row into separate U and V rows,
// with no vertical averaging.
//   src_uyvy:    source row, read with unaligned 16-byte loads (movdqu).
//   dst_u/dst_v: destinations; 8 bytes are written to each per pass (movq).
//   pix:         pixel count, reduced by 16 per pass. The loop body always
//                runs at least once, so callers presumably pass a positive
//                multiple of 16 - confirm.
3275void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3276 uint8* dst_u, uint8* dst_v, int pix) {
3277 asm volatile (
// xmm5 = 0x00ff in every 16-bit lane (mask for the even-numbered bytes).
3278 "pcmpeqb %%xmm5,%%xmm5 \n"
3279 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer needs advancing.
3280 "sub %1,%2 \n"
3281 ".p2align 4 \n"
3282 "1: \n"
3283 "movdqu (%0),%%xmm0 \n"
3284 "movdqu 0x10(%0),%%xmm1 \n"
3285 "lea 0x20(%0),%0 \n"
// Keep the even-numbered bytes (the chroma samples) and pack them together.
3286 "pand %%xmm5,%%xmm0 \n"
3287 "pand %%xmm5,%%xmm1 \n"
3288 "packuswb %%xmm1,%%xmm0 \n"
// Even bytes of the packed chroma go to dst_u, odd bytes go to dst_v.
3289 "movdqa %%xmm0,%%xmm1 \n"
3290 "pand %%xmm5,%%xmm0 \n"
3291 "packuswb %%xmm0,%%xmm0 \n"
3292 "psrlw $0x8,%%xmm1 \n"
3293 "packuswb %%xmm1,%%xmm1 \n"
3294 "movq %%xmm0,(%1) \n"
3295 "movq %%xmm1,(%1,%2) \n"
3296 "lea 0x8(%1),%1 \n"
3297 "sub $0x10,%3 \n"
3298 "jg 1b \n"
3299 : "+r"(src_uyvy), // %0
3300 "+r"(dst_u), // %1
3301 "+r"(dst_v), // %2
3302 "+r"(pix) // %3
3303 :
3304 : "memory", "cc"
3305#if defined(__SSE2__)
3306 , "xmm0", "xmm1", "xmm5"
3307#endif
3308 );
3309}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003310#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003311
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003312#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003313// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
// Per channel the result is src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8),
// added with unsigned byte saturation, where alpha0 is the alpha byte of the
// src_argb0 pixel. The output alpha byte is always 255 because the alpha of
// the src_argb0 copy is forced to 0xff before the saturating add.
// Structure:
//   10: single pixels until dst_argb is 16-byte aligned
//   41: 4 pixels per pass, unaligned loads, aligned store
//   91: single pixels for the remainder
// width 0 exits immediately and width 1 goes straight to the 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003314void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3315 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003316 asm volatile (
// Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
// xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha mask).
3317 "pcmpeqb %%xmm7,%%xmm7 \n"
3318 "psrlw $0xf,%%xmm7 \n"
3319 "pcmpeqb %%xmm6,%%xmm6 \n"
3320 "psrlw $0x8,%%xmm6 \n"
3321 "pcmpeqb %%xmm5,%%xmm5 \n"
3322 "psllw $0x8,%%xmm5 \n"
3323 "pcmpeqb %%xmm4,%%xmm4 \n"
3324 "pslld $0x18,%%xmm4 \n"
// From here on %3 holds (pixels remaining - 1).
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003325 "sub $0x1,%3 \n"
3326 "je 91f \n"
3327 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003328
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003329 // 1 pixel loop until destination pointer is aligned.
3330 "10: \n"
3331 "test $0xf,%2 \n"
3332 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003333 "movd (%0),%%xmm3 \n"
3334 "lea 0x4(%0),%0 \n"
3335 "movdqa %%xmm3,%%xmm0 \n"
// Invert the alpha byte, then broadcast it to every word of the pixel and
// add 1, giving the multiplier (256 - alpha).
3336 "pxor %%xmm4,%%xmm3 \n"
3337 "movd (%1),%%xmm2 \n"
3338 "psrlw $0x8,%%xmm3 \n"
3339 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3340 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
// xmm2 carries the even bytes of src_argb1, xmm1 the odd bytes; each set is
// scaled separately and merged back into xmm0 with saturating adds.
3341 "pand %%xmm6,%%xmm2 \n"
3342 "paddw %%xmm7,%%xmm3 \n"
3343 "pmullw %%xmm3,%%xmm2 \n"
3344 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003345 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003346 "psrlw $0x8,%%xmm1 \n"
3347 "por %%xmm4,%%xmm0 \n"
3348 "pmullw %%xmm3,%%xmm1 \n"
3349 "psrlw $0x8,%%xmm2 \n"
3350 "paddusb %%xmm2,%%xmm0 \n"
3351 "pand %%xmm5,%%xmm1 \n"
3352 "paddusb %%xmm1,%%xmm0 \n"
3353 "sub $0x1,%3 \n"
3354 "movd %%xmm0,(%2) \n"
3355 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003356 "jge 10b \n"
3357
3358 "19: \n"
// Rebias the counter to (pixels remaining - 4) for the 4 pixel loop.
fbarchard@google.comee220882012-06-14 00:07:56 +00003359 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003360 "jl 49f \n"
3361
fbarchard@google.com794fe122012-06-15 01:05:01 +00003362 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003363 ".p2align 2 \n"
3364 "41: \n"
3365 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003366 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003367 "movdqa %%xmm3,%%xmm0 \n"
3368 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003369 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003370 "psrlw $0x8,%%xmm3 \n"
3371 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3372 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003373 "pand %%xmm6,%%xmm2 \n"
3374 "paddw %%xmm7,%%xmm3 \n"
3375 "pmullw %%xmm3,%%xmm2 \n"
3376 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003377 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003378 "psrlw $0x8,%%xmm1 \n"
3379 "por %%xmm4,%%xmm0 \n"
3380 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003381 "psrlw $0x8,%%xmm2 \n"
3382 "paddusb %%xmm2,%%xmm0 \n"
3383 "pand %%xmm5,%%xmm1 \n"
3384 "paddusb %%xmm1,%%xmm0 \n"
3385 "sub $0x4,%3 \n"
// Aligned store: the 1 pixel loop above left dst_argb 16-byte aligned.
3386 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003387 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003388 "jge 41b \n"
3389
3390 "49: \n"
// Rebias the counter back to (pixels remaining - 1).
3391 "add $0x3,%3 \n"
3392 "jl 99f \n"
3393
fbarchard@google.com794fe122012-06-15 01:05:01 +00003394 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003395 "91: \n"
3396 "movd (%0),%%xmm3 \n"
3397 "lea 0x4(%0),%0 \n"
3398 "movdqa %%xmm3,%%xmm0 \n"
3399 "pxor %%xmm4,%%xmm3 \n"
3400 "movd (%1),%%xmm2 \n"
3401 "psrlw $0x8,%%xmm3 \n"
3402 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3403 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3404 "pand %%xmm6,%%xmm2 \n"
3405 "paddw %%xmm7,%%xmm3 \n"
3406 "pmullw %%xmm3,%%xmm2 \n"
3407 "movd (%1),%%xmm1 \n"
3408 "lea 0x4(%1),%1 \n"
3409 "psrlw $0x8,%%xmm1 \n"
3410 "por %%xmm4,%%xmm0 \n"
3411 "pmullw %%xmm3,%%xmm1 \n"
3412 "psrlw $0x8,%%xmm2 \n"
3413 "paddusb %%xmm2,%%xmm0 \n"
3414 "pand %%xmm5,%%xmm1 \n"
3415 "paddusb %%xmm1,%%xmm0 \n"
3416 "sub $0x1,%3 \n"
3417 "movd %%xmm0,(%2) \n"
3418 "lea 0x4(%2),%2 \n"
3419 "jge 91b \n"
3420 "99: \n"
3421 : "+r"(src_argb0), // %0
3422 "+r"(src_argb1), // %1
3423 "+r"(dst_argb), // %2
3424 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003425 :
3426 : "memory", "cc"
3427#if defined(__SSE2__)
3428 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3429#endif
3430 );
3431}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003432#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003433
fbarchard@google.com96af8702012-04-06 18:22:27 +00003434#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003435// Shuffle table for isolating alpha.
// For each 4-byte pixel it copies the byte at offset 3 into the low byte of
// two consecutive words; the 0x80 entries make pshufb write zero.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003436CONST uvec8 kShuffleAlpha = {
3437 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3438 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3439};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003440
3441// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
3442// Uses kShuffleAlpha to spread each pixel's (inverted) alpha across words.
3443
3444// Same as SSE2, but replaces
3445// psrlw xmm3, 8 // alpha
3446// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3447// pshuflw xmm3, xmm3,0F5h
3448// with..
3449// pshufb xmm3, kShuffleAlpha // alpha
3450
// Per channel the result is src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8),
// added with unsigned byte saturation; the output alpha byte is always 255.
// Structure:
//   10: single pixels until dst_argb is 16-byte aligned
//   40: 4 pixels per pass when both sources are 16-byte aligned (movdqa)
//   41: 4 pixels per pass with unaligned source loads (movdqu)
//   91: single pixels for the remainder
3451void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3452 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003453 asm volatile (
// Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
// xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha mask).
3454 "pcmpeqb %%xmm7,%%xmm7 \n"
3455 "psrlw $0xf,%%xmm7 \n"
3456 "pcmpeqb %%xmm6,%%xmm6 \n"
3457 "psrlw $0x8,%%xmm6 \n"
3458 "pcmpeqb %%xmm5,%%xmm5 \n"
3459 "psllw $0x8,%%xmm5 \n"
3460 "pcmpeqb %%xmm4,%%xmm4 \n"
3461 "pslld $0x18,%%xmm4 \n"
// From here on %3 holds (pixels remaining - 1).
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003462 "sub $0x1,%3 \n"
3463 "je 91f \n"
3464 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003465
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003466 // 1 pixel loop until destination pointer is aligned.
3467 "10: \n"
3468 "test $0xf,%2 \n"
3469 "je 19f \n"
3470 "movd (%0),%%xmm3 \n"
3471 "lea 0x4(%0),%0 \n"
3472 "movdqa %%xmm3,%%xmm0 \n"
// Invert the alpha byte, broadcast it per word, then add 1 below to form
// the multiplier (256 - alpha).
3473 "pxor %%xmm4,%%xmm3 \n"
3474 "movd (%1),%%xmm2 \n"
3475 "pshufb %4,%%xmm3 \n"
3476 "pand %%xmm6,%%xmm2 \n"
3477 "paddw %%xmm7,%%xmm3 \n"
3478 "pmullw %%xmm3,%%xmm2 \n"
3479 "movd (%1),%%xmm1 \n"
3480 "lea 0x4(%1),%1 \n"
3481 "psrlw $0x8,%%xmm1 \n"
3482 "por %%xmm4,%%xmm0 \n"
3483 "pmullw %%xmm3,%%xmm1 \n"
3484 "psrlw $0x8,%%xmm2 \n"
3485 "paddusb %%xmm2,%%xmm0 \n"
3486 "pand %%xmm5,%%xmm1 \n"
3487 "paddusb %%xmm1,%%xmm0 \n"
3488 "sub $0x1,%3 \n"
3489 "movd %%xmm0,(%2) \n"
3490 "lea 0x4(%2),%2 \n"
3491 "jge 10b \n"
3492
3493 "19: \n"
// Rebias the counter to (pixels remaining - 4), then pick the aligned or
// unaligned 4 pixel loop based on the two source pointers.
fbarchard@google.comee220882012-06-14 00:07:56 +00003494 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003495 "jl 49f \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003496 "test $0xf,%0 \n"
3497 "jne 41f \n"
3498 "test $0xf,%1 \n"
3499 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003500
fbarchard@google.com794fe122012-06-15 01:05:01 +00003501 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003502 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003503 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003504 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003505 "lea 0x10(%0),%0 \n"
3506 "movdqa %%xmm3,%%xmm0 \n"
3507 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003508 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003509 "pshufb %4,%%xmm3 \n"
3510 "pand %%xmm6,%%xmm2 \n"
3511 "paddw %%xmm7,%%xmm3 \n"
3512 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003513 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003514 "lea 0x10(%1),%1 \n"
3515 "psrlw $0x8,%%xmm1 \n"
3516 "por %%xmm4,%%xmm0 \n"
3517 "pmullw %%xmm3,%%xmm1 \n"
3518 "psrlw $0x8,%%xmm2 \n"
3519 "paddusb %%xmm2,%%xmm0 \n"
3520 "pand %%xmm5,%%xmm1 \n"
3521 "paddusb %%xmm1,%%xmm0 \n"
3522 "sub $0x4,%3 \n"
3523 "movdqa %%xmm0,(%2) \n"
3524 "lea 0x10(%2),%2 \n"
3525 "jge 40b \n"
3526 "jmp 49f \n"
3527
3528 // 4 pixel unaligned loop.
3529 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003530 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003531 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003532 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003533 "movdqa %%xmm3,%%xmm0 \n"
3534 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003535 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003536 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003537 "pand %%xmm6,%%xmm2 \n"
3538 "paddw %%xmm7,%%xmm3 \n"
3539 "pmullw %%xmm3,%%xmm2 \n"
3540 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003541 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003542 "psrlw $0x8,%%xmm1 \n"
3543 "por %%xmm4,%%xmm0 \n"
3544 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003545 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003546 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003547 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003548 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003549 "sub $0x4,%3 \n"
3550 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003551 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003552 "jge 41b \n"
3553
3554 "49: \n"
// Rebias the counter back to (pixels remaining - 1).
3555 "add $0x3,%3 \n"
3556 "jl 99f \n"
3557
fbarchard@google.com794fe122012-06-15 01:05:01 +00003558 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003559 "91: \n"
3560 "movd (%0),%%xmm3 \n"
3561 "lea 0x4(%0),%0 \n"
3562 "movdqa %%xmm3,%%xmm0 \n"
3563 "pxor %%xmm4,%%xmm3 \n"
3564 "movd (%1),%%xmm2 \n"
3565 "pshufb %4,%%xmm3 \n"
3566 "pand %%xmm6,%%xmm2 \n"
3567 "paddw %%xmm7,%%xmm3 \n"
3568 "pmullw %%xmm3,%%xmm2 \n"
3569 "movd (%1),%%xmm1 \n"
3570 "lea 0x4(%1),%1 \n"
3571 "psrlw $0x8,%%xmm1 \n"
3572 "por %%xmm4,%%xmm0 \n"
3573 "pmullw %%xmm3,%%xmm1 \n"
3574 "psrlw $0x8,%%xmm2 \n"
3575 "paddusb %%xmm2,%%xmm0 \n"
3576 "pand %%xmm5,%%xmm1 \n"
3577 "paddusb %%xmm1,%%xmm0 \n"
3578 "sub $0x1,%3 \n"
3579 "movd %%xmm0,(%2) \n"
3580 "lea 0x4(%2),%2 \n"
3581 "jge 91b \n"
3582 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003583 : "+r"(src_argb0), // %0
3584 "+r"(src_argb1), // %1
3585 "+r"(dst_argb), // %2
3586 "+r"(width) // %3
3587 : "m"(kShuffleAlpha) // %4
3588 : "memory", "cc"
3589#if defined(__SSE2__)
3590 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3591#endif
3592 );
3593}
3594#endif // HAS_ARGBBLENDROW_SSSE3
3595
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003596#ifdef HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003597// Attenuate 4 pixels at a time.
3598// aligned to 16 bytes
// Scales the three colour bytes of every pixel by that pixel's alpha byte
// and copies the alpha byte through unchanged.
//   src_argb/dst_argb: accessed with movdqa, so both must be 16-byte aligned.
//   width: pixel count, reduced by 4 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 4 -
//          confirm.
3599void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3600 asm volatile (
// dst_argb becomes an offset from src_argb so one pointer drives both.
3601 "sub %0,%1 \n"
// xmm4 = 0xff000000 per dword (alpha mask), xmm5 = 0x00ffffff (colour mask).
3602 "pcmpeqb %%xmm4,%%xmm4 \n"
3603 "pslld $0x18,%%xmm4 \n"
3604 "pcmpeqb %%xmm5,%%xmm5 \n"
3605 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003606
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003607 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003608 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003609 "1: \n"
// Low two pixels: duplicate every byte into a word (value * 0x101),
// broadcast each pixel's alpha word, and keep the high half of the product.
3610 "movdqa (%0),%%xmm0 \n"
3611 "punpcklbw %%xmm0,%%xmm0 \n"
3612 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3613 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3614 "pmulhuw %%xmm2,%%xmm0 \n"
// High two pixels, same steps.
3615 "movdqa (%0),%%xmm1 \n"
3616 "punpckhbw %%xmm1,%%xmm1 \n"
3617 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3618 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3619 "pmulhuw %%xmm2,%%xmm1 \n"
// Reload the source to recover the original alpha bytes, then merge them
// with the scaled colour bytes.
fbarchard@google.com810cd912012-04-20 20:15:27 +00003620 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003621 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003622 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003623 "psrlw $0x8,%%xmm1 \n"
3624 "packuswb %%xmm1,%%xmm0 \n"
3625 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003626 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003627 "sub $0x4,%2 \n"
3628 "movdqa %%xmm0,(%0,%1,1) \n"
3629 "lea 0x10(%0),%0 \n"
3630 "jg 1b \n"
3631 : "+r"(src_argb), // %0
3632 "+r"(dst_argb), // %1
3633 "+r"(width) // %2
3634 :
3635 : "memory", "cc"
3636#if defined(__SSE2__)
3637 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3638#endif
3639 );
3640}
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003641#endif // HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003642
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003643#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003644// Shuffle table duplicating alpha
// kShuffleAlpha0 handles the first two pixels of a 16-byte group: bytes 3
// and 7 are copied into both bytes of three words each, and the 128 entries
// zero the fourth word of each pixel.
3645CONST uvec8 kShuffleAlpha0 = {
3646 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3647};
// kShuffleAlpha1 does the same for the last two pixels (bytes 11 and 15).
3648CONST uvec8 kShuffleAlpha1 = {
3649 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3650 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3651};
3652// Attenuate 4 pixels at a time.
3653// aligned to 16 bytes
// Scales the three colour bytes of every pixel by that pixel's alpha byte
// and copies the alpha byte through unchanged.
//   src_argb/dst_argb: accessed with movdqa, so both must be 16-byte aligned.
//   width: pixel count, reduced by 4 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 4 -
//          confirm.
3654void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3655 asm volatile (
// dst_argb becomes an offset from src_argb so one pointer drives both.
3656 "sub %0,%1 \n"
// xmm3 = 0xff000000 per dword (alpha mask); xmm4/xmm5 hold the two tables.
3657 "pcmpeqb %%xmm3,%%xmm3 \n"
3658 "pslld $0x18,%%xmm3 \n"
3659 "movdqa %3,%%xmm4 \n"
3660 "movdqa %4,%%xmm5 \n"
3661
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003662 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003663 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003664 "1: \n"
// Low two pixels: alpha words (xmm0) times byte-duplicated colour words
// (xmm1), keeping the high half of each product. The shuffle leaves the
// alpha lane zero, so that product is zero.
3665 "movdqa (%0),%%xmm0 \n"
3666 "pshufb %%xmm4,%%xmm0 \n"
3667 "movdqa (%0),%%xmm1 \n"
3668 "punpcklbw %%xmm1,%%xmm1 \n"
3669 "pmulhuw %%xmm1,%%xmm0 \n"
// High two pixels, same steps.
3670 "movdqa (%0),%%xmm1 \n"
3671 "pshufb %%xmm5,%%xmm1 \n"
3672 "movdqa (%0),%%xmm2 \n"
3673 "punpckhbw %%xmm2,%%xmm2 \n"
3674 "pmulhuw %%xmm2,%%xmm1 \n"
// Original alpha bytes are OR-ed into the zeroed alpha lanes after packing.
3675 "movdqa (%0),%%xmm2 \n"
3676 "pand %%xmm3,%%xmm2 \n"
3677 "psrlw $0x8,%%xmm0 \n"
3678 "psrlw $0x8,%%xmm1 \n"
3679 "packuswb %%xmm1,%%xmm0 \n"
3680 "por %%xmm2,%%xmm0 \n"
3681 "sub $0x4,%2 \n"
3682 "movdqa %%xmm0,(%0,%1,1) \n"
3683 "lea 0x10(%0),%0 \n"
3684 "jg 1b \n"
3685 : "+r"(src_argb), // %0
3686 "+r"(dst_argb), // %1
3687 "+r"(width) // %2
3688 : "m"(kShuffleAlpha0), // %3
3689 "m"(kShuffleAlpha1) // %4
3690 : "memory", "cc"
3691#if defined(__SSE2__)
3692 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3693#endif
3694 );
3695}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003696#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003697
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003698#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003699// Unattenuate 4 pixels at a time.
3700// aligned to 16 bytes
// Multiplies each pixel's channels by a per-alpha factor looked up in
// fixed_invtbl8, which is indexed by the pixel's alpha byte and read as
// 4-byte entries. The table is defined outside this block; presumably it
// holds fixed-point reciprocals of alpha - confirm against its definition.
//   src_argb/dst_argb: accessed with movdqa, so both must be 16-byte aligned.
//   width: pixel count, reduced by 4 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 4 -
//          confirm.
3701void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3702 int width) {
// Scratch register for the alpha byte used as the table index.
3703 uintptr_t alpha = 0;
3704 asm volatile (
// dst_argb becomes an offset from src_argb so one pointer drives both.
3705 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003706
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003707 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003708 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003709 "1: \n"
// Pixels 0 and 1: alpha bytes sit at offsets 3 and 7.
3710 "movdqa (%0),%%xmm0 \n"
3711 "movzb 0x3(%0),%3 \n"
3712 "punpcklbw %%xmm0,%%xmm0 \n"
3713 "movd 0x0(%4,%3,4),%%xmm2 \n"
3714 "movzb 0x7(%0),%3 \n"
3715 "movd 0x0(%4,%3,4),%%xmm3 \n"
// Shuffle 0x40 puts the entry's low 16 bits in words 0-2 and its high 16
// bits in word 3, so the fourth channel gets a different multiplier.
fbarchard@google.com787f8272013-02-21 00:08:56 +00003716 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3717 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003718 "movlhps %%xmm3,%%xmm2 \n"
3719 "pmulhuw %%xmm2,%%xmm0 \n"
// Pixels 2 and 3: alpha bytes sit at offsets 0xb and 0xf.
3720 "movdqa (%0),%%xmm1 \n"
3721 "movzb 0xb(%0),%3 \n"
3722 "punpckhbw %%xmm1,%%xmm1 \n"
3723 "movd 0x0(%4,%3,4),%%xmm2 \n"
3724 "movzb 0xf(%0),%3 \n"
3725 "movd 0x0(%4,%3,4),%%xmm3 \n"
fbarchard@google.com787f8272013-02-21 00:08:56 +00003726 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3727 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003728 "movlhps %%xmm3,%%xmm2 \n"
3729 "pmulhuw %%xmm2,%%xmm1 \n"
// packuswb saturates, so products above 255 clamp to 255.
fbarchard@google.com810cd912012-04-20 20:15:27 +00003730 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003731 "sub $0x4,%2 \n"
3732 "movdqa %%xmm0,(%0,%1,1) \n"
3733 "lea 0x10(%0),%0 \n"
3734 "jg 1b \n"
3735 : "+r"(src_argb), // %0
3736 "+r"(dst_argb), // %1
3737 "+r"(width), // %2
3738 "+r"(alpha) // %3
3739 : "r"(fixed_invtbl8) // %4
3740 : "memory", "cc"
3741#if defined(__SSE2__)
3742 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3743#endif
3744 );
3745}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003746#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003747
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003748#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003749// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// Weights are 14, 76 and 38 (sum 128, matching the >> 7 in the loop); the
// fourth byte of each pixel has weight 0.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003750CONST vec8 kARGBToGray = {
3751 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3752};
3753
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003754// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// The gray value is written to the first three bytes of each output pixel
// and the source alpha byte is copied to the fourth.
//   src_argb/dst_argb: accessed with movdqa, so both must be 16-byte aligned.
//   width: pixel count, reduced by 8 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 8 -
//          confirm.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003755void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003756 asm volatile (
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003757 "movdqa %3,%%xmm4 \n"
// dst_argb becomes an offset from src_argb so one pointer drives both.
3758 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003759
3760 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003761 ".p2align 4 \n"
3762 "1: \n"
// Weighted sum per pixel: pmaddubsw yields two partial sums per pixel and
// phaddw adds them; >> 7 and packuswb give 8 gray bytes.
3763 "movdqa (%0),%%xmm0 \n"
3764 "movdqa 0x10(%0),%%xmm1 \n"
3765 "pmaddubsw %%xmm4,%%xmm0 \n"
3766 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003767 "phaddw %%xmm1,%%xmm0 \n"
3768 "psrlw $0x7,%%xmm0 \n"
3769 "packuswb %%xmm0,%%xmm0 \n"
// Extract the 8 source alpha bytes into xmm2.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003770 "movdqa (%0),%%xmm2 \n"
3771 "movdqa 0x10(%0),%%xmm3 \n"
3772 "psrld $0x18,%%xmm2 \n"
3773 "psrld $0x18,%%xmm3 \n"
3774 "packuswb %%xmm3,%%xmm2 \n"
3775 "packuswb %%xmm2,%%xmm2 \n"
// Interleave into (gray, gray) and (gray, alpha) byte pairs, then into
// 4-byte pixels.
3776 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003777 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003778 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003779 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003780 "punpcklwd %%xmm3,%%xmm0 \n"
3781 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003782 "sub $0x8,%2 \n"
3783 "movdqa %%xmm0,(%0,%1,1) \n"
3784 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003785 "lea 0x20(%0),%0 \n"
3786 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003787 : "+r"(src_argb), // %0
3788 "+r"(dst_argb), // %1
3789 "+r"(width) // %2
3790 : "m"(kARGBToGray) // %3
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003791 : "memory", "cc"
3792#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00003793 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003794#endif
3795 );
3796}
3797#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003798
3799#ifdef HAS_ARGBSEPIAROW_SSSE3
3800// b = (r * 35 + g * 68 + b * 17) >> 7
3801// g = (r * 45 + g * 88 + b * 22) >> 7
3802// r = (r * 50 + g * 98 + b * 24) >> 7
3803// Constant for ARGB color to sepia tone
// Each table lists its weights in the pixel's memory byte order, with a zero
// weight in the fourth position.
3804CONST vec8 kARGBToSepiaB = {
3805 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3806};
3807
3808CONST vec8 kARGBToSepiaG = {
3809 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3810};
3811
3812CONST vec8 kARGBToSepiaR = {
3813 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3814};
3815
fbarchard@google.come442dc42012-06-18 17:37:09 +00003816// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place: dst_argb is both read and written. Each output channel is
// a weighted sum of the input channels shifted right by 7 and saturated to
// 255 by packuswb; the alpha byte is copied through unchanged.
//   dst_argb: accessed with movdqa, so it must be 16-byte aligned.
//   width: pixel count, reduced by 8 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 8 -
//          confirm.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003817void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3818 asm volatile (
// xmm2/xmm3/xmm4 = weights for the new B, G and R channels.
3819 "movdqa %2,%%xmm2 \n"
3820 "movdqa %3,%%xmm3 \n"
3821 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003822
3823 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003824 ".p2align 4 \n"
3825 "1: \n"
// New B for 8 pixels -> low 8 bytes of xmm0.
3826 "movdqa (%0),%%xmm0 \n"
3827 "movdqa 0x10(%0),%%xmm6 \n"
3828 "pmaddubsw %%xmm2,%%xmm0 \n"
3829 "pmaddubsw %%xmm2,%%xmm6 \n"
3830 "phaddw %%xmm6,%%xmm0 \n"
3831 "psrlw $0x7,%%xmm0 \n"
3832 "packuswb %%xmm0,%%xmm0 \n"
// New G for 8 pixels -> xmm5, then interleaved with B into (B, G) pairs.
3833 "movdqa (%0),%%xmm5 \n"
3834 "movdqa 0x10(%0),%%xmm1 \n"
3835 "pmaddubsw %%xmm3,%%xmm5 \n"
3836 "pmaddubsw %%xmm3,%%xmm1 \n"
3837 "phaddw %%xmm1,%%xmm5 \n"
3838 "psrlw $0x7,%%xmm5 \n"
3839 "packuswb %%xmm5,%%xmm5 \n"
3840 "punpcklbw %%xmm5,%%xmm0 \n"
// New R for 8 pixels -> xmm5.
3841 "movdqa (%0),%%xmm5 \n"
3842 "movdqa 0x10(%0),%%xmm1 \n"
3843 "pmaddubsw %%xmm4,%%xmm5 \n"
3844 "pmaddubsw %%xmm4,%%xmm1 \n"
3845 "phaddw %%xmm1,%%xmm5 \n"
3846 "psrlw $0x7,%%xmm5 \n"
3847 "packuswb %%xmm5,%%xmm5 \n"
// Original alpha bytes -> xmm6, interleaved with R into (R, A) pairs.
3848 "movdqa (%0),%%xmm6 \n"
3849 "movdqa 0x10(%0),%%xmm1 \n"
3850 "psrld $0x18,%%xmm6 \n"
3851 "psrld $0x18,%%xmm1 \n"
3852 "packuswb %%xmm1,%%xmm6 \n"
3853 "packuswb %%xmm6,%%xmm6 \n"
3854 "punpcklbw %%xmm6,%%xmm5 \n"
// Combine the (B, G) and (R, A) pairs into full pixels and store in place.
3855 "movdqa %%xmm0,%%xmm1 \n"
3856 "punpcklwd %%xmm5,%%xmm0 \n"
3857 "punpckhwd %%xmm5,%%xmm1 \n"
3858 "sub $0x8,%1 \n"
3859 "movdqa %%xmm0,(%0) \n"
3860 "movdqa %%xmm1,0x10(%0) \n"
3861 "lea 0x20(%0),%0 \n"
3862 "jg 1b \n"
3863 : "+r"(dst_argb), // %0
3864 "+r"(width) // %1
3865 : "m"(kARGBToSepiaB), // %2
3866 "m"(kARGBToSepiaG), // %3
3867 "m"(kARGBToSepiaR) // %4
3868 : "memory", "cc"
3869#if defined(__SSE2__)
3870 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3871#endif
3872 );
3873}
3874#endif // HAS_ARGBSEPIAROW_SSSE3
3875
fbarchard@google.come442dc42012-06-18 17:37:09 +00003876#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3877// Transform 8 ARGB pixels (32 bytes) with color matrix.
3878// Same as Sepia except matrix is provided.
// Works in place on dst_argb. Only the first 12 bytes of matrix_argb are
// read: three groups of four signed 8-bit weights, one group per output
// colour channel (bytes 0-3, 4-7 and 8-11). Sums use saturating signed adds
// (phaddsw), an arithmetic shift right by 7, and packuswb, which clamps to
// 0..255. The alpha byte is copied from the source pixel.
//   dst_argb: accessed with movdqa, so it must be 16-byte aligned.
//   width: pixel count, reduced by 8 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 8 -
//          confirm.
3879void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3880 int width) {
3881 asm volatile (
// Load each 4-byte weight group and broadcast it to all four dwords.
3882 "movd (%2),%%xmm2 \n"
3883 "movd 0x4(%2),%%xmm3 \n"
3884 "movd 0x8(%2),%%xmm4 \n"
3885 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3886 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3887 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003888
3889 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003890 ".p2align 4 \n"
3891 "1: \n"
// First and second output channels (weights in xmm2 and xmm3).
3892 "movdqa (%0),%%xmm0 \n"
3893 "movdqa 0x10(%0),%%xmm6 \n"
3894 "pmaddubsw %%xmm2,%%xmm0 \n"
3895 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003896 "movdqa (%0),%%xmm5 \n"
3897 "movdqa 0x10(%0),%%xmm1 \n"
3898 "pmaddubsw %%xmm3,%%xmm5 \n"
3899 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003900 "phaddsw %%xmm6,%%xmm0 \n"
3901 "phaddsw %%xmm1,%%xmm5 \n"
3902 "psraw $0x7,%%xmm0 \n"
3903 "psraw $0x7,%%xmm5 \n"
3904 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003905 "packuswb %%xmm5,%%xmm5 \n"
3906 "punpcklbw %%xmm5,%%xmm0 \n"
// Third output channel (weights in xmm4).
3907 "movdqa (%0),%%xmm5 \n"
3908 "movdqa 0x10(%0),%%xmm1 \n"
3909 "pmaddubsw %%xmm4,%%xmm5 \n"
3910 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003911 "phaddsw %%xmm1,%%xmm5 \n"
3912 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003913 "packuswb %%xmm5,%%xmm5 \n"
// Original alpha bytes, paired with the third channel.
3914 "movdqa (%0),%%xmm6 \n"
3915 "movdqa 0x10(%0),%%xmm1 \n"
3916 "psrld $0x18,%%xmm6 \n"
3917 "psrld $0x18,%%xmm1 \n"
3918 "packuswb %%xmm1,%%xmm6 \n"
3919 "packuswb %%xmm6,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003920 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003921 "punpcklbw %%xmm6,%%xmm5 \n"
// Combine the byte pairs into full pixels and store in place.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003922 "punpcklwd %%xmm5,%%xmm0 \n"
3923 "punpckhwd %%xmm5,%%xmm1 \n"
3924 "sub $0x8,%1 \n"
3925 "movdqa %%xmm0,(%0) \n"
3926 "movdqa %%xmm1,0x10(%0) \n"
3927 "lea 0x20(%0),%0 \n"
3928 "jg 1b \n"
3929 : "+r"(dst_argb), // %0
3930 "+r"(width) // %1
3931 : "r"(matrix_argb) // %2
3932 : "memory", "cc"
3933#if defined(__SSE2__)
3934 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3935#endif
3936 );
3937}
3938#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3939
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003940#ifdef HAS_ARGBQUANTIZEROW_SSE2
3941// Quantize 4 ARGB pixels (16 bytes).
3942// aligned to 16 bytes
// Works in place on dst_argb. For each colour byte v the loop computes
//   ((v * scale) >> 16) * interval_size + interval_offset
// using the low 16 bits of each parameter, then packs with unsigned
// saturation. The original alpha byte is OR-ed back into the alpha lane;
// that lane is zero beforehand only when the upper 16 bits of all three
// parameters are zero, which callers presumably guarantee - confirm.
//   dst_argb: accessed with movdqa, so it must be 16-byte aligned.
//   width: pixel count, reduced by 4 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 4 -
//          confirm.
3943void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3944 int interval_offset, int width) {
3945 asm volatile (
3946 "movd %2,%%xmm2 \n"
3947 "movd %3,%%xmm3 \n"
3948 "movd %4,%%xmm4 \n"
// pshuflw 0x40 + pshufd 0x44 lay each parameter out as words
// (lo, lo, lo, hi) repeated twice: lo for the three colour lanes, hi for
// the alpha lane.
3949 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3950 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3951 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3952 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3953 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3954 "pshufd $0x44,%%xmm4,%%xmm4 \n"
// xmm5 = 0 (for zero-extending bytes), xmm6 = 0xff000000 per dword.
3955 "pxor %%xmm5,%%xmm5 \n"
3956 "pcmpeqb %%xmm6,%%xmm6 \n"
3957 "pslld $0x18,%%xmm6 \n"
3958
3959 // 4 pixel loop.
3960 ".p2align 2 \n"
3961 "1: \n"
3962 "movdqa (%0),%%xmm0 \n"
3963 "punpcklbw %%xmm5,%%xmm0 \n"
3964 "pmulhuw %%xmm2,%%xmm0 \n"
3965 "movdqa (%0),%%xmm1 \n"
3966 "punpckhbw %%xmm5,%%xmm1 \n"
3967 "pmulhuw %%xmm2,%%xmm1 \n"
3968 "pmullw %%xmm3,%%xmm0 \n"
// xmm7 keeps the original alpha bytes for the final OR.
3969 "movdqa (%0),%%xmm7 \n"
3970 "pmullw %%xmm3,%%xmm1 \n"
3971 "pand %%xmm6,%%xmm7 \n"
3972 "paddw %%xmm4,%%xmm0 \n"
3973 "paddw %%xmm4,%%xmm1 \n"
3974 "packuswb %%xmm1,%%xmm0 \n"
3975 "por %%xmm7,%%xmm0 \n"
3976 "sub $0x4,%1 \n"
3977 "movdqa %%xmm0,(%0) \n"
3978 "lea 0x10(%0),%0 \n"
3979 "jg 1b \n"
3980 : "+r"(dst_argb), // %0
3981 "+r"(width) // %1
3982 : "r"(scale), // %2
3983 "r"(interval_size), // %3
3984 "r"(interval_offset) // %4
3985 : "memory", "cc"
3986#if defined(__SSE2__)
3987 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3988#endif
3989 );
3990}
3991#endif // HAS_ARGBQUANTIZEROW_SSE2
3992
fbarchard@google.comb94b1392012-12-03 20:36:40 +00003993#ifdef HAS_ARGBSHADEROW_SSE2
3994// Shade 4 pixels at a time by specified value.
3995// Aligned to 16 bytes.
// Each of the four bytes of a pixel is multiplied by the corresponding byte
// of value (all four bytes, including the fourth, are scaled). Both operands
// are widened by byte duplication (x * 0x101), multiplied keeping the high
// 16 bits, and shifted right by 8.
//   src_argb/dst_argb: accessed with movdqa, so both must be 16-byte aligned.
//   width: pixel count, reduced by 4 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 4 -
//          confirm.
3996void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3997 uint32 value) {
3998 asm volatile (
3999 "movd %3,%%xmm2 \n"
// dst_argb becomes an offset from src_argb so one pointer drives both.
4000 "sub %0,%1 \n"
// Duplicate each byte of value into a word and repeat for two pixels.
4001 "punpcklbw %%xmm2,%%xmm2 \n"
4002 "punpcklqdq %%xmm2,%%xmm2 \n"
4003
4004 // 4 pixel loop.
4005 ".p2align 2 \n"
4006 "1: \n"
4007 "movdqa (%0),%%xmm0 \n"
4008 "movdqa %%xmm0,%%xmm1 \n"
4009 "punpcklbw %%xmm0,%%xmm0 \n"
4010 "punpckhbw %%xmm1,%%xmm1 \n"
4011 "pmulhuw %%xmm2,%%xmm0 \n"
4012 "pmulhuw %%xmm2,%%xmm1 \n"
4013 "psrlw $0x8,%%xmm0 \n"
4014 "psrlw $0x8,%%xmm1 \n"
4015 "packuswb %%xmm1,%%xmm0 \n"
4016 "sub $0x4,%2 \n"
4017 "movdqa %%xmm0,(%0,%1,1) \n"
4018 "lea 0x10(%0),%0 \n"
4019 "jg 1b \n"
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004020 : "+r"(src_argb), // %0
4021 "+r"(dst_argb), // %1
4022 "+r"(width) // %2
4023 : "r"(value) // %3
fbarchard@google.comb94b1392012-12-03 20:36:40 +00004024 : "memory", "cc"
4025#if defined(__SSE2__)
4026 , "xmm0", "xmm1", "xmm2"
4027#endif
4028 );
4029}
4030#endif // HAS_ARGBSHADEROW_SSE2
4031
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004032#ifdef HAS_ARGBMULTIPLYROW_SSE2
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004033// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004034// Aligned to 16 bytes.
// Per byte the result is ((a * 0x101) * b) >> 16, where a comes from
// src_argb0 (widened by byte duplication) and b from src_argb1
// (zero-extended). All four bytes of each pixel are multiplied.
//   src_argb0/src_argb1/dst_argb: accessed with movdqa, so all three must be
//          16-byte aligned.
//   width: pixel count, reduced by 4 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 4 -
//          confirm.
fbarchard@google.com8fa76342013-01-18 02:25:04 +00004035void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4036 uint8* dst_argb, int width) {
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004037 asm volatile (
// xmm5 = 0, used to zero-extend the src_argb1 bytes.
4038 "pxor %%xmm5,%%xmm5 \n"
// src_argb1 and dst_argb become offsets from src_argb0.
4039 "sub %0,%1 \n"
fbarchard@google.com8fa76342013-01-18 02:25:04 +00004040 "sub %0,%2 \n"
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004041
4042 // 4 pixel loop.
4043 ".p2align 4 \n"
4044 "1: \n"
4045 "movdqa (%0),%%xmm0 \n"
4046 "movdqa (%0,%1),%%xmm2 \n"
4047 "movdqa %%xmm0,%%xmm1 \n"
4048 "movdqa %%xmm2,%%xmm3 \n"
4049 "punpcklbw %%xmm0,%%xmm0 \n"
4050 "punpckhbw %%xmm1,%%xmm1 \n"
4051 "punpcklbw %%xmm5,%%xmm2 \n"
4052 "punpckhbw %%xmm5,%%xmm3 \n"
4053 "pmulhuw %%xmm2,%%xmm0 \n"
4054 "pmulhuw %%xmm3,%%xmm1 \n"
4055 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com8fa76342013-01-18 02:25:04 +00004056 "sub $0x4,%3 \n"
4057 "movdqa %%xmm0,(%0,%2,1) \n"
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004058 "lea 0x10(%0),%0 \n"
4059 "jg 1b \n"
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004060 : "+r"(src_argb0), // %0
4061 "+r"(src_argb1), // %1
fbarchard@google.com8fa76342013-01-18 02:25:04 +00004062 "+r"(dst_argb), // %2
4063 "+r"(width) // %3
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004064 :
4065 : "memory", "cc"
4066#if defined(__SSE2__)
4067 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4068#endif
4069 );
4070}
4071#endif // HAS_ARGBMULTIPLYROW_SSE2
4072
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004073#ifdef HAS_ARGBADDROW_SSE2
4074// Add 2 rows of ARGB pixels together, 4 pixels at a time.
4075// Aligned to 16 bytes.
// Each output byte is src_argb0 + src_argb1 with unsigned saturation at 255
// (paddusb); all four bytes of each pixel are added.
//   src_argb0/src_argb1/dst_argb: accessed with movdqa, so all three must be
//          16-byte aligned.
//   width: pixel count, reduced by 4 per pass. The loop body always runs at
//          least once, so callers presumably pass a positive multiple of 4 -
//          confirm.
4076void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4077 uint8* dst_argb, int width) {
4078 asm volatile (
// src_argb1 and dst_argb become offsets from src_argb0.
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004079 "sub %0,%1 \n"
4080 "sub %0,%2 \n"
4081
4082 // 4 pixel loop.
4083 ".p2align 4 \n"
4084 "1: \n"
4085 "movdqa (%0),%%xmm0 \n"
4086 "movdqa (%0,%1),%%xmm1 \n"
4087 "paddusb %%xmm1,%%xmm0 \n"
4088 "sub $0x4,%3 \n"
4089 "movdqa %%xmm0,(%0,%2,1) \n"
4090 "lea 0x10(%0),%0 \n"
4091 "jg 1b \n"
4092 : "+r"(src_argb0), // %0
4093 "+r"(src_argb1), // %1
4094 "+r"(dst_argb), // %2
4095 "+r"(width) // %3
4096 :
4097 : "memory", "cc"
4098#if defined(__SSE2__)
fbarchard@google.com573a8832013-01-24 23:08:12 +00004099 , "xmm0", "xmm1"
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004100#endif
4101 );
4102}
4103#endif // HAS_ARGBADDROW_SSE2
4104
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Aligned to 16 bytes.
//
// Computes src_argb0 - src_argb1 per byte with unsigned saturation (psubusb),
// so results below 0 clamp to 0. src_argb0, src_argb1 and dst_argb are all
// accessed with movdqa and must be 16 byte aligned. Pixels are processed in
// whole groups of 4.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // Turn src_argb1 and dst_argb into offsets from src_argb0 so that a single
    // pointer increment advances all three rows.
    "sub %0,%1 \n"
    "sub %0,%2 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1),%%xmm1 \n"
    "psubusb %%xmm1,%%xmm0 \n"
    // The flags set by this sub are consumed by the jg below.
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_SOBELXROW_SSSE3
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
//
// For each output byte i this computes
//   |(y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])|
// in 16 bit arithmetic and packs the result to a byte with unsigned
// saturation. 8 outputs are produced per iteration.
//
// All loads and stores use movq, so no alignment is required. Each iteration
// reads 8 bytes at offsets 0 and 2 of every source row, i.e. 2 bytes beyond
// the 8 pixels being produced. pabsw is what makes this an SSSE3 routine.
void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    // Turn the other three pointers into offsets from src_y0 so that a single
    // pointer increment advances every row.
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "sub %0,%3 \n"
    "pxor %%xmm5,%%xmm5 \n"  // xmm5 = 0, used to zero extend bytes to words.

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // Row 0: y0[i] - y0[i+2].
    "movq (%0),%%xmm0 \n"
    "movq 0x2(%0),%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    // Row 1: y1[i] - y1[i+2].
    "movq (%0,%1,1),%%xmm1 \n"
    "movq 0x2(%0,%1,1),%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    // Row 2: y2[i] - y2[i+2].
    "movq (%0,%2,1),%%xmm2 \n"
    "movq 0x2(%0,%2,1),%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    // Sum with the middle row weighted twice, then absolute value.
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pabsw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    // The flags set by this sub are consumed by the jg below.
    "sub $0x8,%4 \n"
    "movq %%xmm0,(%0,%3,1) \n"
    "lea 0x8(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELXROW_SSSE3

#ifdef HAS_SOBELYROW_SSSE3
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
//
// For each output byte i this computes
//   |(y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2])|
// in 16 bit arithmetic and packs the result to a byte with unsigned
// saturation. 8 outputs are produced per iteration.
//
// All loads and stores use movq, so no alignment is required. Each iteration
// reads 8 bytes at offsets 0, 1 and 2 of both source rows, i.e. 2 bytes beyond
// the 8 pixels being produced. pabsw is what makes this an SSSE3 routine.
void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
                     uint8* dst_sobely, int width) {
  asm volatile (
    // Turn src_y1 and dst_sobely into offsets from src_y0 so that a single
    // pointer increment advances every row.
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "pxor %%xmm5,%%xmm5 \n"  // xmm5 = 0, used to zero extend bytes to words.

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // Column 0: y0[i] - y1[i].
    "movq (%0),%%xmm0 \n"
    "movq (%0,%1,1),%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    // Column 1: y0[i+1] - y1[i+1].
    "movq 0x1(%0),%%xmm1 \n"
    "movq 0x1(%0,%1,1),%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    // Column 2: y0[i+2] - y1[i+2].
    "movq 0x2(%0),%%xmm2 \n"
    "movq 0x2(%0,%1,1),%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    // Sum with the middle column weighted twice, then absolute value.
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pabsw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    // The flags set by this sub are consumed by the jg below.
    "sub $0x8,%3 \n"
    "movq %%xmm0,(%0,%2,1) \n"
    "lea 0x8(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELYROW_SSSE3

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
//
// Sobel is the unsigned saturating sum (paddusb) of the two inputs. Each
// iteration consumes 16 bytes from each source row and writes 16 ARGB pixels
// (64 bytes). src_sobelx, src_sobely and dst_argb are all accessed with movdqa
// and must be 16 byte aligned.
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    // Turn src_sobely into an offset from src_sobelx. dst_argb advances at a
    // different rate and keeps its own pointer.
    "sub %0,%1 \n"
    // xmm5 = 0xff000000 in every dword: the alpha channel mask.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"

    // 16 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1,1),%%xmm1 \n"
    "lea 0x10(%0),%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    // Replicate every byte 4 times (bytes -> words -> dwords) and set alpha.
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm2 \n"
    "punpckhbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm1 \n"
    "punpckhwd %%xmm2,%%xmm2 \n"
    "por %%xmm5,%%xmm1 \n"
    "por %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklwd %%xmm0,%%xmm3 \n"
    "punpckhwd %%xmm0,%%xmm0 \n"
    "por %%xmm5,%%xmm3 \n"
    "por %%xmm5,%%xmm0 \n"
    // The flags set by this sub are consumed by the jg below.
    "sub $0x10,%3 \n"
    "movdqa %%xmm1,(%2) \n"
    "movdqa %%xmm2,0x10(%2) \n"
    "movdqa %%xmm3,0x20(%2) \n"
    "movdqa %%xmm0,0x30(%2) \n"
    "lea 0x40(%2),%2 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
//
// Sobel is the unsigned saturating sum (paddusb) of the two inputs. Each
// iteration consumes 16 bytes from each source row and writes 16 ARGB pixels
// (64 bytes). src_sobelx, src_sobely and dst_argb are all accessed with movdqa
// and must be 16 byte aligned.
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    // Turn src_sobely into an offset from src_sobelx. dst_argb advances at a
    // different rate and keeps its own pointer.
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = 0xff in every byte: the alpha value.

    // 16 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1,1),%%xmm1 \n"
    "lea 0x10(%0),%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "paddusb %%xmm1,%%xmm2 \n"  // xmm2 = Sobel = Sobel X + Sobel Y.
    // Build (Sobel X, 255) byte pairs: the R and A channels.
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "punpckhbw %%xmm5,%%xmm0 \n"
    // Build (Sobel Y, Sobel) byte pairs: the B and G channels.
    "movdqa %%xmm1,%%xmm4 \n"
    "punpcklbw %%xmm2,%%xmm4 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    // Interleave the pairs into B, G, R, A byte order.
    "movdqa %%xmm4,%%xmm6 \n"
    "punpcklwd %%xmm3,%%xmm6 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "punpcklwd %%xmm0,%%xmm7 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    // The flags set by this sub are consumed by the jg below.
    "sub $0x10,%3 \n"
    "movdqa %%xmm6,(%2) \n"
    "movdqa %%xmm4,0x10(%2) \n"
    "movdqa %%xmm7,0x20(%2) \n"
    "movdqa %%xmm1,0x30(%2) \n"
    "lea 0x40(%2),%2 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_SOBELXYROW_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//
// row holds 4 bytes per pixel; cumsum and previous_cumsum hold 4 int32 per
// pixel. A running per-channel sum of row is kept in xmm0 and, for every
// pixel, running sum + previous_cumsum is written to cumsum.
//
// The 4 pixel loop is only taken when cumsum is 16 byte aligned; otherwise all
// pixels go through the unaligned 1 pixel loop.
// NOTE(review): the 4 pixel loop also reads previous_cumsum with movdqa, but
// only the alignment of cumsum is tested. Confirm callers always pass a
// previous_cumsum with the same alignment as cumsum.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    // Turn previous_cumsum into an offset from cumsum.
    "sub %1,%2 \n"
    "pxor %%xmm0,%%xmm0 \n"  // xmm0 = running sum, 4 x int32.
    "pxor %%xmm1,%%xmm1 \n"  // xmm1 = 0, used to zero extend.
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "test $0xf,%1 \n"
    "jne 49f \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "40: \n"
    "movdqu (%0),%%xmm2 \n"
    "lea 0x10(%0),%0 \n"
    // Zero extend the 16 bytes to 16 int32: one pixel in each of xmm2..xmm5.
    "movdqa %%xmm2,%%xmm4 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "punpckhwd %%xmm1,%%xmm3 \n"
    "punpckhbw %%xmm1,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"
    "punpckhwd %%xmm1,%%xmm5 \n"
    // Accumulate each pixel into the running sum and add the row above.
    "paddd %%xmm2,%%xmm0 \n"
    "movdqa (%1,%2,1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "movdqa 0x10(%1,%2,1),%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"
    "movdqa 0x20(%1,%2,1),%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"
    "movdqa 0x30(%1,%2,1),%%xmm5 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqa %%xmm2,(%1) \n"
    "movdqa %%xmm3,0x10(%1) \n"
    "movdqa %%xmm4,0x20(%1) \n"
    "movdqa %%xmm5,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

  "49: \n"
    // width was biased by -4 above; +3 leaves (remaining pixels - 1).
    "add $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop.
    ".p2align 2 \n"
  "10: \n"
    "movd (%0),%%xmm2 \n"
    "lea 0x4(%0),%0 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu (%1,%2,1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"

  "19: \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Converts cumulative sums into averaged pixels.
//
// For each output pixel (4 int32 channels) this computes
//   topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// multiplies it by an approximate reciprocal of area (rcpss), converts back to
// integer and packs to 4 bytes with saturation. count pixels are written to
// dst.
//
// width is an offset in int32 elements (the addressing mode scales it by 4).
// Every read of topleft and botleft, with and without the width offset, is an
// aligned SSE access and must be 16 byte aligned. dst is written with
// movdqu/movd and may be unaligned.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  asm volatile (
    // xmm4 = approximately 1.0f / area in all 4 lanes.
    "movd %5,%%xmm4 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "rcpss %%xmm4,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "40: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "psubd (%0,%4,4),%%xmm0 \n"
    "psubd 0x10(%0,%4,4),%%xmm1 \n"
    "psubd 0x20(%0,%4,4),%%xmm2 \n"
    "psubd 0x30(%0,%4,4),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"
    "psubd 0x10(%1),%%xmm1 \n"
    "psubd 0x20(%1),%%xmm2 \n"
    "psubd 0x30(%1),%%xmm3 \n"
    "paddd (%1,%4,4),%%xmm0 \n"
    "paddd 0x10(%1,%4,4),%%xmm1 \n"
    "paddd 0x20(%1,%4,4),%%xmm2 \n"
    "paddd 0x30(%1,%4,4),%%xmm3 \n"
    "lea 0x40(%1),%1 \n"
    // Scale the box sums by 1 / area in floating point.
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    // Narrow 16 int32 to 16 bytes with saturation.
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

  "49: \n"
    // count was biased by -4 above; +3 leaves (remaining pixels - 1).
    "add $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop.
    ".p2align 2 \n"
  "10: \n"
    "movdqa (%0),%%xmm0 \n"
    "psubd (%0,%4,4),%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"
    "paddd (%1,%4,4),%%xmm0 \n"
    "lea 0x10(%1),%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
  "19: \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)      // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd  %%xmm0,%1
//
// src_dudv points at 4 floats: the first pair is loaded as the starting
// coordinate and the second pair as the per-pixel step (presumably u, v, du,
// dv - confirm against the caller). For each destination pixel the current
// coordinate pair is truncated to integers (cvttps2dq), narrowed to signed 16
// bit (packssdw) and turned into a byte offset x * 4 + y * src_argb_stride by
// pmaddwd; 4 bytes are then copied from src_argb + offset to dst_argb.
//
// Because the offset is formed from signed 16 bit words, both the coordinates
// and src_argb_stride must fit in 16 bits. In 64 bit builds the offset taken
// from the low half of the register is additionally masked with 0x0fffffff.
// All memory accesses are movq/movd and need no alignment.

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  // Pointer-sized scratch operands: %1 starts as the stride and is then reused
  // as a pixel offset; temp holds a second pixel offset.
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    "movq (%3),%%xmm2 \n"     // xmm2 = starting coordinate pair.
    "movq 0x8(%3),%%xmm7 \n"  // xmm7 = per-pixel step pair.
    // xmm5 low dword = (stride << 16) | 4: the word pair (4, stride) that
    // pmaddwd multiplies with (x, y).
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"
    "movd %1,%%xmm5 \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    // Set up for 4 pixels per iteration: xmm2 = coordinates of pixels 0 and 1,
    // xmm3 = coordinates of pixels 2 and 3, xmm4 = 4 * step.
    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
  "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "cvttps2dq %%xmm3,%%xmm1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"  // xmm0 = 4 byte offsets, one per pixel.
    // Extract the offsets of pixels 0 and 1 into %1 and %5.
#if defined(__x86_64__)
    "movd %%xmm0,%1 \n"
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"
    "shr $32,%5 \n"
    "pshufd $0xEE,%%xmm0,%%xmm0 \n"
#else
    "movd %%xmm0,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
#endif
    "movd (%0,%1,1),%%xmm1 \n"
    "movd (%0,%5,1),%%xmm6 \n"
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1,(%2) \n"
    // Extract the offsets of pixels 2 and 3 into %1 and %5.
#if defined(__x86_64__)
    "movd %%xmm0,%1 \n"
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"
    "shr $32,%5 \n"
#else
    "movd %%xmm0,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%5 \n"
#endif
    "movd (%0,%1,1),%%xmm0 \n"
    "movd (%0,%5,1),%%xmm6 \n"
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    // The flags set by this sub are consumed by the jge below.
    "sub $0x4,%4 \n"
    "movq %%xmm0,0x08(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jge 40b \n"

  "49: \n"
    // width was biased by -4 above; +3 leaves (remaining pixels - 1).
    "add $0x3,%4 \n"
    "jl 19f \n"

    // 1 pixel loop. Only the low lanes of xmm2, xmm5 and xmm7 are used here,
    // so it works whether or not the 4 pixel setup above was executed.
    ".p2align 4 \n"
  "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%1 \n"
#if defined(__x86_64__)
    "and $0x0fffffff,%1 \n"
#endif
    "movd (%0,%1,1),%%xmm0 \n"
    // The flags set by this sub are consumed by the jge below.
    "sub $0x1,%4 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 10b \n"
  "19: \n"
  : "+r"(src_argb),              // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),              // %2
    "+r"(src_dudv),              // %3
    "+rm"(width),                // %4
    "+r"(temp)                   // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2

fbarchard@google.comb5491752012-11-20 09:44:46 +00004629// Bilinear image filtering.
4630// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
4631void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004632 ptrdiff_t src_stride, int dst_width,
4633 int source_y_fraction) {
4634 asm volatile (
4635 "sub %1,%0 \n"
4636 "shr %3 \n"
4637 "cmp $0x0,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004638 "je 100f \n"
4639 "cmp $0x20,%3 \n"
4640 "je 75f \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004641 "cmp $0x40,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004642 "je 50f \n"
4643 "cmp $0x60,%3 \n"
4644 "je 25f \n"
4645
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004646 "movd %3,%%xmm0 \n"
4647 "neg %3 \n"
4648 "add $0x80,%3 \n"
4649 "movd %3,%%xmm5 \n"
4650 "punpcklbw %%xmm0,%%xmm5 \n"
4651 "punpcklwd %%xmm5,%%xmm5 \n"
4652 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004653
4654 // General purpose row blend.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004655 ".p2align 4 \n"
4656 "1: \n"
4657 "movdqa (%1),%%xmm0 \n"
4658 "movdqa (%1,%4,1),%%xmm2 \n"
4659 "movdqa %%xmm0,%%xmm1 \n"
4660 "punpcklbw %%xmm2,%%xmm0 \n"
4661 "punpckhbw %%xmm2,%%xmm1 \n"
4662 "pmaddubsw %%xmm5,%%xmm0 \n"
4663 "pmaddubsw %%xmm5,%%xmm1 \n"
4664 "psrlw $0x7,%%xmm0 \n"
4665 "psrlw $0x7,%%xmm1 \n"
4666 "packuswb %%xmm1,%%xmm0 \n"
4667 "sub $0x4,%2 \n"
4668 "movdqa %%xmm0,(%1,%0,1) \n"
4669 "lea 0x10(%1),%1 \n"
4670 "jg 1b \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004671 "jmp 99f \n"
4672
4673 // Blend 25 / 75.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004674 ".p2align 4 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004675 "25: \n"
4676 "movdqa (%1),%%xmm0 \n"
4677 "movdqa (%1,%4,1),%%xmm1 \n"
4678 "pavgb %%xmm1,%%xmm0 \n"
4679 "pavgb %%xmm1,%%xmm0 \n"
4680 "sub $0x4,%2 \n"
4681 "movdqa %%xmm0,(%1,%0,1) \n"
4682 "lea 0x10(%1),%1 \n"
4683 "jg 25b \n"
4684 "jmp 99f \n"
4685
4686 // Blend 50 / 50.
4687 ".p2align 4 \n"
4688 "50: \n"
4689 "movdqa (%1),%%xmm0 \n"
4690 "movdqa (%1,%4,1),%%xmm1 \n"
4691 "pavgb %%xmm1,%%xmm0 \n"
4692 "sub $0x4,%2 \n"
4693 "movdqa %%xmm0,(%1,%0,1) \n"
4694 "lea 0x10(%1),%1 \n"
4695 "jg 50b \n"
4696 "jmp 99f \n"
4697
4698 // Blend 75 / 25.
4699 ".p2align 4 \n"
4700 "75: \n"
4701 "movdqa (%1),%%xmm1 \n"
4702 "movdqa (%1,%4,1),%%xmm0 \n"
4703 "pavgb %%xmm1,%%xmm0 \n"
4704 "pavgb %%xmm1,%%xmm0 \n"
4705 "sub $0x4,%2 \n"
4706 "movdqa %%xmm0,(%1,%0,1) \n"
4707 "lea 0x10(%1),%1 \n"
4708 "jg 75b \n"
4709 "jmp 99f \n"
4710
4711 // Blend 100 / 0 - Copy row unchanged.
4712 ".p2align 4 \n"
4713 "100: \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004714 "movdqa (%1),%%xmm0 \n"
4715 "sub $0x4,%2 \n"
4716 "movdqa %%xmm0,(%1,%0,1) \n"
4717 "lea 0x10(%1),%1 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004718 "jg 100b \n"
4719
fbarchard@google.comb5491752012-11-20 09:44:46 +00004720 "99: \n"
fbarchard@google.comaf137b62013-02-05 22:42:56 +00004721 : "+r"(dst_argb), // %0
4722 "+r"(src_argb), // %1
fbarchard@google.comb5491752012-11-20 09:44:46 +00004723 "+r"(dst_width), // %2
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004724 "+r"(source_y_fraction) // %3
4725 : "r"(static_cast<intptr_t>(src_stride)) // %4
4726 : "memory", "cc"
4727#if defined(__SSE2__)
4728 , "xmm0", "xmm1", "xmm2", "xmm5"
4729#endif
4730 );
4731}
4732
fbarchard@google.comaf137b62013-02-05 22:42:56 +00004733// Bilinear image filtering.
4734// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
4735void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
4736 ptrdiff_t src_stride, int dst_width,
4737 int source_y_fraction) {
4738 asm volatile (
4739 "sub %1,%0 \n"
4740 "shr %3 \n"
4741 "cmp $0x0,%3 \n"
4742 "je 100f \n"
4743 "cmp $0x20,%3 \n"
4744 "je 75f \n"
4745 "cmp $0x40,%3 \n"
4746 "je 50f \n"
4747 "cmp $0x60,%3 \n"
4748 "je 25f \n"
4749
4750 "movd %3,%%xmm0 \n"
4751 "neg %3 \n"
4752 "add $0x80,%3 \n"
4753 "movd %3,%%xmm5 \n"
4754 "punpcklbw %%xmm0,%%xmm5 \n"
4755 "punpcklwd %%xmm5,%%xmm5 \n"
4756 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4757 "pxor %%xmm4,%%xmm4 \n"
4758
4759 // General purpose row blend.
4760 ".p2align 4 \n"
4761 "1: \n"
4762 "movdqa (%1),%%xmm0 \n"
4763 "movdqa (%1,%4,1),%%xmm2 \n"
4764 "movdqa %%xmm0,%%xmm1 \n"
4765 "movdqa %%xmm2,%%xmm3 \n"
4766 "punpcklbw %%xmm4,%%xmm2 \n"
4767 "punpckhbw %%xmm4,%%xmm3 \n"
4768 "punpcklbw %%xmm4,%%xmm0 \n"
4769 "punpckhbw %%xmm4,%%xmm1 \n"
4770 "psubw %%xmm0,%%xmm2 \n"
4771 "psubw %%xmm1,%%xmm3 \n"
4772 "paddw %%xmm2,%%xmm2 \n"
4773 "paddw %%xmm3,%%xmm3 \n"
4774 "pmulhw %%xmm5,%%xmm2 \n"
4775 "pmulhw %%xmm5,%%xmm3 \n"
4776 "paddw %%xmm2,%%xmm0 \n"
4777 "paddw %%xmm3,%%xmm1 \n"
4778 "packuswb %%xmm1,%%xmm0 \n"
4779 "sub $0x4,%2 \n"
4780 "movdqa %%xmm0,(%1,%0,1) \n"
4781 "lea 0x10(%1),%1 \n"
4782 "jg 1b \n"
4783 "jmp 99f \n"
4784
4785 // Blend 25 / 75.
4786 ".p2align 4 \n"
4787 "25: \n"
4788 "movdqa (%1),%%xmm0 \n"
4789 "movdqa (%1,%4,1),%%xmm1 \n"
4790 "pavgb %%xmm1,%%xmm0 \n"
4791 "pavgb %%xmm1,%%xmm0 \n"
4792 "sub $0x4,%2 \n"
4793 "movdqa %%xmm0,(%1,%0,1) \n"
4794 "lea 0x10(%1),%1 \n"
4795 "jg 25b \n"
4796 "jmp 99f \n"
4797
4798 // Blend 50 / 50.
4799 ".p2align 4 \n"
4800 "50: \n"
4801 "movdqa (%1),%%xmm0 \n"
4802 "movdqa (%1,%4,1),%%xmm1 \n"
4803 "pavgb %%xmm1,%%xmm0 \n"
4804 "sub $0x4,%2 \n"
4805 "movdqa %%xmm0,(%1,%0,1) \n"
4806 "lea 0x10(%1),%1 \n"
4807 "jg 50b \n"
4808 "jmp 99f \n"
4809
4810 // Blend 75 / 25.
4811 ".p2align 4 \n"
4812 "75: \n"
4813 "movdqa (%1),%%xmm1 \n"
4814 "movdqa (%1,%4,1),%%xmm0 \n"
4815 "pavgb %%xmm1,%%xmm0 \n"
4816 "pavgb %%xmm1,%%xmm0 \n"
4817 "sub $0x4,%2 \n"
4818 "movdqa %%xmm0,(%1,%0,1) \n"
4819 "lea 0x10(%1),%1 \n"
4820 "jg 75b \n"
4821 "jmp 99f \n"
4822
4823 // Blend 100 / 0 - Copy row unchanged.
4824 ".p2align 4 \n"
4825 "100: \n"
4826 "movdqa (%1),%%xmm0 \n"
4827 "sub $0x4,%2 \n"
4828 "movdqa %%xmm0,(%1,%0,1) \n"
4829 "lea 0x10(%1),%1 \n"
4830 "jg 100b \n"
4831
4832 "99: \n"
4833 : "+r"(dst_argb), // %0
4834 "+r"(src_argb), // %1
4835 "+r"(dst_width), // %2
4836 "+r"(source_y_fraction) // %3
4837 : "r"(static_cast<intptr_t>(src_stride)) // %4
4838 : "memory", "cc"
4839#if defined(__SSE2__)
4840 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4841#endif
4842 );
4843}
4844
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004845void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4846 uint8* dst_uv, int pix) {
4847 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00004848 "sub %0,%1 \n"
4849 ".p2align 4 \n"
4850 "1: \n"
4851 "movdqa (%0),%%xmm0 \n"
4852 "pavgb (%0,%3),%%xmm0 \n"
4853 "sub $0x10,%2 \n"
4854 "movdqa %%xmm0,(%0,%1) \n"
4855 "lea 0x10(%0),%0 \n"
4856 "jg 1b \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004857 : "+r"(src_uv), // %0
4858 "+r"(dst_uv), // %1
4859 "+r"(pix) // %2
4860 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4861 : "memory", "cc"
4862#if defined(__SSE2__)
4863 , "xmm0"
4864#endif
4865 );
4866}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004867
4868void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4869 uint32 selector, int pix) {
4870 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00004871 "movd %3,%%xmm5 \n"
4872 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004873 ".p2align 4 \n"
4874 "1: \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00004875 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004876 "movdqa 0x10(%0),%%xmm1 \n"
4877 "lea 0x20(%0),%0 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00004878 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004879 "pshufb %%xmm5,%%xmm1 \n"
fbarchard@google.coma3be4702013-03-22 05:20:02 +00004880 "punpckldq %%xmm1,%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004881 "sub $0x8,%2 \n"
4882 "movq %%xmm0,(%1) \n"
4883 "lea 0x8(%1),%1 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00004884 "jg 1b \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004885 : "+r"(src_argb), // %0
4886 "+r"(dst_bayer), // %1
4887 "+r"(pix) // %2
4888 : "g"(selector) // %3
4889 : "memory", "cc"
4890#if defined(__SSE2__)
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004891 , "xmm0", "xmm1", "xmm5"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004892#endif
4893 );
4894}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004895
fbarchard@google.com10965432013-03-08 23:22:32 +00004896// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4897void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4898 const uint8* shuffler, int pix) {
4899 asm volatile (
4900 "movdqa (%3),%%xmm5 \n"
4901 ".p2align 4 \n"
4902 "1: \n"
4903 "movdqa (%0),%%xmm0 \n"
4904 "movdqa 0x10(%0),%%xmm1 \n"
4905 "lea 0x20(%0),%0 \n"
4906 "pshufb %%xmm5,%%xmm0 \n"
4907 "pshufb %%xmm5,%%xmm1 \n"
4908 "sub $0x8,%2 \n"
4909 "movdqa %%xmm0,(%1) \n"
4910 "movdqa %%xmm1,0x10(%1) \n"
4911 "lea 0x20(%1),%1 \n"
4912 "jg 1b \n"
4913 : "+r"(src_argb), // %0
4914 "+r"(dst_argb), // %1
4915 "+r"(pix) // %2
4916 : "r"(shuffler) // %3
4917 : "memory", "cc"
4918#if defined(__SSE2__)
4919 , "xmm0", "xmm1", "xmm5"
4920#endif
4921 );
4922}
4923
4924void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
4925 const uint8* shuffler, int pix) {
4926 asm volatile (
4927 "movdqa (%3),%%xmm5 \n"
4928 ".p2align 4 \n"
4929 "1: \n"
4930 "movdqu (%0),%%xmm0 \n"
4931 "movdqu 0x10(%0),%%xmm1 \n"
4932 "lea 0x20(%0),%0 \n"
4933 "pshufb %%xmm5,%%xmm0 \n"
4934 "pshufb %%xmm5,%%xmm1 \n"
4935 "sub $0x8,%2 \n"
4936 "movdqu %%xmm0,(%1) \n"
4937 "movdqu %%xmm1,0x10(%1) \n"
4938 "lea 0x20(%1),%1 \n"
4939 "jg 1b \n"
4940 : "+r"(src_argb), // %0
4941 "+r"(dst_argb), // %1
4942 "+r"(pix) // %2
4943 : "r"(shuffler) // %3
4944 : "memory", "cc"
4945#if defined(__SSE2__)
4946 , "xmm0", "xmm1", "xmm5"
4947#endif
4948 );
4949}
4950
fbarchard@google.com9de88672012-10-12 06:23:33 +00004951void I422ToYUY2Row_SSE2(const uint8* src_y,
4952 const uint8* src_u,
4953 const uint8* src_v,
4954 uint8* dst_frame, int width) {
4955 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00004956 "sub %1,%2 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00004957 ".p2align 4 \n"
4958 "1: \n"
4959 "movq (%1),%%xmm2 \n"
4960 "movq (%1,%2,1),%%xmm3 \n"
4961 "lea 0x8(%1),%1 \n"
4962 "punpcklbw %%xmm3,%%xmm2 \n"
4963 "movdqa (%0),%%xmm0 \n"
4964 "lea 0x10(%0),%0 \n"
4965 "movdqa %%xmm0,%%xmm1 \n"
4966 "punpcklbw %%xmm2,%%xmm0 \n"
4967 "punpckhbw %%xmm2,%%xmm1 \n"
4968 "movdqa %%xmm0,(%3) \n"
4969 "movdqa %%xmm1,0x10(%3) \n"
4970 "lea 0x20(%3),%3 \n"
4971 "sub $0x10,%4 \n"
4972 "jg 1b \n"
4973 : "+r"(src_y), // %0
4974 "+r"(src_u), // %1
4975 "+r"(src_v), // %2
4976 "+r"(dst_frame), // %3
4977 "+rm"(width) // %4
4978 :
4979 : "memory", "cc"
4980#if defined(__SSE2__)
4981 , "xmm0", "xmm1", "xmm2", "xmm3"
4982#endif
4983 );
4984}
4985
4986void I422ToUYVYRow_SSE2(const uint8* src_y,
4987 const uint8* src_u,
4988 const uint8* src_v,
4989 uint8* dst_frame, int width) {
4990 asm volatile (
4991 "sub %1,%2 \n"
4992 ".p2align 4 \n"
4993 "1: \n"
4994 "movq (%1),%%xmm2 \n"
4995 "movq (%1,%2,1),%%xmm3 \n"
4996 "lea 0x8(%1),%1 \n"
4997 "punpcklbw %%xmm3,%%xmm2 \n"
4998 "movdqa (%0),%%xmm0 \n"
4999 "movdqa %%xmm2,%%xmm1 \n"
5000 "lea 0x10(%0),%0 \n"
5001 "punpcklbw %%xmm0,%%xmm1 \n"
5002 "punpckhbw %%xmm0,%%xmm2 \n"
5003 "movdqa %%xmm1,(%3) \n"
5004 "movdqa %%xmm2,0x10(%3) \n"
5005 "lea 0x20(%3),%3 \n"
5006 "sub $0x10,%4 \n"
5007 "jg 1b \n"
5008 : "+r"(src_y), // %0
5009 "+r"(src_u), // %1
5010 "+r"(src_v), // %2
5011 "+r"(dst_frame), // %3
5012 "+rm"(width) // %4
5013 :
5014 : "memory", "cc"
5015#if defined(__SSE2__)
5016 , "xmm0", "xmm1", "xmm2", "xmm3"
5017#endif
5018 );
5019}
5020
fbarchard@google.com2d11d432012-02-16 02:50:39 +00005021#endif // defined(__x86_64__) || defined(__i386__)
5022
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00005023#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00005024} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00005025} // namespace libyuv
5026#endif