blob: 83a440787364049ba945b7df45080871de3571c8 [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
fbarchard@google.comb0c97972012-08-08 19:04:24 +00002 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
fbarchard@google.comcde58702013-01-28 00:02:35 +00007 * in the file PATENTS. All contributing project authors may
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00008 * be found in the AUTHORS file in the root of the source tree.
9 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.com83a63e62013-02-27 00:20:29 +000021#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
fbarchard@google.com714050a2012-02-17 22:59:56 +000023// GCC 4.2 on OSX has a link error when static or const data is passed to
// inline (presumably inline assembly operands). On Apple builds CONST
// therefore expands to nothing, which leaves the tables declared with it as
// plain non-const globals; on every other target it expands to `static const`.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000025#ifdef __APPLE__
26#define CONST
27#else
28#define CONST static const
29#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
// Every vec8 table in this group repeats one 4-entry weight group four times,
// giving one signed 8-bit weight per byte of a 16-byte vector. The ARGB tables
// are loaded into xmm registers and used as the pmaddubsw operand by the
// ARGBTo*Row functions in this file.
// NOTE(review): the 0 entry presumably lines up with the alpha byte of each
// pixel layout - confirm against the pixel format definitions.
// Luma weights used by ARGBToYRow_*; the weighted sum is shifted right by 7.
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +000038// JPeg full range luma weights used by ARGBToYJRow_*. The three non-zero
// weights add up to 128, matching the shift right by 7 applied to the sum.
39CONST vec8 kARGBToYJ = {
fbarchard@google.com050b39a2013-04-01 20:07:14 +000040 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +000041};
42
// U weights used by ARGBToUVRow_SSSE3 (sum is shifted right by 8 there).
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000043CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000044 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
45};
46
// U weights loaded by ARGBToUVJRow_SSSE3 (the "J" variant).
fbarchard@google.com050b39a2013-04-01 20:07:14 +000047CONST vec8 kARGBToUJ = {
48 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
49};
50
// V weights used by ARGBToUVRow_SSSE3 (sum is shifted right by 8 there).
fbarchard@google.com714050a2012-02-17 22:59:56 +000051CONST vec8 kARGBToV = {
52 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
53};
54
// V weights loaded by ARGBToUVJRow_SSSE3 (the "J" variant).
fbarchard@google.com050b39a2013-04-01 20:07:14 +000055CONST vec8 kARGBToVJ = {
56 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
57};
58
fbarchard@google.com714050a2012-02-17 22:59:56 +000059// Constants for BGRA
// Same weight values as the ARGB tables with the order within each 4-entry
// group reversed. Their users are outside this excerpt.
60CONST vec8 kBGRAToY = {
61 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
62};
63
64CONST vec8 kBGRAToU = {
65 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
66};
67
68CONST vec8 kBGRAToV = {
69 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
70};
71
72// Constants for ABGR
// Same weight values as the ARGB tables with the first and third entries of
// each group swapped. Their users are outside this excerpt.
73CONST vec8 kABGRToY = {
74 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
75};
76
77CONST vec8 kABGRToU = {
78 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
79};
80
81CONST vec8 kABGRToV = {
82 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
83};
84
fbarchard@google.com4de0c432012-10-11 01:25:46 +000085// Constants for RGBA.
// Same weight values as the ARGB tables with the 0 entry moved to the front
// of each group. Their users are outside this excerpt.
86CONST vec8 kRGBAToY = {
87 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
88};
89
90CONST vec8 kRGBAToU = {
91 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
92};
93
94CONST vec8 kRGBAToV = {
95 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
96};
97
// Byte bias of 16 that ARGBToYRow_* adds (paddb) to every packed Y byte.
fbarchard@google.com714050a2012-02-17 22:59:56 +000098CONST uvec8 kAddY16 = {
99 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +0000100};
fbarchard@google.com2430e042011-11-11 21:57:06 +0000101
// 16-bit value of 64 that ARGBToYJRow_* adds (paddw) to each sum before the
// shift right by 7; 64 is half of 128, so the shift rounds to nearest.
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000102CONST vec16 kAddYJ64 = {
103 64, 64, 64, 64, 64, 64, 64, 64
104};
105
// Byte bias of 128 that ARGBToUVRow_SSSE3 adds (paddb) to the packed U and V
// bytes.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000106CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000107 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
108 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
109};
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000110
// 16-bit constant 0x8080 loaded into xmm5 by ARGBToUVJRow_SSSE3.
// NOTE(review): presumably a combined 128 bias plus rounding term - the
// instruction that consumes it is outside this excerpt, confirm there.
fbarchard@google.com050b39a2013-04-01 20:07:14 +0000111CONST uvec16 kAddUVJ128 = {
112 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
113};
114
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000115// Shuffle table for converting RGB24 to ARGB.
// pshufb control bytes: output byte i is taken from source byte table[i].
// Entries 12u..15u only fill the fourth byte of each output pixel; that byte
// is forced to 0xff afterwards by RGB24ToARGBRow_SSSE3.
116CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000117 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
118};
119
120// Shuffle table for converting RAW to ARGB.
// Same layout as kShuffleMaskRGB24ToARGB but the three colour bytes of every
// pixel are picked in reverse order.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000121CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000122 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
123};
124
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000125// Shuffle table for converting ARGB to RGB24.
// Drops every fourth source byte, packing 12 bytes into the low end of the
// register. Control value 128u has the high bit set, which makes pshufb write
// a zero byte, so the top four output bytes are cleared.
126CONST uvec8 kShuffleMaskARGBToRGB24 = {
127 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
128};
129
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000130// Shuffle table for converting ARGB to RAW.
// As kShuffleMaskARGBToRGB24 but with the three kept bytes of each pixel in
// reverse order.
131CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000132 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000133};
134
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000135// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
// The 12 packed bytes are split: 8 land in output bytes 0-7, bytes 8-11 are
// zeroed (128u) and the remaining 4 land in output bytes 12-15.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000136CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
137 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
138};
139
140// Shuffle table for converting ARGB to RAW with the same 8 + 4 byte split as
// kShuffleMaskARGBToRGB24_0 (presumably for I422ToRAW - the user is outside
// this excerpt, confirm there).
141CONST uvec8 kShuffleMaskARGBToRAW_0 = {
142 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
143};
144
// Expands a row of 8-bit Y samples into 32-bit pixels: each Y byte is copied
// into the three low bytes of a dword and the top byte is forced to 0xff.
//   src_y    - source bytes, read 8 at a time with movq (no alignment needed).
//   dst_argb - destination, written with movdqa, so it must be 16-byte aligned.
//   pix      - pixel count. Eight pixels are produced per iteration; the loop
//              body runs at least once and repeats while the decremented count
//              is still positive, so the count is effectively rounded up to a
//              multiple of 8.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000145void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000146 asm volatile (
// xmm5 = 0xff000000 in every dword (mask for the byte forced to 0xff).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000147 "pcmpeqb %%xmm5,%%xmm5 \n"
148 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000149 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000150 "1: \n"
151 "movq (%0),%%xmm0 \n"
152 "lea 0x8(%0),%0 \n"
// Duplicate each byte to a word, then each word to a dword (Y -> YYYY).
153 "punpcklbw %%xmm0,%%xmm0 \n"
154 "movdqa %%xmm0,%%xmm1 \n"
155 "punpcklwd %%xmm0,%%xmm0 \n"
156 "punpckhwd %%xmm1,%%xmm1 \n"
157 "por %%xmm5,%%xmm0 \n"
158 "por %%xmm5,%%xmm1 \n"
159 "movdqa %%xmm0,(%1) \n"
160 "movdqa %%xmm1,0x10(%1) \n"
161 "lea 0x20(%1),%1 \n"
162 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000163 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000164 : "+r"(src_y), // %0
165 "+r"(dst_argb), // %1
166 "+r"(pix) // %2
167 :
168 : "memory", "cc"
169#if defined(__SSE2__)
170 , "xmm0", "xmm1", "xmm5"
171#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000172 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000173}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000174
// Same conversion as I400ToARGBRow_SSE2 (each Y byte becomes a dword with the
// Y value in its three low bytes and 0xff in the top byte), but dst_argb is
// written with movdqu and therefore has no alignment requirement.
// Eight pixels are produced per iteration and the loop body always runs at
// least once, so pix is effectively rounded up to a multiple of 8.
fbarchard@google.com00b69a22012-11-02 06:03:28 +0000175void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
176 int pix) {
177 asm volatile (
// xmm5 = 0xff000000 in every dword (mask for the byte forced to 0xff).
178 "pcmpeqb %%xmm5,%%xmm5 \n"
179 "pslld $0x18,%%xmm5 \n"
180 ".p2align 4 \n"
181 "1: \n"
182 "movq (%0),%%xmm0 \n"
183 "lea 0x8(%0),%0 \n"
// Duplicate each byte to a word, then each word to a dword (Y -> YYYY).
184 "punpcklbw %%xmm0,%%xmm0 \n"
185 "movdqa %%xmm0,%%xmm1 \n"
186 "punpcklwd %%xmm0,%%xmm0 \n"
187 "punpckhwd %%xmm1,%%xmm1 \n"
188 "por %%xmm5,%%xmm0 \n"
189 "por %%xmm5,%%xmm1 \n"
190 "movdqu %%xmm0,(%1) \n"
191 "movdqu %%xmm1,0x10(%1) \n"
192 "lea 0x20(%1),%1 \n"
193 "sub $0x8,%2 \n"
194 "jg 1b \n"
195 : "+r"(src_y), // %0
196 "+r"(dst_argb), // %1
197 "+r"(pix) // %2
198 :
199 : "memory", "cc"
200#if defined(__SSE2__)
201 , "xmm0", "xmm1", "xmm5"
202#endif
203 );
204}
205
// Expands a row of 3-byte pixels to 4-byte pixels, forcing the added fourth
// byte of every pixel to 0xff.
//   src_rgb24 - source, read with movdqu (no alignment requirement); 48 bytes
//               are consumed per iteration.
//   dst_argb  - destination, written with movdqa, so it must be 16-byte
//               aligned; 64 bytes are produced per iteration.
//   pix       - pixel count. Sixteen pixels are handled per iteration and the
//               loop body always runs at least once, so the count is
//               effectively rounded up to a multiple of 16.
// kShuffleMaskRGB24ToARGB is loaded with movdqa and must be 16-byte aligned.
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000206void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000207 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000208 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
209 "pslld $0x18,%%xmm5 \n"
210 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000211 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000212 "1: \n"
213 "movdqu (%0),%%xmm0 \n"
214 "movdqu 0x10(%0),%%xmm1 \n"
215 "movdqu 0x20(%0),%%xmm3 \n"
216 "lea 0x30(%0),%0 \n"
// Each palignr slides one 12-byte group (four source pixels) down to byte 0
// of a register; pshufb then spreads it over four dwords and por sets the
// fourth byte of every dword to 0xff.
// xmm2 = source bytes 24..39 -> output pixels 8..11.
217 "movdqa %%xmm3,%%xmm2 \n"
218 "palignr $0x8,%%xmm1,%%xmm2 \n"
219 "pshufb %%xmm4,%%xmm2 \n"
220 "por %%xmm5,%%xmm2 \n"
// xmm1 = source bytes 12..27 -> output pixels 4..7.
221 "palignr $0xc,%%xmm0,%%xmm1 \n"
222 "pshufb %%xmm4,%%xmm0 \n"
223 "movdqa %%xmm2,0x20(%1) \n"
224 "por %%xmm5,%%xmm0 \n"
225 "pshufb %%xmm4,%%xmm1 \n"
226 "movdqa %%xmm0,(%1) \n"
227 "por %%xmm5,%%xmm1 \n"
// Rotate xmm3 by 4 bytes so source bytes 36..47 start at byte 0
// -> output pixels 12..15.
228 "palignr $0x4,%%xmm3,%%xmm3 \n"
229 "pshufb %%xmm4,%%xmm3 \n"
230 "movdqa %%xmm1,0x10(%1) \n"
231 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000232 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000233 "movdqa %%xmm3,0x30(%1) \n"
234 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000235 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000236 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000237 "+r"(dst_argb), // %1
238 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000239 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000240 : "memory", "cc"
241#if defined(__SSE2__)
242 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
243#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000244 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000245}
246
// Expands a row of 3-byte pixels to 4-byte pixels while reversing the order of
// the three colour bytes of each pixel (per kShuffleMaskRAWToARGB), and forces
// the added fourth byte to 0xff. The instruction sequence is identical to
// RGB24ToARGBRow_SSSE3; only the shuffle table differs.
//   src_raw  - source, read with movdqu; 48 bytes per iteration.
//   dst_argb - destination, written with movdqa (16-byte aligned); 64 bytes
//              per iteration.
//   pix      - pixel count, handled 16 at a time; the loop body always runs at
//              least once.
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000247void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000248 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000249 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
250 "pslld $0x18,%%xmm5 \n"
251 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000252 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000253 "1: \n"
254 "movdqu (%0),%%xmm0 \n"
255 "movdqu 0x10(%0),%%xmm1 \n"
256 "movdqu 0x20(%0),%%xmm3 \n"
257 "lea 0x30(%0),%0 \n"
// xmm2 = source bytes 24..39 -> output pixels 8..11.
258 "movdqa %%xmm3,%%xmm2 \n"
259 "palignr $0x8,%%xmm1,%%xmm2 \n"
260 "pshufb %%xmm4,%%xmm2 \n"
261 "por %%xmm5,%%xmm2 \n"
// xmm1 = source bytes 12..27 -> output pixels 4..7.
262 "palignr $0xc,%%xmm0,%%xmm1 \n"
263 "pshufb %%xmm4,%%xmm0 \n"
264 "movdqa %%xmm2,0x20(%1) \n"
265 "por %%xmm5,%%xmm0 \n"
266 "pshufb %%xmm4,%%xmm1 \n"
267 "movdqa %%xmm0,(%1) \n"
268 "por %%xmm5,%%xmm1 \n"
// Rotate xmm3 by 4 bytes so source bytes 36..47 start at byte 0
// -> output pixels 12..15.
269 "palignr $0x4,%%xmm3,%%xmm3 \n"
270 "pshufb %%xmm4,%%xmm3 \n"
271 "movdqa %%xmm1,0x10(%1) \n"
272 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000273 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000274 "movdqa %%xmm3,0x30(%1) \n"
275 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000276 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000277 : "+r"(src_raw), // %0
278 "+r"(dst_argb), // %1
279 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000280 : "m"(kShuffleMaskRAWToARGB) // %3
281 : "memory", "cc"
282#if defined(__SSE2__)
283 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
284#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000285 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000286}
287
// Expands a row of 16-bit 5:6:5 pixels to 32-bit pixels. The 5-bit fields
// (bits 0-4 and 11-15) and the 6-bit field (bits 5-10) are each widened to
// 8 bits and the fourth output byte is set to 0xff.
//   src - source, read with movdqu; 16 bytes (8 pixels) per iteration.
//   dst - destination, written with movdqa (16-byte aligned); 32 bytes per
//         iteration.
//   pix - pixel count, handled 8 at a time; the loop body always runs at least
//         once.
// eax is used as scratch to build the multiplier constants.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000288void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000289 asm volatile (
// xmm5 = 0x0108 per word: pmulhuw of a 5-bit value held in bits 11-15 by this
// yields (v << 3) | (v >> 2), i.e. the value widened to 8 bits.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000290 "mov $0x1080108,%%eax \n"
291 "movd %%eax,%%xmm5 \n"
292 "pshufd $0x0,%%xmm5,%%xmm5 \n"
// xmm6 = 0x2080 per word: pmulhuw of a 6-bit value held in bits 5-10 by this
// yields (v << 2) | (v >> 4).
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000293 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000294 "movd %%eax,%%xmm6 \n"
295 "pshufd $0x0,%%xmm6,%%xmm6 \n"
// xmm3 = 0xf800 (top 5-bit field), xmm4 = 0x07e0 (6-bit field),
// xmm7 = 0xff00 (0xff in the high byte of each word).
296 "pcmpeqb %%xmm3,%%xmm3 \n"
297 "psllw $0xb,%%xmm3 \n"
298 "pcmpeqb %%xmm4,%%xmm4 \n"
299 "psllw $0xa,%%xmm4 \n"
300 "psrlw $0x5,%%xmm4 \n"
301 "pcmpeqb %%xmm7,%%xmm7 \n"
302 "psllw $0x8,%%xmm7 \n"
// dst -= 2 * src, so that (%1,%0,2) addresses the destination while only the
// source pointer has to be advanced inside the loop.
303 "sub %0,%1 \n"
304 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000305 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000306 "1: \n"
307 "movdqu (%0),%%xmm0 \n"
308 "movdqa %%xmm0,%%xmm1 \n"
309 "movdqa %%xmm0,%%xmm2 \n"
// xmm1 = top field widened and moved to the high byte, xmm2 = bottom field
// widened in the low byte; OR gives output bytes 0 and 2 of each pixel.
310 "pand %%xmm3,%%xmm1 \n"
311 "psllw $0xb,%%xmm2 \n"
312 "pmulhuw %%xmm5,%%xmm1 \n"
313 "pmulhuw %%xmm5,%%xmm2 \n"
314 "psllw $0x8,%%xmm1 \n"
315 "por %%xmm2,%%xmm1 \n"
// xmm0 = middle field widened in the low byte, 0xff in the high byte; these
// are output bytes 1 and 3 of each pixel.
316 "pand %%xmm4,%%xmm0 \n"
317 "pmulhuw %%xmm6,%%xmm0 \n"
318 "por %%xmm7,%%xmm0 \n"
// Interleave the two byte pairs into 4-byte pixels.
319 "movdqa %%xmm1,%%xmm2 \n"
320 "punpcklbw %%xmm0,%%xmm1 \n"
321 "punpckhbw %%xmm0,%%xmm2 \n"
322 "movdqa %%xmm1,(%1,%0,2) \n"
323 "movdqa %%xmm2,0x10(%1,%0,2) \n"
324 "lea 0x10(%0),%0 \n"
325 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000326 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000327 : "+r"(src), // %0
328 "+r"(dst), // %1
329 "+r"(pix) // %2
330 :
331 : "memory", "cc", "eax"
332#if defined(__SSE2__)
333 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
334#endif
335 );
336}
337
// Expands a row of 16-bit 1:5:5:5 pixels to 32-bit pixels. The three 5-bit
// fields (bits 0-4, 5-9, 10-14) are each widened to 8 bits and bit 15 is
// expanded to a fourth byte of 0x00 or 0xff.
//   src - source, read with movdqu; 16 bytes (8 pixels) per iteration.
//   dst - destination, written with movdqa (16-byte aligned); 32 bytes per
//         iteration.
//   pix - pixel count, handled 8 at a time; the loop body always runs at least
//         once.
// eax is used as scratch to build the multiplier constants.
338void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000339 asm volatile (
// xmm5 = 0x0108 per word: widens a 5-bit value held in bits 11-15 via pmulhuw.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000340 "mov $0x1080108,%%eax \n"
341 "movd %%eax,%%xmm5 \n"
342 "pshufd $0x0,%%xmm5,%%xmm5 \n"
// xmm6 = 0x4200 per word: widens a 5-bit value held in bits 5-9 via pmulhuw.
343 "mov $0x42004200,%%eax \n"
344 "movd %%eax,%%xmm6 \n"
345 "pshufd $0x0,%%xmm6,%%xmm6 \n"
// xmm3 = 0xf800, xmm4 = 0x03e0 (middle 5-bit field), xmm7 = 0xff00.
346 "pcmpeqb %%xmm3,%%xmm3 \n"
347 "psllw $0xb,%%xmm3 \n"
348 "movdqa %%xmm3,%%xmm4 \n"
349 "psrlw $0x6,%%xmm4 \n"
350 "pcmpeqb %%xmm7,%%xmm7 \n"
351 "psllw $0x8,%%xmm7 \n"
// dst -= 2 * src, so that (%1,%0,2) addresses the destination while only the
// source pointer has to be advanced inside the loop.
352 "sub %0,%1 \n"
353 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000354 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000355 "1: \n"
356 "movdqu (%0),%%xmm0 \n"
357 "movdqa %%xmm0,%%xmm1 \n"
358 "movdqa %%xmm0,%%xmm2 \n"
// Shift bits 10-14 up by one so they sit in bits 11-15, and bits 0-4 up by 11;
// widen both and combine into output bytes 0 and 2 of each pixel.
359 "psllw $0x1,%%xmm1 \n"
360 "psllw $0xb,%%xmm2 \n"
361 "pand %%xmm3,%%xmm1 \n"
362 "pmulhuw %%xmm5,%%xmm2 \n"
363 "pmulhuw %%xmm5,%%xmm1 \n"
364 "psllw $0x8,%%xmm1 \n"
365 "por %%xmm2,%%xmm1 \n"
// Arithmetic shift replicates bit 15 through the high byte (0x00 or 0xff);
// the middle field is widened into the low byte. Together they form output
// bytes 1 and 3 of each pixel.
366 "movdqa %%xmm0,%%xmm2 \n"
367 "pand %%xmm4,%%xmm0 \n"
368 "psraw $0x8,%%xmm2 \n"
369 "pmulhuw %%xmm6,%%xmm0 \n"
370 "pand %%xmm7,%%xmm2 \n"
371 "por %%xmm2,%%xmm0 \n"
// Interleave the two byte pairs into 4-byte pixels.
372 "movdqa %%xmm1,%%xmm2 \n"
373 "punpcklbw %%xmm0,%%xmm1 \n"
374 "punpckhbw %%xmm0,%%xmm2 \n"
375 "movdqa %%xmm1,(%1,%0,2) \n"
376 "movdqa %%xmm2,0x10(%1,%0,2) \n"
377 "lea 0x10(%0),%0 \n"
378 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000379 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000380 : "+r"(src), // %0
381 "+r"(dst), // %1
382 "+r"(pix) // %2
383 :
384 : "memory", "cc", "eax"
385#if defined(__SSE2__)
386 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
387#endif
388 );
389}
390
// Expands a row of 16-bit 4:4:4:4 pixels to 32-bit pixels by duplicating every
// 4-bit nibble into both halves of an output byte (0xN -> 0xNN).
//   src - source, read with movdqu; 16 bytes (8 pixels) per iteration.
//   dst - destination, written with movdqa (16-byte aligned); 32 bytes per
//         iteration.
//   pix - pixel count, handled 8 at a time; the loop body always runs at least
//         once.
// eax is used as scratch to build the nibble mask.
391void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000392 asm volatile (
// xmm4 = 0x0f in every byte (low nibbles), xmm5 = 0xf0 in every byte.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000393 "mov $0xf0f0f0f,%%eax \n"
394 "movd %%eax,%%xmm4 \n"
395 "pshufd $0x0,%%xmm4,%%xmm4 \n"
396 "movdqa %%xmm4,%%xmm5 \n"
397 "pslld $0x4,%%xmm5 \n"
// dst -= 2 * src, so that (%1,%0,2) addresses the destination while only the
// source pointer has to be advanced inside the loop.
398 "sub %0,%1 \n"
399 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000400 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000401 "1: \n"
402 "movdqu (%0),%%xmm0 \n"
403 "movdqa %%xmm0,%%xmm2 \n"
404 "pand %%xmm4,%%xmm0 \n"
405 "pand %%xmm5,%%xmm2 \n"
// Low nibbles: v | (v << 4). High nibbles: v | (v >> 4). The masks applied
// above keep the word-wide shifts from leaking across byte boundaries.
406 "movdqa %%xmm0,%%xmm1 \n"
407 "movdqa %%xmm2,%%xmm3 \n"
408 "psllw $0x4,%%xmm1 \n"
409 "psrlw $0x4,%%xmm3 \n"
410 "por %%xmm1,%%xmm0 \n"
411 "por %%xmm3,%%xmm2 \n"
// Interleave: bytes from low nibbles land in even output bytes, bytes from
// high nibbles in odd output bytes.
412 "movdqa %%xmm0,%%xmm1 \n"
413 "punpcklbw %%xmm2,%%xmm0 \n"
414 "punpckhbw %%xmm2,%%xmm1 \n"
415 "movdqa %%xmm0,(%1,%0,2) \n"
416 "movdqa %%xmm1,0x10(%1,%0,2) \n"
417 "lea 0x10(%0),%0 \n"
418 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000419 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000420 : "+r"(src), // %0
421 "+r"(dst), // %1
422 "+r"(pix) // %2
423 :
424 : "memory", "cc", "eax"
425#if defined(__SSE2__)
426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
427#endif
428 );
429}
430
// Packs a row of 4-byte pixels into 3-byte pixels by dropping the fourth byte
// of each pixel (per kShuffleMaskARGBToRGB24).
//   src - source, read with movdqa (16-byte aligned); 64 bytes (16 pixels) per
//         iteration.
//   dst - destination, written with movdqa (16-byte aligned); 48 bytes per
//         iteration.
//   pix - pixel count, handled 16 at a time; the loop body always runs at
//         least once.
// The shuffle table is loaded with movdqa and must be 16-byte aligned.
431void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000432 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000433 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000434 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000435 "1: \n"
436 "movdqa (%0),%%xmm0 \n"
437 "movdqa 0x10(%0),%%xmm1 \n"
438 "movdqa 0x20(%0),%%xmm2 \n"
439 "movdqa 0x30(%0),%%xmm3 \n"
440 "lea 0x40(%0),%0 \n"
// After pshufb each register holds 12 packed bytes in its low end and four
// zero bytes on top.
441 "pshufb %%xmm6,%%xmm0 \n"
442 "pshufb %%xmm6,%%xmm1 \n"
443 "pshufb %%xmm6,%%xmm2 \n"
444 "pshufb %%xmm6,%%xmm3 \n"
// Stitch the four 12-byte groups into three full 16-byte vectors:
//   out0 = xmm0 | (xmm1 << 12 bytes)
//   out1 = (xmm1 >> 4 bytes) | (xmm2 << 8 bytes)
//   out2 = (xmm2 >> 8 bytes) | (xmm3 << 4 bytes)
445 "movdqa %%xmm1,%%xmm4 \n"
446 "psrldq $0x4,%%xmm1 \n"
447 "pslldq $0xc,%%xmm4 \n"
448 "movdqa %%xmm2,%%xmm5 \n"
449 "por %%xmm4,%%xmm0 \n"
450 "pslldq $0x8,%%xmm5 \n"
451 "movdqa %%xmm0,(%1) \n"
452 "por %%xmm5,%%xmm1 \n"
453 "psrldq $0x8,%%xmm2 \n"
454 "pslldq $0x4,%%xmm3 \n"
455 "por %%xmm3,%%xmm2 \n"
456 "movdqa %%xmm1,0x10(%1) \n"
457 "movdqa %%xmm2,0x20(%1) \n"
458 "lea 0x30(%1),%1 \n"
459 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000460 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000461 : "+r"(src), // %0
462 "+r"(dst), // %1
463 "+r"(pix) // %2
464 : "m"(kShuffleMaskARGBToRGB24) // %3
465 : "memory", "cc"
466#if defined(__SSE2__)
467 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
468#endif
469 );
470}
471
// Packs a row of 4-byte pixels into 3-byte pixels, dropping the fourth byte
// and reversing the order of the three kept bytes (per kShuffleMaskARGBToRAW).
// The instruction sequence is identical to ARGBToRGB24Row_SSSE3; only the
// shuffle table differs.
//   src - source, read with movdqa (16-byte aligned); 64 bytes (16 pixels) per
//         iteration.
//   dst - destination, written with movdqa (16-byte aligned); 48 bytes per
//         iteration.
//   pix - pixel count, handled 16 at a time; the loop body always runs at
//         least once.
472void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000473 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000474 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000475 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000476 "1: \n"
477 "movdqa (%0),%%xmm0 \n"
478 "movdqa 0x10(%0),%%xmm1 \n"
479 "movdqa 0x20(%0),%%xmm2 \n"
480 "movdqa 0x30(%0),%%xmm3 \n"
481 "lea 0x40(%0),%0 \n"
// After pshufb each register holds 12 packed bytes in its low end and four
// zero bytes on top.
482 "pshufb %%xmm6,%%xmm0 \n"
483 "pshufb %%xmm6,%%xmm1 \n"
484 "pshufb %%xmm6,%%xmm2 \n"
485 "pshufb %%xmm6,%%xmm3 \n"
// Stitch the four 12-byte groups into three full 16-byte vectors:
//   out0 = xmm0 | (xmm1 << 12 bytes)
//   out1 = (xmm1 >> 4 bytes) | (xmm2 << 8 bytes)
//   out2 = (xmm2 >> 8 bytes) | (xmm3 << 4 bytes)
486 "movdqa %%xmm1,%%xmm4 \n"
487 "psrldq $0x4,%%xmm1 \n"
488 "pslldq $0xc,%%xmm4 \n"
489 "movdqa %%xmm2,%%xmm5 \n"
490 "por %%xmm4,%%xmm0 \n"
491 "pslldq $0x8,%%xmm5 \n"
492 "movdqa %%xmm0,(%1) \n"
493 "por %%xmm5,%%xmm1 \n"
494 "psrldq $0x8,%%xmm2 \n"
495 "pslldq $0x4,%%xmm3 \n"
496 "por %%xmm3,%%xmm2 \n"
497 "movdqa %%xmm1,0x10(%1) \n"
498 "movdqa %%xmm2,0x20(%1) \n"
499 "lea 0x30(%1),%1 \n"
500 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000501 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000502 : "+r"(src), // %0
503 "+r"(dst), // %1
504 "+r"(pix) // %2
505 : "m"(kShuffleMaskARGBToRAW) // %3
506 : "memory", "cc"
507#if defined(__SSE2__)
508 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
509#endif
510 );
511}
512
// Packs a row of 32-bit pixels into 16-bit 5:6:5 pixels. From each dword it
// keeps the top 5 bits of byte 0 (output bits 0-4), the top 6 bits of byte 1
// (output bits 5-10) and the top 5 bits of byte 2 (output bits 11-15); byte 3
// is discarded.
//   src - source, read with movdqa (16-byte aligned); 16 bytes (4 pixels) per
//         iteration.
//   dst - destination, written with movq; 8 bytes per iteration.
//   pix - pixel count, handled 4 at a time; the loop body always runs at least
//         once.
fbarchard@google.comdbcabea2012-10-29 21:20:25 +0000513void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000514 asm volatile (
// Per-dword masks: xmm3 = 0x0000001f, xmm4 = 0x000007e0, xmm5 = 0xfffff800.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000515 "pcmpeqb %%xmm3,%%xmm3 \n"
516 "psrld $0x1b,%%xmm3 \n"
517 "pcmpeqb %%xmm4,%%xmm4 \n"
518 "psrld $0x1a,%%xmm4 \n"
519 "pslld $0x5,%%xmm4 \n"
520 "pcmpeqb %%xmm5,%%xmm5 \n"
521 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000522 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000523 "1: \n"
524 "movdqa (%0),%%xmm0 \n"
525 "movdqa %%xmm0,%%xmm1 \n"
526 "movdqa %%xmm0,%%xmm2 \n"
// xmm0: shift byte 3 out, then arithmetic-shift byte 2 down to bits 8-15.
// The sign extension keeps the upper half consistent with bit 15, so the
// signed saturation in packssdw below leaves the low 16 bits untouched.
527 "pslld $0x8,%%xmm0 \n"
528 "psrld $0x3,%%xmm1 \n"
529 "psrld $0x5,%%xmm2 \n"
530 "psrad $0x10,%%xmm0 \n"
531 "pand %%xmm3,%%xmm1 \n"
532 "pand %%xmm4,%%xmm2 \n"
533 "pand %%xmm5,%%xmm0 \n"
534 "por %%xmm2,%%xmm1 \n"
535 "por %%xmm1,%%xmm0 \n"
536 "packssdw %%xmm0,%%xmm0 \n"
537 "lea 0x10(%0),%0 \n"
538 "movq %%xmm0,(%1) \n"
539 "lea 0x8(%1),%1 \n"
540 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000541 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000542 : "+r"(src), // %0
543 "+r"(dst), // %1
544 "+r"(pix) // %2
545 :
546 : "memory", "cc"
547#if defined(__SSE2__)
548 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
549#endif
550 );
551}
552
// Packs a row of 32-bit pixels into 16-bit 1:5:5:5 pixels. From each dword it
// keeps the top 5 bits of bytes 0, 1 and 2 (output bits 0-4, 5-9, 10-14) and
// the top bit of byte 3 (output bit 15).
//   src - source, read with movdqa (16-byte aligned); 16 bytes (4 pixels) per
//         iteration.
//   dst - destination, written with movq; 8 bytes per iteration.
//   pix - pixel count, handled 4 at a time; the loop body always runs at least
//         once.
553void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000554 asm volatile (
// Per-dword masks: xmm4 = 0x0000001f, xmm5 = 0x000003e0, xmm6 = 0x00007c00,
// xmm7 = 0xffff8000.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000555 "pcmpeqb %%xmm4,%%xmm4 \n"
556 "psrld $0x1b,%%xmm4 \n"
557 "movdqa %%xmm4,%%xmm5 \n"
558 "pslld $0x5,%%xmm5 \n"
559 "movdqa %%xmm4,%%xmm6 \n"
560 "pslld $0xa,%%xmm6 \n"
561 "pcmpeqb %%xmm7,%%xmm7 \n"
562 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000563 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000564 "1: \n"
565 "movdqa (%0),%%xmm0 \n"
566 "movdqa %%xmm0,%%xmm1 \n"
567 "movdqa %%xmm0,%%xmm2 \n"
568 "movdqa %%xmm0,%%xmm3 \n"
// xmm0: arithmetic shift brings bit 31 down to bit 15 and sign-extends it, so
// the signed saturation in packssdw below leaves the low 16 bits untouched.
569 "psrad $0x10,%%xmm0 \n"
570 "psrld $0x3,%%xmm1 \n"
571 "psrld $0x6,%%xmm2 \n"
572 "psrld $0x9,%%xmm3 \n"
573 "pand %%xmm7,%%xmm0 \n"
574 "pand %%xmm4,%%xmm1 \n"
575 "pand %%xmm5,%%xmm2 \n"
576 "pand %%xmm6,%%xmm3 \n"
577 "por %%xmm1,%%xmm0 \n"
578 "por %%xmm3,%%xmm2 \n"
579 "por %%xmm2,%%xmm0 \n"
580 "packssdw %%xmm0,%%xmm0 \n"
581 "lea 0x10(%0),%0 \n"
582 "movq %%xmm0,(%1) \n"
583 "lea 0x8(%1),%1 \n"
584 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000585 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000586 : "+r"(src), // %0
587 "+r"(dst), // %1
588 "+r"(pix) // %2
589 :
590 : "memory", "cc"
591#if defined(__SSE2__)
592 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
593#endif
594 );
595}
596
// Packs a row of 32-bit pixels into 16-bit 4:4:4:4 pixels by keeping the high
// nibble of every source byte: each pair of source bytes becomes one output
// byte with the even byte's nibble in bits 0-3 and the odd byte's in bits 4-7.
//   src - source, read with movdqa (16-byte aligned); 16 bytes (4 pixels) per
//         iteration.
//   dst - destination, written with movq; 8 bytes per iteration.
//   pix - pixel count, handled 4 at a time; the loop body always runs at least
//         once.
// Note: the clobber list names xmm2 although the body never touches it; this
// is harmless (it only over-reports).
597void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000598 asm volatile (
// Per-word masks: xmm4 = 0xf000 (high nibble of the odd byte),
// xmm3 = 0x00f0 (high nibble of the even byte).
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000599 "pcmpeqb %%xmm4,%%xmm4 \n"
600 "psllw $0xc,%%xmm4 \n"
601 "movdqa %%xmm4,%%xmm3 \n"
602 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000603 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000604 "1: \n"
605 "movdqa (%0),%%xmm0 \n"
606 "movdqa %%xmm0,%%xmm1 \n"
607 "pand %%xmm3,%%xmm0 \n"
608 "pand %%xmm4,%%xmm1 \n"
// Move both nibbles into the low byte of each word (0x00f0 -> 0x000f,
// 0xf000 -> 0x00f0); the masks above keep the 64-bit shifts from leaking
// between words. packuswb then narrows each word to one byte.
609 "psrlq $0x4,%%xmm0 \n"
610 "psrlq $0x8,%%xmm1 \n"
611 "por %%xmm1,%%xmm0 \n"
612 "packuswb %%xmm0,%%xmm0 \n"
613 "lea 0x10(%0),%0 \n"
614 "movq %%xmm0,(%1) \n"
615 "lea 0x8(%1),%1 \n"
616 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000617 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000618 : "+r"(src), // %0
619 "+r"(dst), // %1
620 "+r"(pix) // %2
621 :
622 : "memory", "cc"
623#if defined(__SSE2__)
624 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
625#endif
626 );
627}
628
// Computes one Y byte per 32-bit pixel as
//   ((weighted sum of the pixel's four bytes using kARGBToY) >> 7) + 16.
//   src_argb - source, read with movdqa (16-byte aligned); 64 bytes
//              (16 pixels) per iteration.
//   dst_y    - destination, written with movdqa (16-byte aligned); 16 bytes
//              per iteration.
//   pix      - pixel count, handled 16 at a time; the loop body always runs at
//              least once.
// kARGBToY and kAddY16 are loaded with movdqa and must be 16-byte aligned.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000629void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000630 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000631 "movdqa %4,%%xmm5 \n"
632 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000633 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000634 "1: \n"
635 "movdqa (%0),%%xmm0 \n"
636 "movdqa 0x10(%0),%%xmm1 \n"
637 "movdqa 0x20(%0),%%xmm2 \n"
638 "movdqa 0x30(%0),%%xmm3 \n"
// pmaddubsw: unsigned pixel byte * signed weight, adjacent pairs summed, so
// every pixel yields two 16-bit partial sums.
639 "pmaddubsw %%xmm4,%%xmm0 \n"
640 "pmaddubsw %%xmm4,%%xmm1 \n"
641 "pmaddubsw %%xmm4,%%xmm2 \n"
642 "pmaddubsw %%xmm4,%%xmm3 \n"
643 "lea 0x40(%0),%0 \n"
// phaddw adds the two partial sums of each pixel: one 16-bit sum per pixel.
644 "phaddw %%xmm1,%%xmm0 \n"
645 "phaddw %%xmm3,%%xmm2 \n"
646 "psrlw $0x7,%%xmm0 \n"
647 "psrlw $0x7,%%xmm2 \n"
648 "packuswb %%xmm2,%%xmm0 \n"
649 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000650 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000651 "movdqa %%xmm0,(%1) \n"
652 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000653 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000654 : "+r"(src_argb), // %0
655 "+r"(dst_y), // %1
656 "+r"(pix) // %2
657 : "m"(kARGBToY), // %3
658 "m"(kAddY16) // %4
659 : "memory", "cc"
660#if defined(__SSE2__)
661 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
662#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000663 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000664}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000665
// "J" variant of ARGBToYRow_SSSE3: computes one Y byte per 32-bit pixel as
//   ((weighted sum of the pixel's four bytes using kARGBToYJ) + 64) >> 7.
// Unlike ARGBToYRow_SSSE3 it rounds (the +64) and adds no offset afterwards.
//   src_argb - source, read with movdqa (16-byte aligned); 64 bytes
//              (16 pixels) per iteration.
//   dst_y    - destination, written with movdqa (16-byte aligned); 16 bytes
//              per iteration.
//   pix      - pixel count, handled 16 at a time; the loop body always runs at
//              least once.
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000666void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
667 asm volatile (
668 "movdqa %3,%%xmm4 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000669 "movdqa %4,%%xmm5 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000670 ".p2align 4 \n"
671 "1: \n"
672 "movdqa (%0),%%xmm0 \n"
673 "movdqa 0x10(%0),%%xmm1 \n"
674 "movdqa 0x20(%0),%%xmm2 \n"
675 "movdqa 0x30(%0),%%xmm3 \n"
// pmaddubsw + phaddw: weighted sum of the four bytes of each pixel.
676 "pmaddubsw %%xmm4,%%xmm0 \n"
677 "pmaddubsw %%xmm4,%%xmm1 \n"
678 "pmaddubsw %%xmm4,%%xmm2 \n"
679 "pmaddubsw %%xmm4,%%xmm3 \n"
680 "lea 0x40(%0),%0 \n"
681 "phaddw %%xmm1,%%xmm0 \n"
682 "phaddw %%xmm3,%%xmm2 \n"
// Add 64 (half of 128) so the shift right by 7 rounds to nearest.
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000683 "paddw %%xmm5,%%xmm0 \n"
684 "paddw %%xmm5,%%xmm2 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000685 "psrlw $0x7,%%xmm0 \n"
686 "psrlw $0x7,%%xmm2 \n"
687 "packuswb %%xmm2,%%xmm0 \n"
688 "sub $0x10,%2 \n"
689 "movdqa %%xmm0,(%1) \n"
690 "lea 0x10(%1),%1 \n"
691 "jg 1b \n"
692 : "+r"(src_argb), // %0
693 "+r"(dst_y), // %1
694 "+r"(pix) // %2
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000695 : "m"(kARGBToYJ), // %3
696 "m"(kAddYJ64) // %4
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000697 : "memory", "cc"
698#if defined(__SSE2__)
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000699 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000700#endif
701 );
702}
703
// Same computation as ARGBToYRow_SSSE3 -
//   ((weighted sum of the pixel's four bytes using kARGBToY) >> 7) + 16 -
// but src_argb is read and dst_y is written with movdqu, so neither pointer
// needs to be aligned. The two constant tables are still loaded with movdqa.
// Sixteen pixels are handled per iteration and the loop body always runs at
// least once.
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000704void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000705 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000706 "movdqa %4,%%xmm5 \n"
707 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000708 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000709 "1: \n"
710 "movdqu (%0),%%xmm0 \n"
711 "movdqu 0x10(%0),%%xmm1 \n"
712 "movdqu 0x20(%0),%%xmm2 \n"
713 "movdqu 0x30(%0),%%xmm3 \n"
// pmaddubsw + phaddw: weighted sum of the four bytes of each pixel.
714 "pmaddubsw %%xmm4,%%xmm0 \n"
715 "pmaddubsw %%xmm4,%%xmm1 \n"
716 "pmaddubsw %%xmm4,%%xmm2 \n"
717 "pmaddubsw %%xmm4,%%xmm3 \n"
718 "lea 0x40(%0),%0 \n"
719 "phaddw %%xmm1,%%xmm0 \n"
720 "phaddw %%xmm3,%%xmm2 \n"
721 "psrlw $0x7,%%xmm0 \n"
722 "psrlw $0x7,%%xmm2 \n"
723 "packuswb %%xmm2,%%xmm0 \n"
724 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000725 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000726 "movdqu %%xmm0,(%1) \n"
727 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000728 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000729 : "+r"(src_argb), // %0
730 "+r"(dst_y), // %1
731 "+r"(pix) // %2
732 : "m"(kARGBToY), // %3
733 "m"(kAddY16) // %4
734 : "memory", "cc"
735#if defined(__SSE2__)
736 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
737#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000738 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000739}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000740
// Same computation as ARGBToYJRow_SSSE3 -
//   ((weighted sum of the pixel's four bytes using kARGBToYJ) + 64) >> 7 -
// but src_argb is read and dst_y is written with movdqu, so neither pointer
// needs to be aligned. The two constant tables are still loaded with movdqa.
// Sixteen pixels are handled per iteration and the loop body always runs at
// least once.
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000741void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
742 asm volatile (
743 "movdqa %3,%%xmm4 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000744 "movdqa %4,%%xmm5 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000745 ".p2align 4 \n"
746 "1: \n"
747 "movdqu (%0),%%xmm0 \n"
748 "movdqu 0x10(%0),%%xmm1 \n"
749 "movdqu 0x20(%0),%%xmm2 \n"
750 "movdqu 0x30(%0),%%xmm3 \n"
// pmaddubsw + phaddw: weighted sum of the four bytes of each pixel.
751 "pmaddubsw %%xmm4,%%xmm0 \n"
752 "pmaddubsw %%xmm4,%%xmm1 \n"
753 "pmaddubsw %%xmm4,%%xmm2 \n"
754 "pmaddubsw %%xmm4,%%xmm3 \n"
755 "lea 0x40(%0),%0 \n"
756 "phaddw %%xmm1,%%xmm0 \n"
757 "phaddw %%xmm3,%%xmm2 \n"
// Add 64 (half of 128) so the shift right by 7 rounds to nearest.
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000758 "paddw %%xmm5,%%xmm0 \n"
759 "paddw %%xmm5,%%xmm2 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000760 "psrlw $0x7,%%xmm0 \n"
761 "psrlw $0x7,%%xmm2 \n"
762 "packuswb %%xmm2,%%xmm0 \n"
763 "sub $0x10,%2 \n"
764 "movdqu %%xmm0,(%1) \n"
765 "lea 0x10(%1),%1 \n"
766 "jg 1b \n"
767 : "+r"(src_argb), // %0
768 "+r"(dst_y), // %1
769 "+r"(pix) // %2
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000770 : "m"(kARGBToYJ), // %3
771 "m"(kAddYJ64) // %4
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000772 : "memory", "cc"
773#if defined(__SSE2__)
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000774 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000775#endif
776 );
777}
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000778
fbarchard@google.com714050a2012-02-17 22:59:56 +0000779// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000780// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
781// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
782// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000783// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000784void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
785 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000786 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000787 "movdqa %0,%%xmm4 \n"
788 "movdqa %1,%%xmm3 \n"
789 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000790 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000791 : "m"(kARGBToU), // %0
792 "m"(kARGBToV), // %1
793 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000794 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000795 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000796 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000797 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000798 "1: \n"
799 "movdqa (%0),%%xmm0 \n"
800 "movdqa 0x10(%0),%%xmm1 \n"
801 "movdqa 0x20(%0),%%xmm2 \n"
802 "movdqa 0x30(%0),%%xmm6 \n"
803 "pavgb (%0,%4,1),%%xmm0 \n"
804 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
805 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
806 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
807 "lea 0x40(%0),%0 \n"
808 "movdqa %%xmm0,%%xmm7 \n"
809 "shufps $0x88,%%xmm1,%%xmm0 \n"
810 "shufps $0xdd,%%xmm1,%%xmm7 \n"
811 "pavgb %%xmm7,%%xmm0 \n"
812 "movdqa %%xmm2,%%xmm7 \n"
813 "shufps $0x88,%%xmm6,%%xmm2 \n"
814 "shufps $0xdd,%%xmm6,%%xmm7 \n"
815 "pavgb %%xmm7,%%xmm2 \n"
816 "movdqa %%xmm0,%%xmm1 \n"
817 "movdqa %%xmm2,%%xmm6 \n"
818 "pmaddubsw %%xmm4,%%xmm0 \n"
819 "pmaddubsw %%xmm4,%%xmm2 \n"
820 "pmaddubsw %%xmm3,%%xmm1 \n"
821 "pmaddubsw %%xmm3,%%xmm6 \n"
822 "phaddw %%xmm2,%%xmm0 \n"
823 "phaddw %%xmm6,%%xmm1 \n"
824 "psraw $0x8,%%xmm0 \n"
825 "psraw $0x8,%%xmm1 \n"
826 "packsswb %%xmm1,%%xmm0 \n"
827 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000828 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000829 "movlps %%xmm0,(%1) \n"
830 "movhps %%xmm0,(%1,%2,1) \n"
831 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000832 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000833 : "+r"(src_argb0), // %0
834 "+r"(dst_u), // %1
835 "+r"(dst_v), // %2
836 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000837 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000838 : "memory", "cc"
839#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000840 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000841#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000842 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000843}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000844
fbarchard@google.com050b39a2013-04-01 20:07:14 +0000845// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
846void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
847 uint8* dst_u, uint8* dst_v, int width) {
848 asm volatile (
849 "movdqa %0,%%xmm4 \n"
850 "movdqa %1,%%xmm3 \n"
851 "movdqa %2,%%xmm5 \n"
852 :
853 : "m"(kARGBToUJ), // %0
854 "m"(kARGBToVJ), // %1
855 "m"(kAddUVJ128) // %2
856 );
857 asm volatile (
858 "sub %1,%2 \n"
859 ".p2align 4 \n"
860 "1: \n"
861 "movdqa (%0),%%xmm0 \n"
862 "movdqa 0x10(%0),%%xmm1 \n"
863 "movdqa 0x20(%0),%%xmm2 \n"
864 "movdqa 0x30(%0),%%xmm6 \n"
865 "pavgb (%0,%4,1),%%xmm0 \n"
866 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
867 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
868 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
869 "lea 0x40(%0),%0 \n"
870 "movdqa %%xmm0,%%xmm7 \n"
871 "shufps $0x88,%%xmm1,%%xmm0 \n"
872 "shufps $0xdd,%%xmm1,%%xmm7 \n"
873 "pavgb %%xmm7,%%xmm0 \n"
874 "movdqa %%xmm2,%%xmm7 \n"
875 "shufps $0x88,%%xmm6,%%xmm2 \n"
876 "shufps $0xdd,%%xmm6,%%xmm7 \n"
877 "pavgb %%xmm7,%%xmm2 \n"
878 "movdqa %%xmm0,%%xmm1 \n"
879 "movdqa %%xmm2,%%xmm6 \n"
880 "pmaddubsw %%xmm4,%%xmm0 \n"
881 "pmaddubsw %%xmm4,%%xmm2 \n"
882 "pmaddubsw %%xmm3,%%xmm1 \n"
883 "pmaddubsw %%xmm3,%%xmm6 \n"
884 "phaddw %%xmm2,%%xmm0 \n"
885 "phaddw %%xmm6,%%xmm1 \n"
886 "paddw %%xmm5,%%xmm0 \n"
887 "paddw %%xmm5,%%xmm1 \n"
888 "psraw $0x8,%%xmm0 \n"
889 "psraw $0x8,%%xmm1 \n"
890 "packsswb %%xmm1,%%xmm0 \n"
891 "sub $0x10,%3 \n"
892 "movlps %%xmm0,(%1) \n"
893 "movhps %%xmm0,(%1,%2,1) \n"
894 "lea 0x8(%1),%1 \n"
895 "jg 1b \n"
896 : "+r"(src_argb0), // %0
897 "+r"(dst_u), // %1
898 "+r"(dst_v), // %2
899 "+rm"(width) // %3
900 : "r"(static_cast<intptr_t>(src_stride_argb))
901 : "memory", "cc"
902#if defined(__SSE2__)
903 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
904#endif
905 );
906}
907
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000908void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
909 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000910 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000911 "movdqa %0,%%xmm4 \n"
912 "movdqa %1,%%xmm3 \n"
913 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000914 :
915 : "m"(kARGBToU), // %0
916 "m"(kARGBToV), // %1
917 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000918 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000919 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000920 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000921 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000922 "1: \n"
923 "movdqu (%0),%%xmm0 \n"
924 "movdqu 0x10(%0),%%xmm1 \n"
925 "movdqu 0x20(%0),%%xmm2 \n"
926 "movdqu 0x30(%0),%%xmm6 \n"
927 "movdqu (%0,%4,1),%%xmm7 \n"
928 "pavgb %%xmm7,%%xmm0 \n"
929 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
930 "pavgb %%xmm7,%%xmm1 \n"
931 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
932 "pavgb %%xmm7,%%xmm2 \n"
933 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
934 "pavgb %%xmm7,%%xmm6 \n"
935 "lea 0x40(%0),%0 \n"
936 "movdqa %%xmm0,%%xmm7 \n"
937 "shufps $0x88,%%xmm1,%%xmm0 \n"
938 "shufps $0xdd,%%xmm1,%%xmm7 \n"
939 "pavgb %%xmm7,%%xmm0 \n"
940 "movdqa %%xmm2,%%xmm7 \n"
941 "shufps $0x88,%%xmm6,%%xmm2 \n"
942 "shufps $0xdd,%%xmm6,%%xmm7 \n"
943 "pavgb %%xmm7,%%xmm2 \n"
944 "movdqa %%xmm0,%%xmm1 \n"
945 "movdqa %%xmm2,%%xmm6 \n"
946 "pmaddubsw %%xmm4,%%xmm0 \n"
947 "pmaddubsw %%xmm4,%%xmm2 \n"
948 "pmaddubsw %%xmm3,%%xmm1 \n"
949 "pmaddubsw %%xmm3,%%xmm6 \n"
950 "phaddw %%xmm2,%%xmm0 \n"
951 "phaddw %%xmm6,%%xmm1 \n"
952 "psraw $0x8,%%xmm0 \n"
953 "psraw $0x8,%%xmm1 \n"
954 "packsswb %%xmm1,%%xmm0 \n"
955 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000956 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000957 "movlps %%xmm0,(%1) \n"
958 "movhps %%xmm0,(%1,%2,1) \n"
959 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000960 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000961 : "+r"(src_argb0), // %0
962 "+r"(dst_u), // %1
963 "+r"(dst_v), // %2
964 "+rm"(width) // %3
965 : "r"(static_cast<intptr_t>(src_stride_argb))
966 : "memory", "cc"
967#if defined(__SSE2__)
968 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
969#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000970 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000971}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000972
fbarchard@google.com050b39a2013-04-01 20:07:14 +0000973void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
974 uint8* dst_u, uint8* dst_v, int width) {
975 asm volatile (
976 "movdqa %0,%%xmm4 \n"
977 "movdqa %1,%%xmm3 \n"
978 "movdqa %2,%%xmm5 \n"
979 :
980 : "m"(kARGBToUJ), // %0
981 "m"(kARGBToVJ), // %1
982 "m"(kAddUVJ128) // %2
983 );
984 asm volatile (
985 "sub %1,%2 \n"
986 ".p2align 4 \n"
987 "1: \n"
988 "movdqu (%0),%%xmm0 \n"
989 "movdqu 0x10(%0),%%xmm1 \n"
990 "movdqu 0x20(%0),%%xmm2 \n"
991 "movdqu 0x30(%0),%%xmm6 \n"
992 "movdqu (%0,%4,1),%%xmm7 \n"
993 "pavgb %%xmm7,%%xmm0 \n"
994 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
995 "pavgb %%xmm7,%%xmm1 \n"
996 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
997 "pavgb %%xmm7,%%xmm2 \n"
998 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
999 "pavgb %%xmm7,%%xmm6 \n"
1000 "lea 0x40(%0),%0 \n"
1001 "movdqa %%xmm0,%%xmm7 \n"
1002 "shufps $0x88,%%xmm1,%%xmm0 \n"
1003 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1004 "pavgb %%xmm7,%%xmm0 \n"
1005 "movdqa %%xmm2,%%xmm7 \n"
1006 "shufps $0x88,%%xmm6,%%xmm2 \n"
1007 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1008 "pavgb %%xmm7,%%xmm2 \n"
1009 "movdqa %%xmm0,%%xmm1 \n"
1010 "movdqa %%xmm2,%%xmm6 \n"
1011 "pmaddubsw %%xmm4,%%xmm0 \n"
1012 "pmaddubsw %%xmm4,%%xmm2 \n"
1013 "pmaddubsw %%xmm3,%%xmm1 \n"
1014 "pmaddubsw %%xmm3,%%xmm6 \n"
1015 "phaddw %%xmm2,%%xmm0 \n"
1016 "phaddw %%xmm6,%%xmm1 \n"
1017 "paddw %%xmm5,%%xmm0 \n"
1018 "paddw %%xmm5,%%xmm1 \n"
1019 "psraw $0x8,%%xmm0 \n"
1020 "psraw $0x8,%%xmm1 \n"
1021 "packsswb %%xmm1,%%xmm0 \n"
1022 "sub $0x10,%3 \n"
1023 "movlps %%xmm0,(%1) \n"
1024 "movhps %%xmm0,(%1,%2,1) \n"
1025 "lea 0x8(%1),%1 \n"
1026 "jg 1b \n"
1027 : "+r"(src_argb0), // %0
1028 "+r"(dst_u), // %1
1029 "+r"(dst_v), // %2
1030 "+rm"(width) // %3
1031 : "r"(static_cast<intptr_t>(src_stride_argb))
1032 : "memory", "cc"
1033#if defined(__SSE2__)
1034 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1035#endif
1036 );
1037}
1038
fbarchard@google.com762c0502013-02-04 18:47:21 +00001039void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1040 int width) {
1041 asm volatile (
1042 "movdqa %0,%%xmm4 \n"
1043 "movdqa %1,%%xmm3 \n"
1044 "movdqa %2,%%xmm5 \n"
1045 :
1046 : "m"(kARGBToU), // %0
1047 "m"(kARGBToV), // %1
1048 "m"(kAddUV128) // %2
1049 );
1050 asm volatile (
1051 "sub %1,%2 \n"
1052 ".p2align 4 \n"
1053 "1: \n"
1054 "movdqa (%0),%%xmm0 \n"
1055 "movdqa 0x10(%0),%%xmm1 \n"
1056 "movdqa 0x20(%0),%%xmm2 \n"
1057 "movdqa 0x30(%0),%%xmm6 \n"
1058 "pmaddubsw %%xmm4,%%xmm0 \n"
1059 "pmaddubsw %%xmm4,%%xmm1 \n"
1060 "pmaddubsw %%xmm4,%%xmm2 \n"
1061 "pmaddubsw %%xmm4,%%xmm6 \n"
1062 "phaddw %%xmm1,%%xmm0 \n"
1063 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +00001064 "psraw $0x8,%%xmm0 \n"
1065 "psraw $0x8,%%xmm2 \n"
1066 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +00001067 "paddb %%xmm5,%%xmm0 \n"
1068 "sub $0x10,%3 \n"
1069 "movdqa %%xmm0,(%1) \n"
1070 "movdqa (%0),%%xmm0 \n"
1071 "movdqa 0x10(%0),%%xmm1 \n"
1072 "movdqa 0x20(%0),%%xmm2 \n"
1073 "movdqa 0x30(%0),%%xmm6 \n"
1074 "pmaddubsw %%xmm3,%%xmm0 \n"
1075 "pmaddubsw %%xmm3,%%xmm1 \n"
1076 "pmaddubsw %%xmm3,%%xmm2 \n"
1077 "pmaddubsw %%xmm3,%%xmm6 \n"
1078 "phaddw %%xmm1,%%xmm0 \n"
1079 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +00001080 "psraw $0x8,%%xmm0 \n"
1081 "psraw $0x8,%%xmm2 \n"
1082 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +00001083 "paddb %%xmm5,%%xmm0 \n"
1084 "lea 0x40(%0),%0 \n"
1085 "movdqa %%xmm0,(%1,%2,1) \n"
1086 "lea 0x10(%1),%1 \n"
1087 "jg 1b \n"
1088 : "+r"(src_argb), // %0
1089 "+r"(dst_u), // %1
1090 "+r"(dst_v), // %2
1091 "+rm"(width) // %3
1092 :
1093 : "memory", "cc"
1094#if defined(__SSE2__)
1095 , "xmm0", "xmm1", "xmm2", "xmm6"
1096#endif
1097 );
1098}
1099
1100void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
1101 uint8* dst_v, int width) {
1102 asm volatile (
1103 "movdqa %0,%%xmm4 \n"
1104 "movdqa %1,%%xmm3 \n"
1105 "movdqa %2,%%xmm5 \n"
1106 :
1107 : "m"(kARGBToU), // %0
1108 "m"(kARGBToV), // %1
1109 "m"(kAddUV128) // %2
1110 );
1111 asm volatile (
1112 "sub %1,%2 \n"
1113 ".p2align 4 \n"
1114 "1: \n"
1115 "movdqu (%0),%%xmm0 \n"
1116 "movdqu 0x10(%0),%%xmm1 \n"
1117 "movdqu 0x20(%0),%%xmm2 \n"
1118 "movdqu 0x30(%0),%%xmm6 \n"
1119 "pmaddubsw %%xmm4,%%xmm0 \n"
1120 "pmaddubsw %%xmm4,%%xmm1 \n"
1121 "pmaddubsw %%xmm4,%%xmm2 \n"
1122 "pmaddubsw %%xmm4,%%xmm6 \n"
1123 "phaddw %%xmm1,%%xmm0 \n"
1124 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +00001125 "psraw $0x8,%%xmm0 \n"
1126 "psraw $0x8,%%xmm2 \n"
1127 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +00001128 "paddb %%xmm5,%%xmm0 \n"
1129 "sub $0x10,%3 \n"
1130 "movdqu %%xmm0,(%1) \n"
1131 "movdqu (%0),%%xmm0 \n"
1132 "movdqu 0x10(%0),%%xmm1 \n"
1133 "movdqu 0x20(%0),%%xmm2 \n"
1134 "movdqu 0x30(%0),%%xmm6 \n"
1135 "pmaddubsw %%xmm3,%%xmm0 \n"
1136 "pmaddubsw %%xmm3,%%xmm1 \n"
1137 "pmaddubsw %%xmm3,%%xmm2 \n"
1138 "pmaddubsw %%xmm3,%%xmm6 \n"
1139 "phaddw %%xmm1,%%xmm0 \n"
1140 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +00001141 "psraw $0x8,%%xmm0 \n"
1142 "psraw $0x8,%%xmm2 \n"
1143 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +00001144 "paddb %%xmm5,%%xmm0 \n"
1145 "lea 0x40(%0),%0 \n"
1146 "movdqu %%xmm0,(%1,%2,1) \n"
1147 "lea 0x10(%1),%1 \n"
1148 "jg 1b \n"
1149 : "+r"(src_argb), // %0
1150 "+r"(dst_u), // %1
1151 "+r"(dst_v), // %2
1152 "+rm"(width) // %3
1153 :
1154 : "memory", "cc"
1155#if defined(__SSE2__)
1156 , "xmm0", "xmm1", "xmm2", "xmm6"
1157#endif
1158 );
1159}
1160
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001161void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1162 uint8* dst_u, uint8* dst_v, int width) {
1163 asm volatile (
1164 "movdqa %0,%%xmm4 \n"
1165 "movdqa %1,%%xmm3 \n"
1166 "movdqa %2,%%xmm5 \n"
1167 :
1168 : "m"(kARGBToU), // %0
1169 "m"(kARGBToV), // %1
1170 "m"(kAddUV128) // %2
1171 );
1172 asm volatile (
1173 "sub %1,%2 \n"
1174 ".p2align 4 \n"
1175 "1: \n"
1176 "movdqa (%0),%%xmm0 \n"
1177 "movdqa 0x10(%0),%%xmm1 \n"
1178 "movdqa 0x20(%0),%%xmm2 \n"
1179 "movdqa 0x30(%0),%%xmm6 \n"
1180 "lea 0x40(%0),%0 \n"
1181 "movdqa %%xmm0,%%xmm7 \n"
1182 "shufps $0x88,%%xmm1,%%xmm0 \n"
1183 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1184 "pavgb %%xmm7,%%xmm0 \n"
1185 "movdqa %%xmm2,%%xmm7 \n"
1186 "shufps $0x88,%%xmm6,%%xmm2 \n"
1187 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1188 "pavgb %%xmm7,%%xmm2 \n"
1189 "movdqa %%xmm0,%%xmm1 \n"
1190 "movdqa %%xmm2,%%xmm6 \n"
1191 "pmaddubsw %%xmm4,%%xmm0 \n"
1192 "pmaddubsw %%xmm4,%%xmm2 \n"
1193 "pmaddubsw %%xmm3,%%xmm1 \n"
1194 "pmaddubsw %%xmm3,%%xmm6 \n"
1195 "phaddw %%xmm2,%%xmm0 \n"
1196 "phaddw %%xmm6,%%xmm1 \n"
1197 "psraw $0x8,%%xmm0 \n"
1198 "psraw $0x8,%%xmm1 \n"
1199 "packsswb %%xmm1,%%xmm0 \n"
1200 "paddb %%xmm5,%%xmm0 \n"
1201 "sub $0x10,%3 \n"
1202 "movlps %%xmm0,(%1) \n"
1203 "movhps %%xmm0,(%1,%2,1) \n"
1204 "lea 0x8(%1),%1 \n"
1205 "jg 1b \n"
1206 : "+r"(src_argb0), // %0
1207 "+r"(dst_u), // %1
1208 "+r"(dst_v), // %2
1209 "+rm"(width) // %3
1210 :
1211 : "memory", "cc"
1212#if defined(__SSE2__)
1213 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1214#endif
1215 );
1216}
1217
1218void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
1219 uint8* dst_u, uint8* dst_v, int width) {
1220 asm volatile (
1221 "movdqa %0,%%xmm4 \n"
1222 "movdqa %1,%%xmm3 \n"
1223 "movdqa %2,%%xmm5 \n"
1224 :
1225 : "m"(kARGBToU), // %0
1226 "m"(kARGBToV), // %1
1227 "m"(kAddUV128) // %2
1228 );
1229 asm volatile (
1230 "sub %1,%2 \n"
1231 ".p2align 4 \n"
1232 "1: \n"
1233 "movdqu (%0),%%xmm0 \n"
1234 "movdqu 0x10(%0),%%xmm1 \n"
1235 "movdqu 0x20(%0),%%xmm2 \n"
1236 "movdqu 0x30(%0),%%xmm6 \n"
1237 "lea 0x40(%0),%0 \n"
1238 "movdqa %%xmm0,%%xmm7 \n"
1239 "shufps $0x88,%%xmm1,%%xmm0 \n"
1240 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1241 "pavgb %%xmm7,%%xmm0 \n"
1242 "movdqa %%xmm2,%%xmm7 \n"
1243 "shufps $0x88,%%xmm6,%%xmm2 \n"
1244 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1245 "pavgb %%xmm7,%%xmm2 \n"
1246 "movdqa %%xmm0,%%xmm1 \n"
1247 "movdqa %%xmm2,%%xmm6 \n"
1248 "pmaddubsw %%xmm4,%%xmm0 \n"
1249 "pmaddubsw %%xmm4,%%xmm2 \n"
1250 "pmaddubsw %%xmm3,%%xmm1 \n"
1251 "pmaddubsw %%xmm3,%%xmm6 \n"
1252 "phaddw %%xmm2,%%xmm0 \n"
1253 "phaddw %%xmm6,%%xmm1 \n"
1254 "psraw $0x8,%%xmm0 \n"
1255 "psraw $0x8,%%xmm1 \n"
1256 "packsswb %%xmm1,%%xmm0 \n"
1257 "paddb %%xmm5,%%xmm0 \n"
1258 "sub $0x10,%3 \n"
1259 "movlps %%xmm0,(%1) \n"
1260 "movhps %%xmm0,(%1,%2,1) \n"
1261 "lea 0x8(%1),%1 \n"
1262 "jg 1b \n"
1263 : "+r"(src_argb0), // %0
1264 "+r"(dst_u), // %1
1265 "+r"(dst_v), // %2
1266 "+rm"(width) // %3
1267 :
1268 : "memory", "cc"
1269#if defined(__SSE2__)
1270 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1271#endif
1272 );
1273}
1274
fbarchard@google.com714050a2012-02-17 22:59:56 +00001275void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001276 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001277 "movdqa %4,%%xmm5 \n"
1278 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001279 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001280 "1: \n"
1281 "movdqa (%0),%%xmm0 \n"
1282 "movdqa 0x10(%0),%%xmm1 \n"
1283 "movdqa 0x20(%0),%%xmm2 \n"
1284 "movdqa 0x30(%0),%%xmm3 \n"
1285 "pmaddubsw %%xmm4,%%xmm0 \n"
1286 "pmaddubsw %%xmm4,%%xmm1 \n"
1287 "pmaddubsw %%xmm4,%%xmm2 \n"
1288 "pmaddubsw %%xmm4,%%xmm3 \n"
1289 "lea 0x40(%0),%0 \n"
1290 "phaddw %%xmm1,%%xmm0 \n"
1291 "phaddw %%xmm3,%%xmm2 \n"
1292 "psrlw $0x7,%%xmm0 \n"
1293 "psrlw $0x7,%%xmm2 \n"
1294 "packuswb %%xmm2,%%xmm0 \n"
1295 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001296 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001297 "movdqa %%xmm0,(%1) \n"
1298 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001299 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001300 : "+r"(src_bgra), // %0
1301 "+r"(dst_y), // %1
1302 "+r"(pix) // %2
1303 : "m"(kBGRAToY), // %3
1304 "m"(kAddY16) // %4
1305 : "memory", "cc"
1306#if defined(__SSE2__)
1307 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001308#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +00001309 );
1310}
1311
1312void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001313 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001314 "movdqa %4,%%xmm5 \n"
1315 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001316 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001317 "1: \n"
1318 "movdqu (%0),%%xmm0 \n"
1319 "movdqu 0x10(%0),%%xmm1 \n"
1320 "movdqu 0x20(%0),%%xmm2 \n"
1321 "movdqu 0x30(%0),%%xmm3 \n"
1322 "pmaddubsw %%xmm4,%%xmm0 \n"
1323 "pmaddubsw %%xmm4,%%xmm1 \n"
1324 "pmaddubsw %%xmm4,%%xmm2 \n"
1325 "pmaddubsw %%xmm4,%%xmm3 \n"
1326 "lea 0x40(%0),%0 \n"
1327 "phaddw %%xmm1,%%xmm0 \n"
1328 "phaddw %%xmm3,%%xmm2 \n"
1329 "psrlw $0x7,%%xmm0 \n"
1330 "psrlw $0x7,%%xmm2 \n"
1331 "packuswb %%xmm2,%%xmm0 \n"
1332 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001333 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001334 "movdqu %%xmm0,(%1) \n"
1335 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001336 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001337 : "+r"(src_bgra), // %0
1338 "+r"(dst_y), // %1
1339 "+r"(pix) // %2
1340 : "m"(kBGRAToY), // %3
1341 "m"(kAddY16) // %4
1342 : "memory", "cc"
1343#if defined(__SSE2__)
1344 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1345#endif
1346 );
1347}
1348
1349void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1350 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001351 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001352 "movdqa %0,%%xmm4 \n"
1353 "movdqa %1,%%xmm3 \n"
1354 "movdqa %2,%%xmm5 \n"
1355 :
1356 : "m"(kBGRAToU), // %0
1357 "m"(kBGRAToV), // %1
1358 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001359 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001360 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001361 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001362 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001363 "1: \n"
1364 "movdqa (%0),%%xmm0 \n"
1365 "movdqa 0x10(%0),%%xmm1 \n"
1366 "movdqa 0x20(%0),%%xmm2 \n"
1367 "movdqa 0x30(%0),%%xmm6 \n"
1368 "pavgb (%0,%4,1),%%xmm0 \n"
1369 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1370 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1371 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1372 "lea 0x40(%0),%0 \n"
1373 "movdqa %%xmm0,%%xmm7 \n"
1374 "shufps $0x88,%%xmm1,%%xmm0 \n"
1375 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1376 "pavgb %%xmm7,%%xmm0 \n"
1377 "movdqa %%xmm2,%%xmm7 \n"
1378 "shufps $0x88,%%xmm6,%%xmm2 \n"
1379 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1380 "pavgb %%xmm7,%%xmm2 \n"
1381 "movdqa %%xmm0,%%xmm1 \n"
1382 "movdqa %%xmm2,%%xmm6 \n"
1383 "pmaddubsw %%xmm4,%%xmm0 \n"
1384 "pmaddubsw %%xmm4,%%xmm2 \n"
1385 "pmaddubsw %%xmm3,%%xmm1 \n"
1386 "pmaddubsw %%xmm3,%%xmm6 \n"
1387 "phaddw %%xmm2,%%xmm0 \n"
1388 "phaddw %%xmm6,%%xmm1 \n"
1389 "psraw $0x8,%%xmm0 \n"
1390 "psraw $0x8,%%xmm1 \n"
1391 "packsswb %%xmm1,%%xmm0 \n"
1392 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001393 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001394 "movlps %%xmm0,(%1) \n"
1395 "movhps %%xmm0,(%1,%2,1) \n"
1396 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001397 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001398 : "+r"(src_bgra0), // %0
1399 "+r"(dst_u), // %1
1400 "+r"(dst_v), // %2
1401 "+rm"(width) // %3
1402 : "r"(static_cast<intptr_t>(src_stride_bgra))
1403 : "memory", "cc"
1404#if defined(__SSE2__)
1405 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1406#endif
1407 );
1408}
1409
1410void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1411 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001412 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001413 "movdqa %0,%%xmm4 \n"
1414 "movdqa %1,%%xmm3 \n"
1415 "movdqa %2,%%xmm5 \n"
1416 :
1417 : "m"(kBGRAToU), // %0
1418 "m"(kBGRAToV), // %1
1419 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001420 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001421 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001422 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001423 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001424 "1: \n"
1425 "movdqu (%0),%%xmm0 \n"
1426 "movdqu 0x10(%0),%%xmm1 \n"
1427 "movdqu 0x20(%0),%%xmm2 \n"
1428 "movdqu 0x30(%0),%%xmm6 \n"
1429 "movdqu (%0,%4,1),%%xmm7 \n"
1430 "pavgb %%xmm7,%%xmm0 \n"
1431 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1432 "pavgb %%xmm7,%%xmm1 \n"
1433 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1434 "pavgb %%xmm7,%%xmm2 \n"
1435 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1436 "pavgb %%xmm7,%%xmm6 \n"
1437 "lea 0x40(%0),%0 \n"
1438 "movdqa %%xmm0,%%xmm7 \n"
1439 "shufps $0x88,%%xmm1,%%xmm0 \n"
1440 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1441 "pavgb %%xmm7,%%xmm0 \n"
1442 "movdqa %%xmm2,%%xmm7 \n"
1443 "shufps $0x88,%%xmm6,%%xmm2 \n"
1444 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1445 "pavgb %%xmm7,%%xmm2 \n"
1446 "movdqa %%xmm0,%%xmm1 \n"
1447 "movdqa %%xmm2,%%xmm6 \n"
1448 "pmaddubsw %%xmm4,%%xmm0 \n"
1449 "pmaddubsw %%xmm4,%%xmm2 \n"
1450 "pmaddubsw %%xmm3,%%xmm1 \n"
1451 "pmaddubsw %%xmm3,%%xmm6 \n"
1452 "phaddw %%xmm2,%%xmm0 \n"
1453 "phaddw %%xmm6,%%xmm1 \n"
1454 "psraw $0x8,%%xmm0 \n"
1455 "psraw $0x8,%%xmm1 \n"
1456 "packsswb %%xmm1,%%xmm0 \n"
1457 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001458 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001459 "movlps %%xmm0,(%1) \n"
1460 "movhps %%xmm0,(%1,%2,1) \n"
1461 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001462 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001463 : "+r"(src_bgra0), // %0
1464 "+r"(dst_u), // %1
1465 "+r"(dst_v), // %2
1466 "+rm"(width) // %3
1467 : "r"(static_cast<intptr_t>(src_stride_bgra))
1468 : "memory", "cc"
1469#if defined(__SSE2__)
1470 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1471#endif
1472 );
1473}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001474
1475void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001476 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001477 "movdqa %4,%%xmm5 \n"
1478 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001479 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001480 "1: \n"
1481 "movdqa (%0),%%xmm0 \n"
1482 "movdqa 0x10(%0),%%xmm1 \n"
1483 "movdqa 0x20(%0),%%xmm2 \n"
1484 "movdqa 0x30(%0),%%xmm3 \n"
1485 "pmaddubsw %%xmm4,%%xmm0 \n"
1486 "pmaddubsw %%xmm4,%%xmm1 \n"
1487 "pmaddubsw %%xmm4,%%xmm2 \n"
1488 "pmaddubsw %%xmm4,%%xmm3 \n"
1489 "lea 0x40(%0),%0 \n"
1490 "phaddw %%xmm1,%%xmm0 \n"
1491 "phaddw %%xmm3,%%xmm2 \n"
1492 "psrlw $0x7,%%xmm0 \n"
1493 "psrlw $0x7,%%xmm2 \n"
1494 "packuswb %%xmm2,%%xmm0 \n"
1495 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001496 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001497 "movdqa %%xmm0,(%1) \n"
1498 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001499 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001500 : "+r"(src_abgr), // %0
1501 "+r"(dst_y), // %1
1502 "+r"(pix) // %2
1503 : "m"(kABGRToY), // %3
1504 "m"(kAddY16) // %4
1505 : "memory", "cc"
1506#if defined(__SSE2__)
1507 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1508#endif
1509 );
1510}
1511
1512void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001513 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001514 "movdqa %4,%%xmm5 \n"
1515 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001516 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001517 "1: \n"
1518 "movdqu (%0),%%xmm0 \n"
1519 "movdqu 0x10(%0),%%xmm1 \n"
1520 "movdqu 0x20(%0),%%xmm2 \n"
1521 "movdqu 0x30(%0),%%xmm3 \n"
1522 "pmaddubsw %%xmm4,%%xmm0 \n"
1523 "pmaddubsw %%xmm4,%%xmm1 \n"
1524 "pmaddubsw %%xmm4,%%xmm2 \n"
1525 "pmaddubsw %%xmm4,%%xmm3 \n"
1526 "lea 0x40(%0),%0 \n"
1527 "phaddw %%xmm1,%%xmm0 \n"
1528 "phaddw %%xmm3,%%xmm2 \n"
1529 "psrlw $0x7,%%xmm0 \n"
1530 "psrlw $0x7,%%xmm2 \n"
1531 "packuswb %%xmm2,%%xmm0 \n"
1532 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001533 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001534 "movdqu %%xmm0,(%1) \n"
1535 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001536 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001537 : "+r"(src_abgr), // %0
1538 "+r"(dst_y), // %1
1539 "+r"(pix) // %2
1540 : "m"(kABGRToY), // %3
1541 "m"(kAddY16) // %4
1542 : "memory", "cc"
1543#if defined(__SSE2__)
1544 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1545#endif
1546 );
1547}
1548
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001549void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1550 asm volatile (
1551 "movdqa %4,%%xmm5 \n"
1552 "movdqa %3,%%xmm4 \n"
1553 ".p2align 4 \n"
1554 "1: \n"
1555 "movdqa (%0),%%xmm0 \n"
1556 "movdqa 0x10(%0),%%xmm1 \n"
1557 "movdqa 0x20(%0),%%xmm2 \n"
1558 "movdqa 0x30(%0),%%xmm3 \n"
1559 "pmaddubsw %%xmm4,%%xmm0 \n"
1560 "pmaddubsw %%xmm4,%%xmm1 \n"
1561 "pmaddubsw %%xmm4,%%xmm2 \n"
1562 "pmaddubsw %%xmm4,%%xmm3 \n"
1563 "lea 0x40(%0),%0 \n"
1564 "phaddw %%xmm1,%%xmm0 \n"
1565 "phaddw %%xmm3,%%xmm2 \n"
1566 "psrlw $0x7,%%xmm0 \n"
1567 "psrlw $0x7,%%xmm2 \n"
1568 "packuswb %%xmm2,%%xmm0 \n"
1569 "paddb %%xmm5,%%xmm0 \n"
1570 "sub $0x10,%2 \n"
1571 "movdqa %%xmm0,(%1) \n"
1572 "lea 0x10(%1),%1 \n"
1573 "jg 1b \n"
1574 : "+r"(src_rgba), // %0
1575 "+r"(dst_y), // %1
1576 "+r"(pix) // %2
1577 : "m"(kRGBAToY), // %3
1578 "m"(kAddY16) // %4
1579 : "memory", "cc"
1580#if defined(__SSE2__)
1581 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1582#endif
1583 );
1584}
1585
1586void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1587 asm volatile (
1588 "movdqa %4,%%xmm5 \n"
1589 "movdqa %3,%%xmm4 \n"
1590 ".p2align 4 \n"
1591 "1: \n"
1592 "movdqu (%0),%%xmm0 \n"
1593 "movdqu 0x10(%0),%%xmm1 \n"
1594 "movdqu 0x20(%0),%%xmm2 \n"
1595 "movdqu 0x30(%0),%%xmm3 \n"
1596 "pmaddubsw %%xmm4,%%xmm0 \n"
1597 "pmaddubsw %%xmm4,%%xmm1 \n"
1598 "pmaddubsw %%xmm4,%%xmm2 \n"
1599 "pmaddubsw %%xmm4,%%xmm3 \n"
1600 "lea 0x40(%0),%0 \n"
1601 "phaddw %%xmm1,%%xmm0 \n"
1602 "phaddw %%xmm3,%%xmm2 \n"
1603 "psrlw $0x7,%%xmm0 \n"
1604 "psrlw $0x7,%%xmm2 \n"
1605 "packuswb %%xmm2,%%xmm0 \n"
1606 "paddb %%xmm5,%%xmm0 \n"
1607 "sub $0x10,%2 \n"
1608 "movdqu %%xmm0,(%1) \n"
1609 "lea 0x10(%1),%1 \n"
1610 "jg 1b \n"
1611 : "+r"(src_rgba), // %0
1612 "+r"(dst_y), // %1
1613 "+r"(pix) // %2
1614 : "m"(kRGBAToY), // %3
1615 "m"(kAddY16) // %4
1616 : "memory", "cc"
1617#if defined(__SSE2__)
1618 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1619#endif
1620 );
1621}
1622
fbarchard@google.com714050a2012-02-17 22:59:56 +00001623void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1624 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001625 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001626 "movdqa %0,%%xmm4 \n"
1627 "movdqa %1,%%xmm3 \n"
1628 "movdqa %2,%%xmm5 \n"
1629 :
1630 : "m"(kABGRToU), // %0
1631 "m"(kABGRToV), // %1
1632 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001633 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001634 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001635 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001636 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001637 "1: \n"
1638 "movdqa (%0),%%xmm0 \n"
1639 "movdqa 0x10(%0),%%xmm1 \n"
1640 "movdqa 0x20(%0),%%xmm2 \n"
1641 "movdqa 0x30(%0),%%xmm6 \n"
1642 "pavgb (%0,%4,1),%%xmm0 \n"
1643 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1644 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1645 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1646 "lea 0x40(%0),%0 \n"
1647 "movdqa %%xmm0,%%xmm7 \n"
1648 "shufps $0x88,%%xmm1,%%xmm0 \n"
1649 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1650 "pavgb %%xmm7,%%xmm0 \n"
1651 "movdqa %%xmm2,%%xmm7 \n"
1652 "shufps $0x88,%%xmm6,%%xmm2 \n"
1653 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1654 "pavgb %%xmm7,%%xmm2 \n"
1655 "movdqa %%xmm0,%%xmm1 \n"
1656 "movdqa %%xmm2,%%xmm6 \n"
1657 "pmaddubsw %%xmm4,%%xmm0 \n"
1658 "pmaddubsw %%xmm4,%%xmm2 \n"
1659 "pmaddubsw %%xmm3,%%xmm1 \n"
1660 "pmaddubsw %%xmm3,%%xmm6 \n"
1661 "phaddw %%xmm2,%%xmm0 \n"
1662 "phaddw %%xmm6,%%xmm1 \n"
1663 "psraw $0x8,%%xmm0 \n"
1664 "psraw $0x8,%%xmm1 \n"
1665 "packsswb %%xmm1,%%xmm0 \n"
1666 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001667 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001668 "movlps %%xmm0,(%1) \n"
1669 "movhps %%xmm0,(%1,%2,1) \n"
1670 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001671 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001672 : "+r"(src_abgr0), // %0
1673 "+r"(dst_u), // %1
1674 "+r"(dst_v), // %2
1675 "+rm"(width) // %3
1676 : "r"(static_cast<intptr_t>(src_stride_abgr))
1677 : "memory", "cc"
1678#if defined(__SSE2__)
1679 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1680#endif
1681 );
1682}
1683
1684void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1685 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001686 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001687 "movdqa %0,%%xmm4 \n"
1688 "movdqa %1,%%xmm3 \n"
1689 "movdqa %2,%%xmm5 \n"
1690 :
1691 : "m"(kABGRToU), // %0
1692 "m"(kABGRToV), // %1
1693 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001694 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001695 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001696 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001697 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001698 "1: \n"
1699 "movdqu (%0),%%xmm0 \n"
1700 "movdqu 0x10(%0),%%xmm1 \n"
1701 "movdqu 0x20(%0),%%xmm2 \n"
1702 "movdqu 0x30(%0),%%xmm6 \n"
1703 "movdqu (%0,%4,1),%%xmm7 \n"
1704 "pavgb %%xmm7,%%xmm0 \n"
1705 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1706 "pavgb %%xmm7,%%xmm1 \n"
1707 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1708 "pavgb %%xmm7,%%xmm2 \n"
1709 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1710 "pavgb %%xmm7,%%xmm6 \n"
1711 "lea 0x40(%0),%0 \n"
1712 "movdqa %%xmm0,%%xmm7 \n"
1713 "shufps $0x88,%%xmm1,%%xmm0 \n"
1714 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1715 "pavgb %%xmm7,%%xmm0 \n"
1716 "movdqa %%xmm2,%%xmm7 \n"
1717 "shufps $0x88,%%xmm6,%%xmm2 \n"
1718 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1719 "pavgb %%xmm7,%%xmm2 \n"
1720 "movdqa %%xmm0,%%xmm1 \n"
1721 "movdqa %%xmm2,%%xmm6 \n"
1722 "pmaddubsw %%xmm4,%%xmm0 \n"
1723 "pmaddubsw %%xmm4,%%xmm2 \n"
1724 "pmaddubsw %%xmm3,%%xmm1 \n"
1725 "pmaddubsw %%xmm3,%%xmm6 \n"
1726 "phaddw %%xmm2,%%xmm0 \n"
1727 "phaddw %%xmm6,%%xmm1 \n"
1728 "psraw $0x8,%%xmm0 \n"
1729 "psraw $0x8,%%xmm1 \n"
1730 "packsswb %%xmm1,%%xmm0 \n"
1731 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001732 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001733 "movlps %%xmm0,(%1) \n"
1734 "movhps %%xmm0,(%1,%2,1) \n"
1735 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001736 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001737 : "+r"(src_abgr0), // %0
1738 "+r"(dst_u), // %1
1739 "+r"(dst_v), // %2
1740 "+rm"(width) // %3
1741 : "r"(static_cast<intptr_t>(src_stride_abgr))
1742 : "memory", "cc"
1743#if defined(__SSE2__)
1744 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1745#endif
1746 );
1747}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001748
1749void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1750 uint8* dst_u, uint8* dst_v, int width) {
1751 asm volatile (
1752 "movdqa %0,%%xmm4 \n"
1753 "movdqa %1,%%xmm3 \n"
1754 "movdqa %2,%%xmm5 \n"
1755 :
1756 : "m"(kRGBAToU), // %0
1757 "m"(kRGBAToV), // %1
1758 "m"(kAddUV128) // %2
1759 );
1760 asm volatile (
1761 "sub %1,%2 \n"
1762 ".p2align 4 \n"
1763 "1: \n"
1764 "movdqa (%0),%%xmm0 \n"
1765 "movdqa 0x10(%0),%%xmm1 \n"
1766 "movdqa 0x20(%0),%%xmm2 \n"
1767 "movdqa 0x30(%0),%%xmm6 \n"
1768 "pavgb (%0,%4,1),%%xmm0 \n"
1769 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1770 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1771 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1772 "lea 0x40(%0),%0 \n"
1773 "movdqa %%xmm0,%%xmm7 \n"
1774 "shufps $0x88,%%xmm1,%%xmm0 \n"
1775 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1776 "pavgb %%xmm7,%%xmm0 \n"
1777 "movdqa %%xmm2,%%xmm7 \n"
1778 "shufps $0x88,%%xmm6,%%xmm2 \n"
1779 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1780 "pavgb %%xmm7,%%xmm2 \n"
1781 "movdqa %%xmm0,%%xmm1 \n"
1782 "movdqa %%xmm2,%%xmm6 \n"
1783 "pmaddubsw %%xmm4,%%xmm0 \n"
1784 "pmaddubsw %%xmm4,%%xmm2 \n"
1785 "pmaddubsw %%xmm3,%%xmm1 \n"
1786 "pmaddubsw %%xmm3,%%xmm6 \n"
1787 "phaddw %%xmm2,%%xmm0 \n"
1788 "phaddw %%xmm6,%%xmm1 \n"
1789 "psraw $0x8,%%xmm0 \n"
1790 "psraw $0x8,%%xmm1 \n"
1791 "packsswb %%xmm1,%%xmm0 \n"
1792 "paddb %%xmm5,%%xmm0 \n"
1793 "sub $0x10,%3 \n"
1794 "movlps %%xmm0,(%1) \n"
1795 "movhps %%xmm0,(%1,%2,1) \n"
1796 "lea 0x8(%1),%1 \n"
1797 "jg 1b \n"
1798 : "+r"(src_rgba0), // %0
1799 "+r"(dst_u), // %1
1800 "+r"(dst_v), // %2
1801 "+rm"(width) // %3
1802 : "r"(static_cast<intptr_t>(src_stride_rgba))
1803 : "memory", "cc"
1804#if defined(__SSE2__)
1805 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1806#endif
1807 );
1808}
1809
1810void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1811 uint8* dst_u, uint8* dst_v, int width) {
1812 asm volatile (
1813 "movdqa %0,%%xmm4 \n"
1814 "movdqa %1,%%xmm3 \n"
1815 "movdqa %2,%%xmm5 \n"
1816 :
1817 : "m"(kRGBAToU), // %0
1818 "m"(kRGBAToV), // %1
1819 "m"(kAddUV128) // %2
1820 );
1821 asm volatile (
1822 "sub %1,%2 \n"
1823 ".p2align 4 \n"
1824 "1: \n"
1825 "movdqu (%0),%%xmm0 \n"
1826 "movdqu 0x10(%0),%%xmm1 \n"
1827 "movdqu 0x20(%0),%%xmm2 \n"
1828 "movdqu 0x30(%0),%%xmm6 \n"
1829 "movdqu (%0,%4,1),%%xmm7 \n"
1830 "pavgb %%xmm7,%%xmm0 \n"
1831 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1832 "pavgb %%xmm7,%%xmm1 \n"
1833 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1834 "pavgb %%xmm7,%%xmm2 \n"
1835 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1836 "pavgb %%xmm7,%%xmm6 \n"
1837 "lea 0x40(%0),%0 \n"
1838 "movdqa %%xmm0,%%xmm7 \n"
1839 "shufps $0x88,%%xmm1,%%xmm0 \n"
1840 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1841 "pavgb %%xmm7,%%xmm0 \n"
1842 "movdqa %%xmm2,%%xmm7 \n"
1843 "shufps $0x88,%%xmm6,%%xmm2 \n"
1844 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1845 "pavgb %%xmm7,%%xmm2 \n"
1846 "movdqa %%xmm0,%%xmm1 \n"
1847 "movdqa %%xmm2,%%xmm6 \n"
1848 "pmaddubsw %%xmm4,%%xmm0 \n"
1849 "pmaddubsw %%xmm4,%%xmm2 \n"
1850 "pmaddubsw %%xmm3,%%xmm1 \n"
1851 "pmaddubsw %%xmm3,%%xmm6 \n"
1852 "phaddw %%xmm2,%%xmm0 \n"
1853 "phaddw %%xmm6,%%xmm1 \n"
1854 "psraw $0x8,%%xmm0 \n"
1855 "psraw $0x8,%%xmm1 \n"
1856 "packsswb %%xmm1,%%xmm0 \n"
1857 "paddb %%xmm5,%%xmm0 \n"
1858 "sub $0x10,%3 \n"
1859 "movlps %%xmm0,(%1) \n"
1860 "movhps %%xmm0,(%1,%2,1) \n"
1861 "lea 0x8(%1),%1 \n"
1862 "jg 1b \n"
1863 : "+r"(src_rgba0), // %0
1864 "+r"(dst_u), // %1
1865 "+r"(dst_v), // %2
1866 "+rm"(width) // %3
1867 : "r"(static_cast<intptr_t>(src_stride_rgba))
1868 : "memory", "cc"
1869#if defined(__SSE2__)
1870 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1871#endif
1872 );
1873}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001874#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001875
fbarchard@google.come214fe32012-06-04 23:47:11 +00001876#ifdef HAS_I422TOARGBROW_SSSE3
// Fixed-point YUV to RGB coefficients, scaled by 64; YUVTORGB / YVUTORGB
// shift the result right by 6 to undo the scale.
// Negative values and compound expressions are parenthesized so the macros
// expand safely inside larger expressions.
#define UB 127 /* min(127, static_cast<int>(2.018 * 64 + 0.5)), int8 maximum */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the coefficient sum times 128. Subtracting it after the multiply-add
// is equivalent to centering U and V at 128 beforehand.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001891
// Constant table shared by the YUV to RGB row functions below.
// The asm macros YUVTORGB and YVUTORGB address the members by the byte
// offsets noted beside each field (for example 48(%[kYuvConstants]) for
// kUVBiasB), so the member order must not change without updating those
// offsets. The offsets assume vec8 and vec16 are 16 bytes each.
// kUVTo* hold coefficient pairs in U,V order; kVUTo* hold the same pairs
// swapped (V,U) and are the ones YVUTORGB multiplies by.
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001892struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001893 vec8 kUVToB; // 0
1894 vec8 kUVToG; // 16
1895 vec8 kUVToR; // 32
1896 vec16 kUVBiasB; // 48
1897 vec16 kUVBiasG; // 64
1898 vec16 kUVBiasR; // 80
1899 vec16 kYSub16; // 96
1900 vec16 kYToRgb; // 112
1901 vec8 kVUToB; // 128
1902 vec8 kVUToG; // 144
1903 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001904} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001905 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1906 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1907 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1908 { BB, BB, BB, BB, BB, BB, BB, BB },
1909 { BG, BG, BG, BG, BG, BG, BG, BG },
1910 { BR, BR, BR, BR, BR, BR, BR, BR },
1911 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001912 { YG, YG, YG, YG, YG, YG, YG, YG },
1913 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1914 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1915 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001916};
1917
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001918
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001919// Read 8 UV from 444
// Loads 8 U bytes and 8 V bytes and interleaves them into xmm0 as U,V pairs;
// xmm1 is scratch and %[u_buf] advances by 8.
// Expects %[v_buf] to already hold (v_buf - u_buf), so that
// (%[u_buf],%[v_buf],1) addresses the V plane.
1920#define READYUV444 \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001921 "movq (%[u_buf]),%%xmm0 \n" \
1922 "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
1923 "lea 0x8(%[u_buf]),%[u_buf] \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001924 "punpcklbw %%xmm1,%%xmm0 \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001925
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001926// Read 4 UV from 422, upsample to 8 UV
// Loads 4 U bytes and 4 V bytes, interleaves them into U,V pairs, then
// duplicates each pair (punpcklwd) so xmm0 holds 8 pairs. xmm1 is scratch and
// %[u_buf] advances by 4.
// Expects %[v_buf] to already hold (v_buf - u_buf).
1927#define READYUV422 \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001928 "movd (%[u_buf]),%%xmm0 \n" \
1929 "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
1930 "lea 0x4(%[u_buf]),%[u_buf] \n" \
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001931 "punpcklbw %%xmm1,%%xmm0 \n" \
1932 "punpcklwd %%xmm0,%%xmm0 \n" \
fbarchard@google.comb6149762011-11-07 21:58:52 +00001933
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001934// Read 2 UV from 411, upsample to 8 UV
// Interleaves U and V into pairs, then duplicates twice (punpcklwd, then
// punpckldq) so each of the first two pairs appears four times in xmm0.
// xmm1 is scratch and %[u_buf] advances by 2.
// Expects %[v_buf] to already hold (v_buf - u_buf).
// NOTE(review): movd loads 4 bytes from each plane although only 2 are
// consumed per iteration, so the last iteration reads 2 bytes beyond the
// samples it uses - confirm the callers' buffers tolerate this.
1935#define READYUV411 \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001936 "movd (%[u_buf]),%%xmm0 \n" \
1937 "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
1938 "lea 0x2(%[u_buf]),%[u_buf] \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001939 "punpcklbw %%xmm1,%%xmm0 \n" \
1940 "punpcklwd %%xmm0,%%xmm0 \n" \
1941 "punpckldq %%xmm0,%%xmm0 \n" \
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001942
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001943// Read 4 UV from NV12, upsample to 8 UV
// Loads 8 bytes (4 already-interleaved chroma pairs) from %[uv_buf] and
// duplicates each pair so xmm0 holds 8 pairs; %[uv_buf] advances by 8.
// The NV21 row functions also use this macro and pair it with YVUTORGB.
1944#define READNV12 \
1945 "movq (%[uv_buf]),%%xmm0 \n" \
1946 "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
fbarchard@google.com6fd84a82012-09-19 07:35:45 +00001947 "punpcklwd %%xmm0,%%xmm0 \n" \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001948
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001949// Convert 8 pixels: 8 UV and 8 Y
// Register contract:
//   in:  xmm0 = 8 interleaved U,V byte pairs; xmm4 = zero (widens Y to
//        16 bits); %[y_buf] points at 8 Y bytes; %[kYuvConstants] is the base
//        address of kYuvConstants.
//   out: xmm0 / xmm1 / xmm2 = B / G / R as unsigned bytes (packuswb repeats
//        the 8 results in both halves); %[y_buf] advanced by 8.
//   scratch: xmm3.
// Per channel: (U,V) dot kUVTo{B,G,R} minus kUVBias{B,G,R}, plus
// (Y - kYSub16) * kYToRgb, then shifted right by 6 and saturated to a byte.
1950#define YUVTORGB \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001951 "movdqa %%xmm0,%%xmm1 \n" \
1952 "movdqa %%xmm0,%%xmm2 \n" \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001953 "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
1954 "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
1955 "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
1956 "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
1957 "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
1958 "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
1959 "movq (%[y_buf]),%%xmm3 \n" \
1960 "lea 0x8(%[y_buf]),%[y_buf] \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001961 "punpcklbw %%xmm4,%%xmm3 \n" \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001962 "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
1963 "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001964 "paddsw %%xmm3,%%xmm0 \n" \
1965 "paddsw %%xmm3,%%xmm1 \n" \
1966 "paddsw %%xmm3,%%xmm2 \n" \
1967 "psraw $0x6,%%xmm0 \n" \
1968 "psraw $0x6,%%xmm1 \n" \
1969 "psraw $0x6,%%xmm2 \n" \
1970 "packuswb %%xmm0,%%xmm0 \n" \
1971 "packuswb %%xmm1,%%xmm1 \n" \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001972 "packuswb %%xmm2,%%xmm2 \n" \
1973
1974// Convert 8 pixels: 8 VU and 8 Y
// Same computation as YUVTORGB, but xmm0 holds V,U byte pairs, so the
// multiplies use the swapped coefficient tables kVUToB / kVUToG / kVUToR at
// byte offsets 128 / 144 / 160 of kYuvConstants.
//   in:  xmm0 = 8 interleaved V,U byte pairs; xmm4 = zero; %[y_buf] points
//        at 8 Y bytes.
//   out: xmm0 / xmm1 / xmm2 = B / G / R as unsigned bytes; %[y_buf] advanced
//        by 8.
//   scratch: xmm3.
1975#define YVUTORGB \
1976 "movdqa %%xmm0,%%xmm1 \n" \
1977 "movdqa %%xmm0,%%xmm2 \n" \
1978 "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
1979 "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
1980 "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
1981 "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
1982 "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
1983 "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
1984 "movq (%[y_buf]),%%xmm3 \n" \
1985 "lea 0x8(%[y_buf]),%[y_buf] \n" \
1986 "punpcklbw %%xmm4,%%xmm3 \n" \
1987 "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
1988 "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
1989 "paddsw %%xmm3,%%xmm0 \n" \
1990 "paddsw %%xmm3,%%xmm1 \n" \
1991 "paddsw %%xmm3,%%xmm2 \n" \
1992 "psraw $0x6,%%xmm0 \n" \
1993 "psraw $0x6,%%xmm1 \n" \
1994 "psraw $0x6,%%xmm2 \n" \
1995 "packuswb %%xmm0,%%xmm0 \n" \
1996 "packuswb %%xmm1,%%xmm1 \n" \
1997 "packuswb %%xmm2,%%xmm2 \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001998
// Converts one row of I444 (one U and one V sample per Y sample) to 32-bit
// pixels, 8 pixels per loop iteration. Bytes are stored in B,G,R,A order with
// A = 0xff.
// Stores use movdqa, so dst_argb must be 16-byte aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
1999void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002000 const uint8* u_buf,
2001 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002002 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002003 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002004 asm volatile (
// v_buf becomes the offset (v_buf - u_buf) that READYUV444 indexes with.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002005 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002006 "pcmpeqb %%xmm5,%%xmm5 \n"
2007 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002008 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002009 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002010 READYUV444
2011 YUVTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002012 "punpcklbw %%xmm1,%%xmm0 \n"
2013 "punpcklbw %%xmm5,%%xmm2 \n"
2014 "movdqa %%xmm0,%%xmm1 \n"
2015 "punpcklwd %%xmm2,%%xmm0 \n"
2016 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002017 "movdqa %%xmm0,(%[dst_argb]) \n"
2018 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2019 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002020 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002021 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002022 : [y_buf]"+r"(y_buf), // %[y_buf]
2023 [u_buf]"+r"(u_buf), // %[u_buf]
2024 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002025 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002026 [width]"+rm"(width) // %[width]
2027 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002028 : "memory", "cc"
2029#if defined(__SSE2__)
2030 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2031#endif
2032 );
2033}
2034
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002035void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
2036 const uint8* u_buf,
2037 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002038 uint8* dst_rgb24,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002039 int width) {
2040// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
2041#ifdef __APPLE__
2042 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002043 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2044 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
2045 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2046 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002047#endif
2048
2049 asm volatile (
2050#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002051 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2052 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002053#endif
2054 "sub %[u_buf],%[v_buf] \n"
2055 "pxor %%xmm4,%%xmm4 \n"
2056 ".p2align 4 \n"
2057 "1: \n"
2058 READYUV422
2059 YUVTORGB
2060 "punpcklbw %%xmm1,%%xmm0 \n"
2061 "punpcklbw %%xmm2,%%xmm2 \n"
2062 "movdqa %%xmm0,%%xmm1 \n"
2063 "punpcklwd %%xmm2,%%xmm0 \n"
2064 "punpckhwd %%xmm2,%%xmm1 \n"
2065 "pshufb %%xmm5,%%xmm0 \n"
2066 "pshufb %%xmm6,%%xmm1 \n"
2067 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002068 "movq %%xmm0,(%[dst_rgb24]) \n"
2069 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
2070 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002071 "sub $0x8,%[width] \n"
2072 "jg 1b \n"
2073 : [y_buf]"+r"(y_buf), // %[y_buf]
2074 [u_buf]"+r"(u_buf), // %[u_buf]
2075 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002076 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002077 [width]"+rm"(width) // %[width]
2078 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
2079#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002080 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2081 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002082#endif
2083 : "memory", "cc"
2084#if defined(__SSE2__)
2085 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2086#endif
2087 );
2088}
2089
2090void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
2091 const uint8* u_buf,
2092 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002093 uint8* dst_raw,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002094 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002095// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002096#ifdef __APPLE__
2097 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002098 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
2099 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
2100 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
2101 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002102#endif
2103
2104 asm volatile (
2105#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002106 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
2107 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002108#endif
2109 "sub %[u_buf],%[v_buf] \n"
2110 "pxor %%xmm4,%%xmm4 \n"
2111 ".p2align 4 \n"
2112 "1: \n"
2113 READYUV422
2114 YUVTORGB
2115 "punpcklbw %%xmm1,%%xmm0 \n"
2116 "punpcklbw %%xmm2,%%xmm2 \n"
2117 "movdqa %%xmm0,%%xmm1 \n"
2118 "punpcklwd %%xmm2,%%xmm0 \n"
2119 "punpckhwd %%xmm2,%%xmm1 \n"
2120 "pshufb %%xmm5,%%xmm0 \n"
2121 "pshufb %%xmm6,%%xmm1 \n"
2122 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002123 "movq %%xmm0,(%[dst_raw]) \n"
2124 "movdqu %%xmm1,0x8(%[dst_raw]) \n"
2125 "lea 0x18(%[dst_raw]),%[dst_raw] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002126 "sub $0x8,%[width] \n"
2127 "jg 1b \n"
2128 : [y_buf]"+r"(y_buf), // %[y_buf]
2129 [u_buf]"+r"(u_buf), // %[u_buf]
2130 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002131 [dst_raw]"+r"(dst_raw), // %[dst_raw]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002132 [width]"+rm"(width) // %[width]
2133 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
2134#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002135 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
2136 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002137#endif
2138 : "memory", "cc"
2139#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002140 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002141#endif
2142 );
2143}
2144
// Converts one row of I422 (one U and one V sample per two Y samples) to
// 32-bit pixels, 8 pixels per loop iteration. Bytes are stored in B,G,R,A
// order with A = 0xff.
// Stores use movdqa, so dst_argb must be 16-byte aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002145void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002146 const uint8* u_buf,
2147 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002148 uint8* dst_argb,
fbarchard@google.comdbcabea2012-10-29 21:20:25 +00002149 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002150 asm volatile (
// v_buf becomes the offset (v_buf - u_buf) that READYUV422 indexes with.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002151 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002152 "pcmpeqb %%xmm5,%%xmm5 \n"
2153 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002154 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002155 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002156 READYUV422
2157 YUVTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002158 "punpcklbw %%xmm1,%%xmm0 \n"
2159 "punpcklbw %%xmm5,%%xmm2 \n"
2160 "movdqa %%xmm0,%%xmm1 \n"
2161 "punpcklwd %%xmm2,%%xmm0 \n"
2162 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002163 "movdqa %%xmm0,(%[dst_argb]) \n"
2164 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2165 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002166 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002167 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002168 : [y_buf]"+r"(y_buf), // %[y_buf]
2169 [u_buf]"+r"(u_buf), // %[u_buf]
2170 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002171 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002172 [width]"+rm"(width) // %[width]
2173 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002174 : "memory", "cc"
2175#if defined(__SSE2__)
2176 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2177#endif
2178 );
2179}
2180
// Converts one row of I411 (one U and one V sample per four Y samples) to
// 32-bit pixels, 8 pixels per loop iteration. Bytes are stored in B,G,R,A
// order with A = 0xff.
// Stores use movdqa, so dst_argb must be 16-byte aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
// NOTE(review): READYUV411 loads 4 bytes from each chroma plane per iteration
// but consumes only 2, so it reads 2 bytes past the samples it uses - confirm
// the chroma buffers tolerate this.
2181void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
2182 const uint8* u_buf,
2183 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002184 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002185 int width) {
2186 asm volatile (
// v_buf becomes the offset (v_buf - u_buf) that READYUV411 indexes with.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002187 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.come214fe32012-06-04 23:47:11 +00002188 "pcmpeqb %%xmm5,%%xmm5 \n"
2189 "pxor %%xmm4,%%xmm4 \n"
2190 ".p2align 4 \n"
2191 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002192 READYUV411
2193 YUVTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002194 "punpcklbw %%xmm1,%%xmm0 \n"
2195 "punpcklbw %%xmm5,%%xmm2 \n"
2196 "movdqa %%xmm0,%%xmm1 \n"
2197 "punpcklwd %%xmm2,%%xmm0 \n"
2198 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002199 "movdqa %%xmm0,(%[dst_argb]) \n"
2200 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2201 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002202 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002203 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002204 : [y_buf]"+r"(y_buf), // %[y_buf]
2205 [u_buf]"+r"(u_buf), // %[u_buf]
2206 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002207 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002208 [width]"+rm"(width) // %[width]
2209 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2210 : "memory", "cc"
2211#if defined(__SSE2__)
2212 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2213#endif
2214 );
2215}
2216
// Converts one row of NV12 (a Y plane plus interleaved U,V pairs in uv_buf,
// one pair per two Y samples) to 32-bit pixels, 8 pixels per loop iteration.
// Bytes are stored in B,G,R,A order with A = 0xff.
// Stores use movdqa, so dst_argb must be 16-byte aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
2217void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
2218 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002219 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002220 int width) {
2221 asm volatile (
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YUVTORGB to widen Y).
2222 "pcmpeqb %%xmm5,%%xmm5 \n"
2223 "pxor %%xmm4,%%xmm4 \n"
2224 ".p2align 4 \n"
2225 "1: \n"
2226 READNV12
2227 YUVTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
2228 "punpcklbw %%xmm1,%%xmm0 \n"
2229 "punpcklbw %%xmm5,%%xmm2 \n"
2230 "movdqa %%xmm0,%%xmm1 \n"
2231 "punpcklwd %%xmm2,%%xmm0 \n"
2232 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002233 "movdqa %%xmm0,(%[dst_argb]) \n"
2234 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2235 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002236 "sub $0x8,%[width] \n"
2237 "jg 1b \n"
2238 : [y_buf]"+r"(y_buf), // %[y_buf]
2239 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002240 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002241 [width]"+rm"(width) // %[width]
2242 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2243 : "memory", "cc"
2244#if defined(__SSE2__)
2245 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2246#endif
2247 );
2248}
2249
// Converts one row of NV21 to 32-bit pixels, 8 pixels per loop iteration.
// Bytes are stored in B,G,R,A order with A = 0xff.
// Although the parameter is named uv_buf, the chroma bytes are treated as
// V,U pairs: the row is converted with YVUTORGB, which multiplies by the
// swapped kVUTo* coefficient tables.
// Stores use movdqa, so dst_argb must be 16-byte aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
2250void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002251 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002252 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002253 int width) {
2254 asm volatile (
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YVUTORGB to widen Y).
2255 "pcmpeqb %%xmm5,%%xmm5 \n"
2256 "pxor %%xmm4,%%xmm4 \n"
2257 ".p2align 4 \n"
2258 "1: \n"
2259 READNV12
2260 YVUTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
2261 "punpcklbw %%xmm1,%%xmm0 \n"
2262 "punpcklbw %%xmm5,%%xmm2 \n"
2263 "movdqa %%xmm0,%%xmm1 \n"
2264 "punpcklwd %%xmm2,%%xmm0 \n"
2265 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002266 "movdqa %%xmm0,(%[dst_argb]) \n"
2267 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2268 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002269 "sub $0x8,%[width] \n"
2270 "jg 1b \n"
2271 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002272 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2273 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002274 [width]"+rm"(width) // %[width]
2275 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002276 : "memory", "cc"
2277#if defined(__SSE2__)
2278 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2279#endif
2280 );
2281}
2282
// Converts one row of I444 (one U and one V sample per Y sample) to 32-bit
// pixels, 8 pixels per loop iteration. Bytes are stored in B,G,R,A order with
// A = 0xff.
// Stores use movdqu, so dst_argb does not need to be aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
2283void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2284 const uint8* u_buf,
2285 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002286 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002287 int width) {
2288 asm volatile (
// v_buf becomes the offset (v_buf - u_buf) that READYUV444 indexes with.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002289 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.come214fe32012-06-04 23:47:11 +00002290 "pcmpeqb %%xmm5,%%xmm5 \n"
2291 "pxor %%xmm4,%%xmm4 \n"
2292 ".p2align 4 \n"
2293 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002294 READYUV444
2295 YUVTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002296 "punpcklbw %%xmm1,%%xmm0 \n"
2297 "punpcklbw %%xmm5,%%xmm2 \n"
2298 "movdqa %%xmm0,%%xmm1 \n"
2299 "punpcklwd %%xmm2,%%xmm0 \n"
2300 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002301 "movdqu %%xmm0,(%[dst_argb]) \n"
2302 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2303 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002304 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002305 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002306 : [y_buf]"+r"(y_buf), // %[y_buf]
2307 [u_buf]"+r"(u_buf), // %[u_buf]
2308 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002309 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002310 [width]"+rm"(width) // %[width]
2311 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002312 : "memory", "cc"
2313#if defined(__SSE2__)
2314 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2315#endif
2316 );
2317}
2318
// Converts one row of I422 (one U and one V sample per two Y samples) to
// 32-bit pixels, 8 pixels per loop iteration. Bytes are stored in B,G,R,A
// order with A = 0xff.
// Stores use movdqu, so dst_argb does not need to be aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
2319void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2320 const uint8* u_buf,
2321 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002322 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002323 int width) {
2324 asm volatile (
// v_buf becomes the offset (v_buf - u_buf) that READYUV422 indexes with.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002325 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.come214fe32012-06-04 23:47:11 +00002326 "pcmpeqb %%xmm5,%%xmm5 \n"
2327 "pxor %%xmm4,%%xmm4 \n"
2328 ".p2align 4 \n"
2329 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002330 READYUV422
2331 YUVTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002332 "punpcklbw %%xmm1,%%xmm0 \n"
2333 "punpcklbw %%xmm5,%%xmm2 \n"
2334 "movdqa %%xmm0,%%xmm1 \n"
2335 "punpcklwd %%xmm2,%%xmm0 \n"
2336 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002337 "movdqu %%xmm0,(%[dst_argb]) \n"
2338 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2339 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002340 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002341 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002342 : [y_buf]"+r"(y_buf), // %[y_buf]
2343 [u_buf]"+r"(u_buf), // %[u_buf]
2344 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002345 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002346 [width]"+rm"(width) // %[width]
2347 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002348 : "memory", "cc"
2349#if defined(__SSE2__)
2350 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2351#endif
2352 );
2353}
2354
// Converts one row of I411 (one U and one V sample per four Y samples) to
// 32-bit pixels, 8 pixels per loop iteration. Bytes are stored in B,G,R,A
// order with A = 0xff.
// Stores use movdqu, so dst_argb does not need to be aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
// NOTE(review): READYUV411 loads 4 bytes from each chroma plane per iteration
// but consumes only 2, so it reads 2 bytes past the samples it uses - confirm
// the chroma buffers tolerate this.
2355void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2356 const uint8* u_buf,
2357 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002358 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002359 int width) {
2360 asm volatile (
// v_buf becomes the offset (v_buf - u_buf) that READYUV411 indexes with.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002361 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.come214fe32012-06-04 23:47:11 +00002362 "pcmpeqb %%xmm5,%%xmm5 \n"
2363 "pxor %%xmm4,%%xmm4 \n"
2364 ".p2align 4 \n"
2365 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002366 READYUV411
2367 YUVTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002368 "punpcklbw %%xmm1,%%xmm0 \n"
2369 "punpcklbw %%xmm5,%%xmm2 \n"
2370 "movdqa %%xmm0,%%xmm1 \n"
2371 "punpcklwd %%xmm2,%%xmm0 \n"
2372 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002373 "movdqu %%xmm0,(%[dst_argb]) \n"
2374 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2375 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002376 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002377 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002378 : [y_buf]"+r"(y_buf), // %[y_buf]
2379 [u_buf]"+r"(u_buf), // %[u_buf]
2380 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002381 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002382 [width]"+rm"(width) // %[width]
2383 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2384 : "memory", "cc"
2385#if defined(__SSE2__)
2386 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2387#endif
2388 );
2389}
2390
// Converts one row of NV12 (a Y plane plus interleaved U,V pairs in uv_buf,
// one pair per two Y samples) to 32-bit pixels, 8 pixels per loop iteration.
// Bytes are stored in B,G,R,A order with A = 0xff.
// Stores use movdqu, so dst_argb does not need to be aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
2391void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2392 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002393 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002394 int width) {
2395 asm volatile (
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YUVTORGB to widen Y).
2396 "pcmpeqb %%xmm5,%%xmm5 \n"
2397 "pxor %%xmm4,%%xmm4 \n"
2398 ".p2align 4 \n"
2399 "1: \n"
2400 READNV12
2401 YUVTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
2402 "punpcklbw %%xmm1,%%xmm0 \n"
2403 "punpcklbw %%xmm5,%%xmm2 \n"
2404 "movdqa %%xmm0,%%xmm1 \n"
2405 "punpcklwd %%xmm2,%%xmm0 \n"
2406 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002407 "movdqu %%xmm0,(%[dst_argb]) \n"
2408 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2409 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002410 "sub $0x8,%[width] \n"
2411 "jg 1b \n"
2412 : [y_buf]"+r"(y_buf), // %[y_buf]
2413 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002414 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002415 [width]"+rm"(width) // %[width]
2416 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2417 : "memory", "cc"
2418#if defined(__SSE2__)
2419 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2420#endif
2421 );
2422}
2423
// Converts one row of NV21 to 32-bit pixels, 8 pixels per loop iteration.
// Bytes are stored in B,G,R,A order with A = 0xff.
// Although the parameter is named uv_buf, the chroma bytes are treated as
// V,U pairs: the row is converted with YVUTORGB, which multiplies by the
// swapped kVUTo* coefficient tables.
// Stores use movdqu, so dst_argb does not need to be aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
2424void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002425 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002426 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002427 int width) {
2428 asm volatile (
// xmm5 = all ones (alpha bytes); xmm4 = zero (used by YVUTORGB to widen Y).
2429 "pcmpeqb %%xmm5,%%xmm5 \n"
2430 "pxor %%xmm4,%%xmm4 \n"
2431 ".p2align 4 \n"
2432 "1: \n"
2433 READNV12
2434 YVUTORGB
// Interleave B,G and R,A bytes, then weave the 16-bit halves into pixels.
2435 "punpcklbw %%xmm1,%%xmm0 \n"
2436 "punpcklbw %%xmm5,%%xmm2 \n"
2437 "movdqa %%xmm0,%%xmm1 \n"
2438 "punpcklwd %%xmm2,%%xmm0 \n"
2439 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002440 "movdqu %%xmm0,(%[dst_argb]) \n"
2441 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2442 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002443 "sub $0x8,%[width] \n"
2444 "jg 1b \n"
2445 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002446 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2447 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002448 [width]"+rm"(width) // %[width]
2449 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002450 : "memory", "cc"
2451#if defined(__SSE2__)
2452 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2453#endif
2454 );
2455}
2456
// Converts one row of I422 (one U and one V sample per two Y samples) to
// 32-bit pixels, 8 pixels per loop iteration. Bytes are stored in A,R,G,B
// order with A = 0xff.
// Stores use movdqa, so dst_bgra must be 16-byte aligned.
// The loop is do-while style: it always runs at least once and converts
// 8 pixels per pass.
2457void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2458 const uint8* u_buf,
2459 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002460 uint8* dst_bgra,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002461 int width) {
2462 asm volatile (
// v_buf becomes the offset (v_buf - u_buf) that READYUV422 indexes with.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002463 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002464 "pcmpeqb %%xmm5,%%xmm5 \n"
// xmm4 = zero, used by YUVTORGB to widen Y to 16 bits.
2465 "pxor %%xmm4,%%xmm4 \n"
2466 ".p2align 4 \n"
2467 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002468 READYUV422
2469 YUVTORGB
// xmm5 is reloaded with all ones every iteration because the unpack below
// overwrites it with the A,R byte pairs.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002470 "pcmpeqb %%xmm5,%%xmm5 \n"
// xmm1 = G,B pairs; xmm5 = A,R pairs; word interleave gives A,R,G,B pixels.
2471 "punpcklbw %%xmm0,%%xmm1 \n"
2472 "punpcklbw %%xmm2,%%xmm5 \n"
2473 "movdqa %%xmm5,%%xmm0 \n"
2474 "punpcklwd %%xmm1,%%xmm5 \n"
2475 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002476 "movdqa %%xmm5,(%[dst_bgra]) \n"
2477 "movdqa %%xmm0,0x10(%[dst_bgra]) \n"
2478 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002479 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002480 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002481 : [y_buf]"+r"(y_buf), // %[y_buf]
2482 [u_buf]"+r"(u_buf), // %[u_buf]
2483 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002484 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002485 [width]"+rm"(width) // %[width]
2486 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002487 : "memory", "cc"
2488#if defined(__SSE2__)
2489 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2490#endif
2491 );
2492}
2493
fbarchard@google.come214fe32012-06-04 23:47:11 +00002494void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002495 const uint8* u_buf,
2496 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002497 uint8* dst_abgr,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002498 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002499 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002500 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002501 "pcmpeqb %%xmm5,%%xmm5 \n"
2502 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002503 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002504 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002505 READYUV422
2506 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002507 "punpcklbw %%xmm1,%%xmm2 \n"
2508 "punpcklbw %%xmm5,%%xmm0 \n"
2509 "movdqa %%xmm2,%%xmm1 \n"
2510 "punpcklwd %%xmm0,%%xmm2 \n"
2511 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002512 "movdqa %%xmm2,(%[dst_abgr]) \n"
2513 "movdqa %%xmm1,0x10(%[dst_abgr]) \n"
2514 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002515 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002516 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002517 : [y_buf]"+r"(y_buf), // %[y_buf]
2518 [u_buf]"+r"(u_buf), // %[u_buf]
2519 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002520 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002521 [width]"+rm"(width) // %[width]
2522 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002523 : "memory", "cc"
2524#if defined(__SSE2__)
2525 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2526#endif
2527 );
2528}
2529
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002530void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2531 const uint8* u_buf,
2532 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002533 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002534 int width) {
2535 asm volatile (
2536 "sub %[u_buf],%[v_buf] \n"
2537 "pcmpeqb %%xmm5,%%xmm5 \n"
2538 "pxor %%xmm4,%%xmm4 \n"
2539 ".p2align 4 \n"
2540 "1: \n"
2541 READYUV422
2542 YUVTORGB
2543 "pcmpeqb %%xmm5,%%xmm5 \n"
2544 "punpcklbw %%xmm2,%%xmm1 \n"
2545 "punpcklbw %%xmm0,%%xmm5 \n"
2546 "movdqa %%xmm5,%%xmm0 \n"
2547 "punpcklwd %%xmm1,%%xmm5 \n"
2548 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002549 "movdqa %%xmm5,(%[dst_rgba]) \n"
2550 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2551 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002552 "sub $0x8,%[width] \n"
2553 "jg 1b \n"
2554 : [y_buf]"+r"(y_buf), // %[y_buf]
2555 [u_buf]"+r"(u_buf), // %[u_buf]
2556 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002557 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002558 [width]"+rm"(width) // %[width]
2559 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2560 : "memory", "cc"
2561#if defined(__SSE2__)
2562 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2563#endif
2564 );
2565}
2566
fbarchard@google.come214fe32012-06-04 23:47:11 +00002567void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002568 const uint8* u_buf,
2569 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002570 uint8* dst_bgra,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002571 int width) {
2572 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002573 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002574 "pcmpeqb %%xmm5,%%xmm5 \n"
2575 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002576 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002577 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002578 READYUV422
2579 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002580 "pcmpeqb %%xmm5,%%xmm5 \n"
2581 "punpcklbw %%xmm0,%%xmm1 \n"
2582 "punpcklbw %%xmm2,%%xmm5 \n"
2583 "movdqa %%xmm5,%%xmm0 \n"
2584 "punpcklwd %%xmm1,%%xmm5 \n"
2585 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002586 "movdqu %%xmm5,(%[dst_bgra]) \n"
2587 "movdqu %%xmm0,0x10(%[dst_bgra]) \n"
2588 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002589 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002590 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002591 : [y_buf]"+r"(y_buf), // %[y_buf]
2592 [u_buf]"+r"(u_buf), // %[u_buf]
2593 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002594 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002595 [width]"+rm"(width) // %[width]
2596 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002597 : "memory", "cc"
2598#if defined(__SSE2__)
2599 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2600#endif
2601 );
2602}
2603
fbarchard@google.come214fe32012-06-04 23:47:11 +00002604void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002605 const uint8* u_buf,
2606 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002607 uint8* dst_abgr,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002608 int width) {
2609 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002610 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002611 "pcmpeqb %%xmm5,%%xmm5 \n"
2612 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002613 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002614 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002615 READYUV422
2616 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002617 "punpcklbw %%xmm1,%%xmm2 \n"
2618 "punpcklbw %%xmm5,%%xmm0 \n"
2619 "movdqa %%xmm2,%%xmm1 \n"
2620 "punpcklwd %%xmm0,%%xmm2 \n"
2621 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002622 "movdqu %%xmm2,(%[dst_abgr]) \n"
2623 "movdqu %%xmm1,0x10(%[dst_abgr]) \n"
2624 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002625 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002626 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002627 : [y_buf]"+r"(y_buf), // %[y_buf]
2628 [u_buf]"+r"(u_buf), // %[u_buf]
2629 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002630 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002631 [width]"+rm"(width) // %[width]
2632 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002633 : "memory", "cc"
2634#if defined(__SSE2__)
2635 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2636#endif
2637 );
2638}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002639
2640void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2641 const uint8* u_buf,
2642 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002643 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002644 int width) {
2645 asm volatile (
2646 "sub %[u_buf],%[v_buf] \n"
2647 "pcmpeqb %%xmm5,%%xmm5 \n"
2648 "pxor %%xmm4,%%xmm4 \n"
2649 ".p2align 4 \n"
2650 "1: \n"
2651 READYUV422
2652 YUVTORGB
2653 "pcmpeqb %%xmm5,%%xmm5 \n"
2654 "punpcklbw %%xmm2,%%xmm1 \n"
2655 "punpcklbw %%xmm0,%%xmm5 \n"
2656 "movdqa %%xmm5,%%xmm0 \n"
2657 "punpcklwd %%xmm1,%%xmm5 \n"
2658 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002659 "movdqa %%xmm5,(%[dst_rgba]) \n"
2660 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2661 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002662 "sub $0x8,%[width] \n"
2663 "jg 1b \n"
2664 : [y_buf]"+r"(y_buf), // %[y_buf]
2665 [u_buf]"+r"(u_buf), // %[u_buf]
2666 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002667 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002668 [width]"+rm"(width) // %[width]
2669 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2670 : "memory", "cc"
2671#if defined(__SSE2__)
2672 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2673#endif
2674 );
2675}
2676
fbarchard@google.come214fe32012-06-04 23:47:11 +00002677#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002678
2679#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002680void YToARGBRow_SSE2(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002681 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002682 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002683 asm volatile (
fbarchard@google.com30859f72012-11-02 09:51:29 +00002684 "pxor %%xmm5,%%xmm5 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002685 "pcmpeqb %%xmm4,%%xmm4 \n"
2686 "pslld $0x18,%%xmm4 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002687 "mov $0x00100010,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002688 "movd %%eax,%%xmm3 \n"
2689 "pshufd $0x0,%%xmm3,%%xmm3 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002690 "mov $0x004a004a,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002691 "movd %%eax,%%xmm2 \n"
2692 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002693 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002694 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002695 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002696 "movq (%0),%%xmm0 \n"
2697 "lea 0x8(%0),%0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002698 "punpcklbw %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002699 "psubusw %%xmm3,%%xmm0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002700 "pmullw %%xmm2,%%xmm0 \n"
2701 "psrlw $6, %%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002702 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002703
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002704 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002705 "punpcklbw %%xmm0,%%xmm0 \n"
2706 "movdqa %%xmm0,%%xmm1 \n"
2707 "punpcklwd %%xmm0,%%xmm0 \n"
2708 "punpckhwd %%xmm1,%%xmm1 \n"
2709 "por %%xmm4,%%xmm0 \n"
2710 "por %%xmm4,%%xmm1 \n"
2711 "movdqa %%xmm0,(%1) \n"
2712 "movdqa %%xmm1,16(%1) \n"
2713 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002714
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002715 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002716 "jg 1b \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002717 : "+r"(y_buf), // %0
2718 "+r"(dst_argb), // %1
2719 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002720 :
2721 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002722#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002723 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002724#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002725 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002726}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002727#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002728
fbarchard@google.com42831e02012-01-21 02:54:17 +00002729#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002730// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002731CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002732 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2733};
2734
fbarchard@google.com42831e02012-01-21 02:54:17 +00002735void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002736 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002737 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002738 "movdqa %3,%%xmm5 \n"
2739 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002740 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002741 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002742 "movdqa (%0,%2),%%xmm0 \n"
2743 "pshufb %%xmm5,%%xmm0 \n"
2744 "sub $0x10,%2 \n"
2745 "movdqa %%xmm0,(%1) \n"
2746 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002747 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002748 : "+r"(src), // %0
2749 "+r"(dst), // %1
2750 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002751 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002752 : "memory", "cc"
2753#if defined(__SSE2__)
2754 , "xmm0", "xmm5"
2755#endif
2756 );
2757}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002758#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002759
fbarchard@google.com42831e02012-01-21 02:54:17 +00002760#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002761void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002762 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002763 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002764 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002765 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002766 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002767 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002768 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002769 "psllw $0x8,%%xmm0 \n"
2770 "psrlw $0x8,%%xmm1 \n"
2771 "por %%xmm1,%%xmm0 \n"
2772 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2773 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2774 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2775 "sub $0x10,%2 \n"
2776 "movdqu %%xmm0,(%1) \n"
2777 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002778 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002779 : "+r"(src), // %0
2780 "+r"(dst), // %1
2781 "+r"(temp_width) // %2
2782 :
2783 : "memory", "cc"
2784#if defined(__SSE2__)
2785 , "xmm0", "xmm1"
2786#endif
2787 );
2788}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002789#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002790
fbarchard@google.com16a96642012-03-02 22:38:09 +00002791#ifdef HAS_MIRRORROW_UV_SSSE3
2792// Shuffle table for reversing the bytes of UV channels.
2793CONST uvec8 kShuffleMirrorUV = {
2794 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2795};
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002796void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
fbarchard@google.com16a96642012-03-02 22:38:09 +00002797 int width) {
2798 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002799 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002800 "movdqa %4,%%xmm1 \n"
2801 "lea -16(%0,%3,2),%0 \n"
2802 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002803 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002804 "1: \n"
2805 "movdqa (%0),%%xmm0 \n"
2806 "lea -16(%0),%0 \n"
2807 "pshufb %%xmm1,%%xmm0 \n"
2808 "sub $8,%3 \n"
2809 "movlpd %%xmm0,(%1) \n"
2810 "movhpd %%xmm0,(%1,%2) \n"
2811 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002812 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002813 : "+r"(src), // %0
2814 "+r"(dst_u), // %1
2815 "+r"(dst_v), // %2
2816 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002817 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002818 : "memory", "cc"
2819#if defined(__SSE2__)
2820 , "xmm0", "xmm1"
2821#endif
2822 );
2823}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002824#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002825
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002826#ifdef HAS_ARGBMIRRORROW_SSSE3
2827// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002828CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002829 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2830};
2831
2832void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2833 intptr_t temp_width = static_cast<intptr_t>(width);
2834 asm volatile (
2835 "movdqa %3,%%xmm5 \n"
2836 "lea -0x10(%0),%0 \n"
2837 ".p2align 4 \n"
2838 "1: \n"
2839 "movdqa (%0,%2,4),%%xmm0 \n"
2840 "pshufb %%xmm5,%%xmm0 \n"
2841 "sub $0x4,%2 \n"
2842 "movdqa %%xmm0,(%1) \n"
2843 "lea 0x10(%1),%1 \n"
2844 "jg 1b \n"
2845 : "+r"(src), // %0
2846 "+r"(dst), // %1
2847 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002848 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002849 : "memory", "cc"
2850#if defined(__SSE2__)
2851 , "xmm0", "xmm5"
2852#endif
2853 );
2854}
2855#endif // HAS_ARGBMIRRORROW_SSSE3
2856
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002857#ifdef HAS_SPLITUVROW_SSE2
2858void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002859 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002860 "pcmpeqb %%xmm5,%%xmm5 \n"
2861 "psrlw $0x8,%%xmm5 \n"
2862 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002863 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002864 "1: \n"
2865 "movdqa (%0),%%xmm0 \n"
2866 "movdqa 0x10(%0),%%xmm1 \n"
2867 "lea 0x20(%0),%0 \n"
2868 "movdqa %%xmm0,%%xmm2 \n"
2869 "movdqa %%xmm1,%%xmm3 \n"
2870 "pand %%xmm5,%%xmm0 \n"
2871 "pand %%xmm5,%%xmm1 \n"
2872 "packuswb %%xmm1,%%xmm0 \n"
2873 "psrlw $0x8,%%xmm2 \n"
2874 "psrlw $0x8,%%xmm3 \n"
2875 "packuswb %%xmm3,%%xmm2 \n"
2876 "movdqa %%xmm0,(%1) \n"
2877 "movdqa %%xmm2,(%1,%2) \n"
2878 "lea 0x10(%1),%1 \n"
2879 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002880 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002881 : "+r"(src_uv), // %0
2882 "+r"(dst_u), // %1
2883 "+r"(dst_v), // %2
2884 "+r"(pix) // %3
2885 :
2886 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002887#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002888 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002889#endif
2890 );
2891}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002892
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002893void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2894 int pix) {
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002895 asm volatile (
2896 "pcmpeqb %%xmm5,%%xmm5 \n"
2897 "psrlw $0x8,%%xmm5 \n"
2898 "sub %1,%2 \n"
2899 ".p2align 4 \n"
2900 "1: \n"
2901 "movdqu (%0),%%xmm0 \n"
2902 "movdqu 0x10(%0),%%xmm1 \n"
2903 "lea 0x20(%0),%0 \n"
2904 "movdqa %%xmm0,%%xmm2 \n"
2905 "movdqa %%xmm1,%%xmm3 \n"
2906 "pand %%xmm5,%%xmm0 \n"
2907 "pand %%xmm5,%%xmm1 \n"
2908 "packuswb %%xmm1,%%xmm0 \n"
2909 "psrlw $0x8,%%xmm2 \n"
2910 "psrlw $0x8,%%xmm3 \n"
2911 "packuswb %%xmm3,%%xmm2 \n"
2912 "movdqu %%xmm0,(%1) \n"
2913 "movdqu %%xmm2,(%1,%2) \n"
2914 "lea 0x10(%1),%1 \n"
2915 "sub $0x10,%3 \n"
2916 "jg 1b \n"
2917 : "+r"(src_uv), // %0
2918 "+r"(dst_u), // %1
2919 "+r"(dst_v), // %2
2920 "+r"(pix) // %3
2921 :
2922 : "memory", "cc"
2923#if defined(__SSE2__)
2924 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2925#endif
2926 );
2927}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002928#endif // HAS_SPLITUVROW_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002929
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002930#ifdef HAS_MERGEUVROW_SSE2
2931void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2932 int width) {
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002933 asm volatile (
2934 "sub %0,%1 \n"
2935 ".p2align 4 \n"
2936 "1: \n"
2937 "movdqa (%0),%%xmm0 \n"
2938 "movdqa (%0,%1,1),%%xmm1 \n"
2939 "lea 0x10(%0),%0 \n"
2940 "movdqa %%xmm0,%%xmm2 \n"
2941 "punpcklbw %%xmm1,%%xmm0 \n"
2942 "punpckhbw %%xmm1,%%xmm2 \n"
2943 "movdqa %%xmm0,(%2) \n"
2944 "movdqa %%xmm2,0x10(%2) \n"
2945 "lea 0x20(%2),%2 \n"
2946 "sub $0x10,%3 \n"
2947 "jg 1b \n"
2948 : "+r"(src_u), // %0
2949 "+r"(src_v), // %1
2950 "+r"(dst_uv), // %2
2951 "+r"(width) // %3
2952 :
2953 : "memory", "cc"
2954#if defined(__SSE2__)
2955 , "xmm0", "xmm1", "xmm2"
2956#endif
2957 );
2958}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002959
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002960void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2961 uint8* dst_uv, int width) {
fbarchard@google.come0d86482012-10-27 19:07:55 +00002962 asm volatile (
2963 "sub %0,%1 \n"
2964 ".p2align 4 \n"
2965 "1: \n"
2966 "movdqu (%0),%%xmm0 \n"
2967 "movdqu (%0,%1,1),%%xmm1 \n"
2968 "lea 0x10(%0),%0 \n"
2969 "movdqa %%xmm0,%%xmm2 \n"
2970 "punpcklbw %%xmm1,%%xmm0 \n"
2971 "punpckhbw %%xmm1,%%xmm2 \n"
2972 "movdqu %%xmm0,(%2) \n"
2973 "movdqu %%xmm2,0x10(%2) \n"
2974 "lea 0x20(%2),%2 \n"
2975 "sub $0x10,%3 \n"
2976 "jg 1b \n"
2977 : "+r"(src_u), // %0
2978 "+r"(src_v), // %1
2979 "+r"(dst_uv), // %2
2980 "+r"(width) // %3
2981 :
2982 : "memory", "cc"
2983#if defined(__SSE2__)
2984 , "xmm0", "xmm1", "xmm2"
2985#endif
2986 );
2987}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002988#endif // HAS_MERGEUVROW_SSE2
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002989
fbarchard@google.com19932f82012-02-16 22:19:14 +00002990#ifdef HAS_COPYROW_SSE2
2991void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002992 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002993 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002994 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002995 "1: \n"
2996 "movdqa (%0),%%xmm0 \n"
2997 "movdqa 0x10(%0),%%xmm1 \n"
2998 "movdqa %%xmm0,(%0,%1) \n"
2999 "movdqa %%xmm1,0x10(%0,%1) \n"
3000 "lea 0x20(%0),%0 \n"
3001 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003002 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00003003 : "+r"(src), // %0
3004 "+r"(dst), // %1
3005 "+r"(count) // %2
3006 :
3007 : "memory", "cc"
3008#if defined(__SSE2__)
3009 , "xmm0", "xmm1"
3010#endif
3011 );
3012}
3013#endif // HAS_COPYROW_SSE2
3014
3015#ifdef HAS_COPYROW_X86
3016void CopyRow_X86(const uint8* src, uint8* dst, int width) {
3017 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003018 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00003019 "shr $0x2,%2 \n"
3020 "rep movsl \n"
3021 : "+S"(src), // %0
3022 "+D"(dst), // %1
3023 "+c"(width_tmp) // %2
3024 :
3025 : "memory", "cc"
3026 );
3027}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00003028#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00003029
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003030#ifdef HAS_SETROW_X86
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00003031void SetRow_X86(uint8* dst, uint32 v32, int width) {
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003032 size_t width_tmp = static_cast<size_t>(width);
3033 asm volatile (
3034 "shr $0x2,%1 \n"
3035 "rep stosl \n"
3036 : "+D"(dst), // %0
3037 "+c"(width_tmp) // %1
3038 : "a"(v32) // %2
3039 : "memory", "cc");
3040}
3041
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00003042void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003043 int dst_stride, int height) {
3044 for (int y = 0; y < height; ++y) {
3045 size_t width_tmp = static_cast<size_t>(width);
3046 uint32* d = reinterpret_cast<uint32*>(dst);
3047 asm volatile (
3048 "rep stosl \n"
3049 : "+D"(d), // %0
3050 "+c"(width_tmp) // %1
3051 : "a"(v32) // %2
3052 : "memory", "cc");
3053 dst += dst_stride;
3054 }
3055}
3056#endif // HAS_SETROW_X86
3057
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003058#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003059void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003060 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003061 "pcmpeqb %%xmm5,%%xmm5 \n"
3062 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003063 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003064 "1: \n"
3065 "movdqa (%0),%%xmm0 \n"
3066 "movdqa 0x10(%0),%%xmm1 \n"
3067 "lea 0x20(%0),%0 \n"
3068 "pand %%xmm5,%%xmm0 \n"
3069 "pand %%xmm5,%%xmm1 \n"
3070 "packuswb %%xmm1,%%xmm0 \n"
3071 "movdqa %%xmm0,(%1) \n"
3072 "lea 0x10(%1),%1 \n"
3073 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003074 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003075 : "+r"(src_yuy2), // %0
3076 "+r"(dst_y), // %1
3077 "+r"(pix) // %2
3078 :
3079 : "memory", "cc"
3080#if defined(__SSE2__)
3081 , "xmm0", "xmm1", "xmm5"
3082#endif
3083 );
3084}
3085
3086void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003087 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003088 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003089 "pcmpeqb %%xmm5,%%xmm5 \n"
3090 "psrlw $0x8,%%xmm5 \n"
3091 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003092 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003093 "1: \n"
3094 "movdqa (%0),%%xmm0 \n"
3095 "movdqa 0x10(%0),%%xmm1 \n"
3096 "movdqa (%0,%4,1),%%xmm2 \n"
3097 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
3098 "lea 0x20(%0),%0 \n"
3099 "pavgb %%xmm2,%%xmm0 \n"
3100 "pavgb %%xmm3,%%xmm1 \n"
3101 "psrlw $0x8,%%xmm0 \n"
3102 "psrlw $0x8,%%xmm1 \n"
3103 "packuswb %%xmm1,%%xmm0 \n"
3104 "movdqa %%xmm0,%%xmm1 \n"
3105 "pand %%xmm5,%%xmm0 \n"
3106 "packuswb %%xmm0,%%xmm0 \n"
3107 "psrlw $0x8,%%xmm1 \n"
3108 "packuswb %%xmm1,%%xmm1 \n"
3109 "movq %%xmm0,(%1) \n"
3110 "movq %%xmm1,(%1,%2) \n"
3111 "lea 0x8(%1),%1 \n"
3112 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003113 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003114 : "+r"(src_yuy2), // %0
3115 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003116 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003117 "+r"(pix) // %3
3118 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
3119 : "memory", "cc"
3120#if defined(__SSE2__)
3121 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3122#endif
3123 );
3124}
3125
fbarchard@google.comc704f782012-08-30 19:53:48 +00003126void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3127 uint8* dst_u, uint8* dst_v, int pix) {
3128 asm volatile (
3129 "pcmpeqb %%xmm5,%%xmm5 \n"
3130 "psrlw $0x8,%%xmm5 \n"
3131 "sub %1,%2 \n"
3132 ".p2align 4 \n"
3133 "1: \n"
3134 "movdqa (%0),%%xmm0 \n"
3135 "movdqa 0x10(%0),%%xmm1 \n"
3136 "lea 0x20(%0),%0 \n"
3137 "psrlw $0x8,%%xmm0 \n"
3138 "psrlw $0x8,%%xmm1 \n"
3139 "packuswb %%xmm1,%%xmm0 \n"
3140 "movdqa %%xmm0,%%xmm1 \n"
3141 "pand %%xmm5,%%xmm0 \n"
3142 "packuswb %%xmm0,%%xmm0 \n"
3143 "psrlw $0x8,%%xmm1 \n"
3144 "packuswb %%xmm1,%%xmm1 \n"
3145 "movq %%xmm0,(%1) \n"
3146 "movq %%xmm1,(%1,%2) \n"
3147 "lea 0x8(%1),%1 \n"
3148 "sub $0x10,%3 \n"
3149 "jg 1b \n"
3150 : "+r"(src_yuy2), // %0
3151 "+r"(dst_u), // %1
3152 "+r"(dst_v), // %2
3153 "+r"(pix) // %3
3154 :
3155 : "memory", "cc"
3156#if defined(__SSE2__)
3157 , "xmm0", "xmm1", "xmm5"
3158#endif
3159 );
3160}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00003161
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003162void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
3163 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003164 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003165 "pcmpeqb %%xmm5,%%xmm5 \n"
3166 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003167 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003168 "1: \n"
3169 "movdqu (%0),%%xmm0 \n"
3170 "movdqu 0x10(%0),%%xmm1 \n"
3171 "lea 0x20(%0),%0 \n"
3172 "pand %%xmm5,%%xmm0 \n"
3173 "pand %%xmm5,%%xmm1 \n"
3174 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003175 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003176 "movdqu %%xmm0,(%1) \n"
3177 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003178 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003179 : "+r"(src_yuy2), // %0
3180 "+r"(dst_y), // %1
3181 "+r"(pix) // %2
3182 :
3183 : "memory", "cc"
3184#if defined(__SSE2__)
3185 , "xmm0", "xmm1", "xmm5"
3186#endif
3187 );
3188}
3189
3190void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
3191 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00003192 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003193 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003194 "pcmpeqb %%xmm5,%%xmm5 \n"
3195 "psrlw $0x8,%%xmm5 \n"
3196 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003197 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003198 "1: \n"
3199 "movdqu (%0),%%xmm0 \n"
3200 "movdqu 0x10(%0),%%xmm1 \n"
3201 "movdqu (%0,%4,1),%%xmm2 \n"
3202 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3203 "lea 0x20(%0),%0 \n"
3204 "pavgb %%xmm2,%%xmm0 \n"
3205 "pavgb %%xmm3,%%xmm1 \n"
3206 "psrlw $0x8,%%xmm0 \n"
3207 "psrlw $0x8,%%xmm1 \n"
3208 "packuswb %%xmm1,%%xmm0 \n"
3209 "movdqa %%xmm0,%%xmm1 \n"
3210 "pand %%xmm5,%%xmm0 \n"
3211 "packuswb %%xmm0,%%xmm0 \n"
3212 "psrlw $0x8,%%xmm1 \n"
3213 "packuswb %%xmm1,%%xmm1 \n"
3214 "movq %%xmm0,(%1) \n"
3215 "movq %%xmm1,(%1,%2) \n"
3216 "lea 0x8(%1),%1 \n"
3217 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003218 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003219 : "+r"(src_yuy2), // %0
3220 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003221 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003222 "+r"(pix) // %3
3223 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
3224 : "memory", "cc"
3225#if defined(__SSE2__)
3226 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3227#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00003228 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003229}
3230
fbarchard@google.comc704f782012-08-30 19:53:48 +00003231void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
3232 uint8* dst_u, uint8* dst_v, int pix) {
3233 asm volatile (
3234 "pcmpeqb %%xmm5,%%xmm5 \n"
3235 "psrlw $0x8,%%xmm5 \n"
3236 "sub %1,%2 \n"
3237 ".p2align 4 \n"
3238 "1: \n"
3239 "movdqu (%0),%%xmm0 \n"
3240 "movdqu 0x10(%0),%%xmm1 \n"
3241 "lea 0x20(%0),%0 \n"
3242 "psrlw $0x8,%%xmm0 \n"
3243 "psrlw $0x8,%%xmm1 \n"
3244 "packuswb %%xmm1,%%xmm0 \n"
3245 "movdqa %%xmm0,%%xmm1 \n"
3246 "pand %%xmm5,%%xmm0 \n"
3247 "packuswb %%xmm0,%%xmm0 \n"
3248 "psrlw $0x8,%%xmm1 \n"
3249 "packuswb %%xmm1,%%xmm1 \n"
3250 "movq %%xmm0,(%1) \n"
3251 "movq %%xmm1,(%1,%2) \n"
3252 "lea 0x8(%1),%1 \n"
3253 "sub $0x10,%3 \n"
3254 "jg 1b \n"
3255 : "+r"(src_yuy2), // %0
3256 "+r"(dst_u), // %1
3257 "+r"(dst_v), // %2
3258 "+r"(pix) // %3
3259 :
3260 : "memory", "cc"
3261#if defined(__SSE2__)
3262 , "xmm0", "xmm1", "xmm5"
3263#endif
3264 );
3265}
3266
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003267void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003268 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003269 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003270 "1: \n"
3271 "movdqa (%0),%%xmm0 \n"
3272 "movdqa 0x10(%0),%%xmm1 \n"
3273 "lea 0x20(%0),%0 \n"
3274 "psrlw $0x8,%%xmm0 \n"
3275 "psrlw $0x8,%%xmm1 \n"
3276 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003277 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003278 "movdqa %%xmm0,(%1) \n"
3279 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003280 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003281 : "+r"(src_uyvy), // %0
3282 "+r"(dst_y), // %1
3283 "+r"(pix) // %2
3284 :
3285 : "memory", "cc"
3286#if defined(__SSE2__)
3287 , "xmm0", "xmm1"
3288#endif
3289 );
3290}
3291
3292void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003293 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003294 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003295 "pcmpeqb %%xmm5,%%xmm5 \n"
3296 "psrlw $0x8,%%xmm5 \n"
3297 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003298 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003299 "1: \n"
3300 "movdqa (%0),%%xmm0 \n"
3301 "movdqa 0x10(%0),%%xmm1 \n"
3302 "movdqa (%0,%4,1),%%xmm2 \n"
3303 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
3304 "lea 0x20(%0),%0 \n"
3305 "pavgb %%xmm2,%%xmm0 \n"
3306 "pavgb %%xmm3,%%xmm1 \n"
3307 "pand %%xmm5,%%xmm0 \n"
3308 "pand %%xmm5,%%xmm1 \n"
3309 "packuswb %%xmm1,%%xmm0 \n"
3310 "movdqa %%xmm0,%%xmm1 \n"
3311 "pand %%xmm5,%%xmm0 \n"
3312 "packuswb %%xmm0,%%xmm0 \n"
3313 "psrlw $0x8,%%xmm1 \n"
3314 "packuswb %%xmm1,%%xmm1 \n"
3315 "movq %%xmm0,(%1) \n"
3316 "movq %%xmm1,(%1,%2) \n"
3317 "lea 0x8(%1),%1 \n"
3318 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003319 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003320 : "+r"(src_uyvy), // %0
3321 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003322 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003323 "+r"(pix) // %3
3324 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3325 : "memory", "cc"
3326#if defined(__SSE2__)
3327 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3328#endif
3329 );
3330}
3331
fbarchard@google.comc704f782012-08-30 19:53:48 +00003332void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3333 uint8* dst_u, uint8* dst_v, int pix) {
3334 asm volatile (
3335 "pcmpeqb %%xmm5,%%xmm5 \n"
3336 "psrlw $0x8,%%xmm5 \n"
3337 "sub %1,%2 \n"
3338 ".p2align 4 \n"
3339 "1: \n"
3340 "movdqa (%0),%%xmm0 \n"
3341 "movdqa 0x10(%0),%%xmm1 \n"
3342 "lea 0x20(%0),%0 \n"
3343 "pand %%xmm5,%%xmm0 \n"
3344 "pand %%xmm5,%%xmm1 \n"
3345 "packuswb %%xmm1,%%xmm0 \n"
3346 "movdqa %%xmm0,%%xmm1 \n"
3347 "pand %%xmm5,%%xmm0 \n"
3348 "packuswb %%xmm0,%%xmm0 \n"
3349 "psrlw $0x8,%%xmm1 \n"
3350 "packuswb %%xmm1,%%xmm1 \n"
3351 "movq %%xmm0,(%1) \n"
3352 "movq %%xmm1,(%1,%2) \n"
3353 "lea 0x8(%1),%1 \n"
3354 "sub $0x10,%3 \n"
3355 "jg 1b \n"
3356 : "+r"(src_uyvy), // %0
3357 "+r"(dst_u), // %1
3358 "+r"(dst_v), // %2
3359 "+r"(pix) // %3
3360 :
3361 : "memory", "cc"
3362#if defined(__SSE2__)
3363 , "xmm0", "xmm1", "xmm5"
3364#endif
3365 );
3366}
3367
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003368void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3369 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003370 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003371 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003372 "1: \n"
3373 "movdqu (%0),%%xmm0 \n"
3374 "movdqu 0x10(%0),%%xmm1 \n"
3375 "lea 0x20(%0),%0 \n"
3376 "psrlw $0x8,%%xmm0 \n"
3377 "psrlw $0x8,%%xmm1 \n"
3378 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003379 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003380 "movdqu %%xmm0,(%1) \n"
3381 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003382 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003383 : "+r"(src_uyvy), // %0
3384 "+r"(dst_y), // %1
3385 "+r"(pix) // %2
3386 :
3387 : "memory", "cc"
3388#if defined(__SSE2__)
3389 , "xmm0", "xmm1"
3390#endif
3391 );
3392}
3393
3394void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003395 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003396 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003397 "pcmpeqb %%xmm5,%%xmm5 \n"
3398 "psrlw $0x8,%%xmm5 \n"
3399 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003400 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003401 "1: \n"
3402 "movdqu (%0),%%xmm0 \n"
3403 "movdqu 0x10(%0),%%xmm1 \n"
3404 "movdqu (%0,%4,1),%%xmm2 \n"
3405 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3406 "lea 0x20(%0),%0 \n"
3407 "pavgb %%xmm2,%%xmm0 \n"
3408 "pavgb %%xmm3,%%xmm1 \n"
3409 "pand %%xmm5,%%xmm0 \n"
3410 "pand %%xmm5,%%xmm1 \n"
3411 "packuswb %%xmm1,%%xmm0 \n"
3412 "movdqa %%xmm0,%%xmm1 \n"
3413 "pand %%xmm5,%%xmm0 \n"
3414 "packuswb %%xmm0,%%xmm0 \n"
3415 "psrlw $0x8,%%xmm1 \n"
3416 "packuswb %%xmm1,%%xmm1 \n"
3417 "movq %%xmm0,(%1) \n"
3418 "movq %%xmm1,(%1,%2) \n"
3419 "lea 0x8(%1),%1 \n"
3420 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003421 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003422 : "+r"(src_uyvy), // %0
3423 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003424 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003425 "+r"(pix) // %3
3426 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3427 : "memory", "cc"
3428#if defined(__SSE2__)
3429 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3430#endif
3431 );
3432}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003433
3434void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3435 uint8* dst_u, uint8* dst_v, int pix) {
3436 asm volatile (
3437 "pcmpeqb %%xmm5,%%xmm5 \n"
3438 "psrlw $0x8,%%xmm5 \n"
3439 "sub %1,%2 \n"
3440 ".p2align 4 \n"
3441 "1: \n"
3442 "movdqu (%0),%%xmm0 \n"
3443 "movdqu 0x10(%0),%%xmm1 \n"
3444 "lea 0x20(%0),%0 \n"
3445 "pand %%xmm5,%%xmm0 \n"
3446 "pand %%xmm5,%%xmm1 \n"
3447 "packuswb %%xmm1,%%xmm0 \n"
3448 "movdqa %%xmm0,%%xmm1 \n"
3449 "pand %%xmm5,%%xmm0 \n"
3450 "packuswb %%xmm0,%%xmm0 \n"
3451 "psrlw $0x8,%%xmm1 \n"
3452 "packuswb %%xmm1,%%xmm1 \n"
3453 "movq %%xmm0,(%1) \n"
3454 "movq %%xmm1,(%1,%2) \n"
3455 "lea 0x8(%1),%1 \n"
3456 "sub $0x10,%3 \n"
3457 "jg 1b \n"
3458 : "+r"(src_uyvy), // %0
3459 "+r"(dst_u), // %1
3460 "+r"(dst_v), // %2
3461 "+r"(pix) // %3
3462 :
3463 : "memory", "cc"
3464#if defined(__SSE2__)
3465 , "xmm0", "xmm1", "xmm5"
3466#endif
3467 );
3468}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003469#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003470
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003471#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003472// Blend 8 pixels at a time.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003473void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3474 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003475 asm volatile (
3476 "pcmpeqb %%xmm7,%%xmm7 \n"
3477 "psrlw $0xf,%%xmm7 \n"
3478 "pcmpeqb %%xmm6,%%xmm6 \n"
3479 "psrlw $0x8,%%xmm6 \n"
3480 "pcmpeqb %%xmm5,%%xmm5 \n"
3481 "psllw $0x8,%%xmm5 \n"
3482 "pcmpeqb %%xmm4,%%xmm4 \n"
3483 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003484 "sub $0x1,%3 \n"
3485 "je 91f \n"
3486 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003487
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003488 // 1 pixel loop until destination pointer is aligned.
3489 "10: \n"
3490 "test $0xf,%2 \n"
3491 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003492 "movd (%0),%%xmm3 \n"
3493 "lea 0x4(%0),%0 \n"
3494 "movdqa %%xmm3,%%xmm0 \n"
3495 "pxor %%xmm4,%%xmm3 \n"
3496 "movd (%1),%%xmm2 \n"
3497 "psrlw $0x8,%%xmm3 \n"
3498 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3499 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3500 "pand %%xmm6,%%xmm2 \n"
3501 "paddw %%xmm7,%%xmm3 \n"
3502 "pmullw %%xmm3,%%xmm2 \n"
3503 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003504 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003505 "psrlw $0x8,%%xmm1 \n"
3506 "por %%xmm4,%%xmm0 \n"
3507 "pmullw %%xmm3,%%xmm1 \n"
3508 "psrlw $0x8,%%xmm2 \n"
3509 "paddusb %%xmm2,%%xmm0 \n"
3510 "pand %%xmm5,%%xmm1 \n"
3511 "paddusb %%xmm1,%%xmm0 \n"
3512 "sub $0x1,%3 \n"
3513 "movd %%xmm0,(%2) \n"
3514 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003515 "jge 10b \n"
3516
3517 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00003518 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003519 "jl 49f \n"
3520
fbarchard@google.com794fe122012-06-15 01:05:01 +00003521 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003522 ".p2align 2 \n"
3523 "41: \n"
3524 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003525 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003526 "movdqa %%xmm3,%%xmm0 \n"
3527 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003528 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003529 "psrlw $0x8,%%xmm3 \n"
3530 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3531 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003532 "pand %%xmm6,%%xmm2 \n"
3533 "paddw %%xmm7,%%xmm3 \n"
3534 "pmullw %%xmm3,%%xmm2 \n"
3535 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003536 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003537 "psrlw $0x8,%%xmm1 \n"
3538 "por %%xmm4,%%xmm0 \n"
3539 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003540 "psrlw $0x8,%%xmm2 \n"
3541 "paddusb %%xmm2,%%xmm0 \n"
3542 "pand %%xmm5,%%xmm1 \n"
3543 "paddusb %%xmm1,%%xmm0 \n"
3544 "sub $0x4,%3 \n"
3545 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003546 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003547 "jge 41b \n"
3548
3549 "49: \n"
3550 "add $0x3,%3 \n"
3551 "jl 99f \n"
3552
fbarchard@google.com794fe122012-06-15 01:05:01 +00003553 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003554 "91: \n"
3555 "movd (%0),%%xmm3 \n"
3556 "lea 0x4(%0),%0 \n"
3557 "movdqa %%xmm3,%%xmm0 \n"
3558 "pxor %%xmm4,%%xmm3 \n"
3559 "movd (%1),%%xmm2 \n"
3560 "psrlw $0x8,%%xmm3 \n"
3561 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3562 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3563 "pand %%xmm6,%%xmm2 \n"
3564 "paddw %%xmm7,%%xmm3 \n"
3565 "pmullw %%xmm3,%%xmm2 \n"
3566 "movd (%1),%%xmm1 \n"
3567 "lea 0x4(%1),%1 \n"
3568 "psrlw $0x8,%%xmm1 \n"
3569 "por %%xmm4,%%xmm0 \n"
3570 "pmullw %%xmm3,%%xmm1 \n"
3571 "psrlw $0x8,%%xmm2 \n"
3572 "paddusb %%xmm2,%%xmm0 \n"
3573 "pand %%xmm5,%%xmm1 \n"
3574 "paddusb %%xmm1,%%xmm0 \n"
3575 "sub $0x1,%3 \n"
3576 "movd %%xmm0,(%2) \n"
3577 "lea 0x4(%2),%2 \n"
3578 "jge 91b \n"
3579 "99: \n"
3580 : "+r"(src_argb0), // %0
3581 "+r"(src_argb1), // %1
3582 "+r"(dst_argb), // %2
3583 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003584 :
3585 : "memory", "cc"
3586#if defined(__SSE2__)
3587 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3588#endif
3589 );
3590}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003591#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003592
fbarchard@google.com96af8702012-04-06 18:22:27 +00003593#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003594// Shuffle table for isolating alpha.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003595CONST uvec8 kShuffleAlpha = {
3596 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3597 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3598};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003599
3600// Blend 8 pixels at a time
3601// Shuffle table for reversing the bytes.
3602
3603// Same as SSE2, but replaces
3604// psrlw xmm3, 8 // alpha
3605// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3606// pshuflw xmm3, xmm3,0F5h
3607// with..
3608// pshufb xmm3, kShuffleAlpha // alpha
3609
3610void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3611 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003612 asm volatile (
3613 "pcmpeqb %%xmm7,%%xmm7 \n"
3614 "psrlw $0xf,%%xmm7 \n"
3615 "pcmpeqb %%xmm6,%%xmm6 \n"
3616 "psrlw $0x8,%%xmm6 \n"
3617 "pcmpeqb %%xmm5,%%xmm5 \n"
3618 "psllw $0x8,%%xmm5 \n"
3619 "pcmpeqb %%xmm4,%%xmm4 \n"
3620 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003621 "sub $0x1,%3 \n"
3622 "je 91f \n"
3623 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003624
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003625 // 1 pixel loop until destination pointer is aligned.
3626 "10: \n"
3627 "test $0xf,%2 \n"
3628 "je 19f \n"
3629 "movd (%0),%%xmm3 \n"
3630 "lea 0x4(%0),%0 \n"
3631 "movdqa %%xmm3,%%xmm0 \n"
3632 "pxor %%xmm4,%%xmm3 \n"
3633 "movd (%1),%%xmm2 \n"
3634 "pshufb %4,%%xmm3 \n"
3635 "pand %%xmm6,%%xmm2 \n"
3636 "paddw %%xmm7,%%xmm3 \n"
3637 "pmullw %%xmm3,%%xmm2 \n"
3638 "movd (%1),%%xmm1 \n"
3639 "lea 0x4(%1),%1 \n"
3640 "psrlw $0x8,%%xmm1 \n"
3641 "por %%xmm4,%%xmm0 \n"
3642 "pmullw %%xmm3,%%xmm1 \n"
3643 "psrlw $0x8,%%xmm2 \n"
3644 "paddusb %%xmm2,%%xmm0 \n"
3645 "pand %%xmm5,%%xmm1 \n"
3646 "paddusb %%xmm1,%%xmm0 \n"
3647 "sub $0x1,%3 \n"
3648 "movd %%xmm0,(%2) \n"
3649 "lea 0x4(%2),%2 \n"
3650 "jge 10b \n"
3651
3652 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00003653 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003654 "jl 49f \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003655 "test $0xf,%0 \n"
3656 "jne 41f \n"
3657 "test $0xf,%1 \n"
3658 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003659
fbarchard@google.com794fe122012-06-15 01:05:01 +00003660 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003661 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003662 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003663 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003664 "lea 0x10(%0),%0 \n"
3665 "movdqa %%xmm3,%%xmm0 \n"
3666 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003667 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003668 "pshufb %4,%%xmm3 \n"
3669 "pand %%xmm6,%%xmm2 \n"
3670 "paddw %%xmm7,%%xmm3 \n"
3671 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003672 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003673 "lea 0x10(%1),%1 \n"
3674 "psrlw $0x8,%%xmm1 \n"
3675 "por %%xmm4,%%xmm0 \n"
3676 "pmullw %%xmm3,%%xmm1 \n"
3677 "psrlw $0x8,%%xmm2 \n"
3678 "paddusb %%xmm2,%%xmm0 \n"
3679 "pand %%xmm5,%%xmm1 \n"
3680 "paddusb %%xmm1,%%xmm0 \n"
3681 "sub $0x4,%3 \n"
3682 "movdqa %%xmm0,(%2) \n"
3683 "lea 0x10(%2),%2 \n"
3684 "jge 40b \n"
3685 "jmp 49f \n"
3686
3687 // 4 pixel unaligned loop.
3688 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003689 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003690 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003691 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003692 "movdqa %%xmm3,%%xmm0 \n"
3693 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003694 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003695 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003696 "pand %%xmm6,%%xmm2 \n"
3697 "paddw %%xmm7,%%xmm3 \n"
3698 "pmullw %%xmm3,%%xmm2 \n"
3699 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003700 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003701 "psrlw $0x8,%%xmm1 \n"
3702 "por %%xmm4,%%xmm0 \n"
3703 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003704 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003705 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003706 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003707 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003708 "sub $0x4,%3 \n"
3709 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003710 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003711 "jge 41b \n"
3712
3713 "49: \n"
3714 "add $0x3,%3 \n"
3715 "jl 99f \n"
3716
fbarchard@google.com794fe122012-06-15 01:05:01 +00003717 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003718 "91: \n"
3719 "movd (%0),%%xmm3 \n"
3720 "lea 0x4(%0),%0 \n"
3721 "movdqa %%xmm3,%%xmm0 \n"
3722 "pxor %%xmm4,%%xmm3 \n"
3723 "movd (%1),%%xmm2 \n"
3724 "pshufb %4,%%xmm3 \n"
3725 "pand %%xmm6,%%xmm2 \n"
3726 "paddw %%xmm7,%%xmm3 \n"
3727 "pmullw %%xmm3,%%xmm2 \n"
3728 "movd (%1),%%xmm1 \n"
3729 "lea 0x4(%1),%1 \n"
3730 "psrlw $0x8,%%xmm1 \n"
3731 "por %%xmm4,%%xmm0 \n"
3732 "pmullw %%xmm3,%%xmm1 \n"
3733 "psrlw $0x8,%%xmm2 \n"
3734 "paddusb %%xmm2,%%xmm0 \n"
3735 "pand %%xmm5,%%xmm1 \n"
3736 "paddusb %%xmm1,%%xmm0 \n"
3737 "sub $0x1,%3 \n"
3738 "movd %%xmm0,(%2) \n"
3739 "lea 0x4(%2),%2 \n"
3740 "jge 91b \n"
3741 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003742 : "+r"(src_argb0), // %0
3743 "+r"(src_argb1), // %1
3744 "+r"(dst_argb), // %2
3745 "+r"(width) // %3
3746 : "m"(kShuffleAlpha) // %4
3747 : "memory", "cc"
3748#if defined(__SSE2__)
3749 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3750#endif
3751 );
3752}
3753#endif // HAS_ARGBBLENDROW_SSSE3
3754
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003755#ifdef HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003756// Attenuate 4 pixels at a time.
3757// aligned to 16 bytes
3758void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3759 asm volatile (
3760 "sub %0,%1 \n"
3761 "pcmpeqb %%xmm4,%%xmm4 \n"
3762 "pslld $0x18,%%xmm4 \n"
3763 "pcmpeqb %%xmm5,%%xmm5 \n"
3764 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003765
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003766 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003767 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003768 "1: \n"
3769 "movdqa (%0),%%xmm0 \n"
3770 "punpcklbw %%xmm0,%%xmm0 \n"
3771 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3772 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3773 "pmulhuw %%xmm2,%%xmm0 \n"
3774 "movdqa (%0),%%xmm1 \n"
3775 "punpckhbw %%xmm1,%%xmm1 \n"
3776 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3777 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3778 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003779 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003780 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003781 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003782 "psrlw $0x8,%%xmm1 \n"
3783 "packuswb %%xmm1,%%xmm0 \n"
3784 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003785 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003786 "sub $0x4,%2 \n"
3787 "movdqa %%xmm0,(%0,%1,1) \n"
3788 "lea 0x10(%0),%0 \n"
3789 "jg 1b \n"
3790 : "+r"(src_argb), // %0
3791 "+r"(dst_argb), // %1
3792 "+r"(width) // %2
3793 :
3794 : "memory", "cc"
3795#if defined(__SSE2__)
3796 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3797#endif
3798 );
3799}
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003800#endif // HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003801
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003802#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003803// Shuffle table duplicating alpha
3804CONST uvec8 kShuffleAlpha0 = {
3805 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3806};
3807CONST uvec8 kShuffleAlpha1 = {
3808 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3809 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3810};
3811// Attenuate 4 pixels at a time.
3812// aligned to 16 bytes
3813void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3814 asm volatile (
3815 "sub %0,%1 \n"
3816 "pcmpeqb %%xmm3,%%xmm3 \n"
3817 "pslld $0x18,%%xmm3 \n"
3818 "movdqa %3,%%xmm4 \n"
3819 "movdqa %4,%%xmm5 \n"
3820
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003821 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003822 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003823 "1: \n"
3824 "movdqa (%0),%%xmm0 \n"
3825 "pshufb %%xmm4,%%xmm0 \n"
3826 "movdqa (%0),%%xmm1 \n"
3827 "punpcklbw %%xmm1,%%xmm1 \n"
3828 "pmulhuw %%xmm1,%%xmm0 \n"
3829 "movdqa (%0),%%xmm1 \n"
3830 "pshufb %%xmm5,%%xmm1 \n"
3831 "movdqa (%0),%%xmm2 \n"
3832 "punpckhbw %%xmm2,%%xmm2 \n"
3833 "pmulhuw %%xmm2,%%xmm1 \n"
3834 "movdqa (%0),%%xmm2 \n"
3835 "pand %%xmm3,%%xmm2 \n"
3836 "psrlw $0x8,%%xmm0 \n"
3837 "psrlw $0x8,%%xmm1 \n"
3838 "packuswb %%xmm1,%%xmm0 \n"
3839 "por %%xmm2,%%xmm0 \n"
3840 "sub $0x4,%2 \n"
3841 "movdqa %%xmm0,(%0,%1,1) \n"
3842 "lea 0x10(%0),%0 \n"
3843 "jg 1b \n"
3844 : "+r"(src_argb), // %0
3845 "+r"(dst_argb), // %1
3846 "+r"(width) // %2
3847 : "m"(kShuffleAlpha0), // %3
3848 "m"(kShuffleAlpha1) // %4
3849 : "memory", "cc"
3850#if defined(__SSE2__)
3851 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3852#endif
3853 );
3854}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003855#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003856
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003857#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003858// Unattenuate 4 pixels at a time.
3859// aligned to 16 bytes
3860void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3861 int width) {
3862 uintptr_t alpha = 0;
3863 asm volatile (
3864 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003865
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003866 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003867 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003868 "1: \n"
3869 "movdqa (%0),%%xmm0 \n"
3870 "movzb 0x3(%0),%3 \n"
3871 "punpcklbw %%xmm0,%%xmm0 \n"
3872 "movd 0x0(%4,%3,4),%%xmm2 \n"
3873 "movzb 0x7(%0),%3 \n"
3874 "movd 0x0(%4,%3,4),%%xmm3 \n"
fbarchard@google.com787f8272013-02-21 00:08:56 +00003875 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3876 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003877 "movlhps %%xmm3,%%xmm2 \n"
3878 "pmulhuw %%xmm2,%%xmm0 \n"
3879 "movdqa (%0),%%xmm1 \n"
3880 "movzb 0xb(%0),%3 \n"
3881 "punpckhbw %%xmm1,%%xmm1 \n"
3882 "movd 0x0(%4,%3,4),%%xmm2 \n"
3883 "movzb 0xf(%0),%3 \n"
3884 "movd 0x0(%4,%3,4),%%xmm3 \n"
fbarchard@google.com787f8272013-02-21 00:08:56 +00003885 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3886 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003887 "movlhps %%xmm3,%%xmm2 \n"
3888 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003889 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003890 "sub $0x4,%2 \n"
3891 "movdqa %%xmm0,(%0,%1,1) \n"
3892 "lea 0x10(%0),%0 \n"
3893 "jg 1b \n"
3894 : "+r"(src_argb), // %0
3895 "+r"(dst_argb), // %1
3896 "+r"(width), // %2
3897 "+r"(alpha) // %3
3898 : "r"(fixed_invtbl8) // %4
3899 : "memory", "cc"
3900#if defined(__SSE2__)
3901 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3902#endif
3903 );
3904}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003905#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003906
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003907#ifdef HAS_ARGBGRAYROW_SSSE3
3908// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003909void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003910 asm volatile (
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003911 "movdqa %3,%%xmm4 \n"
fbarchard@google.com050b39a2013-04-01 20:07:14 +00003912 "movdqa %4,%%xmm5 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003913 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003914
3915 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003916 ".p2align 4 \n"
3917 "1: \n"
3918 "movdqa (%0),%%xmm0 \n"
3919 "movdqa 0x10(%0),%%xmm1 \n"
3920 "pmaddubsw %%xmm4,%%xmm0 \n"
3921 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003922 "phaddw %%xmm1,%%xmm0 \n"
fbarchard@google.com050b39a2013-04-01 20:07:14 +00003923 "paddw %%xmm5,%%xmm0 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003924 "psrlw $0x7,%%xmm0 \n"
3925 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003926 "movdqa (%0),%%xmm2 \n"
3927 "movdqa 0x10(%0),%%xmm3 \n"
3928 "psrld $0x18,%%xmm2 \n"
3929 "psrld $0x18,%%xmm3 \n"
3930 "packuswb %%xmm3,%%xmm2 \n"
3931 "packuswb %%xmm2,%%xmm2 \n"
3932 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003933 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003934 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003935 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003936 "punpcklwd %%xmm3,%%xmm0 \n"
3937 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003938 "sub $0x8,%2 \n"
3939 "movdqa %%xmm0,(%0,%1,1) \n"
3940 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003941 "lea 0x20(%0),%0 \n"
3942 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003943 : "+r"(src_argb), // %0
3944 "+r"(dst_argb), // %1
3945 "+r"(width) // %2
fbarchard@google.com050b39a2013-04-01 20:07:14 +00003946 : "m"(kARGBToYJ), // %3
3947 "m"(kAddYJ64) // %4
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003948 : "memory", "cc"
3949#if defined(__SSE2__)
fbarchard@google.com050b39a2013-04-01 20:07:14 +00003950 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003951#endif
3952 );
3953}
3954#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003955
3956#ifdef HAS_ARGBSEPIAROW_SSSE3
3957// b = (r * 35 + g * 68 + b * 17) >> 7
3958// g = (r * 45 + g * 88 + b * 22) >> 7
3959// r = (r * 50 + g * 98 + b * 24) >> 7
3960// Constant for ARGB color to sepia tone
3961CONST vec8 kARGBToSepiaB = {
3962 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3963};
3964
3965CONST vec8 kARGBToSepiaG = {
3966 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3967};
3968
3969CONST vec8 kARGBToSepiaR = {
3970 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3971};
3972
fbarchard@google.come442dc42012-06-18 17:37:09 +00003973// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003974void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3975 asm volatile (
3976 "movdqa %2,%%xmm2 \n"
3977 "movdqa %3,%%xmm3 \n"
3978 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003979
3980 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003981 ".p2align 4 \n"
3982 "1: \n"
3983 "movdqa (%0),%%xmm0 \n"
3984 "movdqa 0x10(%0),%%xmm6 \n"
3985 "pmaddubsw %%xmm2,%%xmm0 \n"
3986 "pmaddubsw %%xmm2,%%xmm6 \n"
3987 "phaddw %%xmm6,%%xmm0 \n"
3988 "psrlw $0x7,%%xmm0 \n"
3989 "packuswb %%xmm0,%%xmm0 \n"
3990 "movdqa (%0),%%xmm5 \n"
3991 "movdqa 0x10(%0),%%xmm1 \n"
3992 "pmaddubsw %%xmm3,%%xmm5 \n"
3993 "pmaddubsw %%xmm3,%%xmm1 \n"
3994 "phaddw %%xmm1,%%xmm5 \n"
3995 "psrlw $0x7,%%xmm5 \n"
3996 "packuswb %%xmm5,%%xmm5 \n"
3997 "punpcklbw %%xmm5,%%xmm0 \n"
3998 "movdqa (%0),%%xmm5 \n"
3999 "movdqa 0x10(%0),%%xmm1 \n"
4000 "pmaddubsw %%xmm4,%%xmm5 \n"
4001 "pmaddubsw %%xmm4,%%xmm1 \n"
4002 "phaddw %%xmm1,%%xmm5 \n"
4003 "psrlw $0x7,%%xmm5 \n"
4004 "packuswb %%xmm5,%%xmm5 \n"
4005 "movdqa (%0),%%xmm6 \n"
4006 "movdqa 0x10(%0),%%xmm1 \n"
4007 "psrld $0x18,%%xmm6 \n"
4008 "psrld $0x18,%%xmm1 \n"
4009 "packuswb %%xmm1,%%xmm6 \n"
4010 "packuswb %%xmm6,%%xmm6 \n"
4011 "punpcklbw %%xmm6,%%xmm5 \n"
4012 "movdqa %%xmm0,%%xmm1 \n"
4013 "punpcklwd %%xmm5,%%xmm0 \n"
4014 "punpckhwd %%xmm5,%%xmm1 \n"
4015 "sub $0x8,%1 \n"
4016 "movdqa %%xmm0,(%0) \n"
4017 "movdqa %%xmm1,0x10(%0) \n"
4018 "lea 0x20(%0),%0 \n"
4019 "jg 1b \n"
4020 : "+r"(dst_argb), // %0
4021 "+r"(width) // %1
4022 : "m"(kARGBToSepiaB), // %2
4023 "m"(kARGBToSepiaG), // %3
4024 "m"(kARGBToSepiaR) // %4
4025 : "memory", "cc"
4026#if defined(__SSE2__)
4027 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4028#endif
4029 );
4030}
4031#endif // HAS_ARGBSEPIAROW_SSSE3
4032
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) per loop pass with a color matrix.
// Same as Sepia except the matrix is provided by the caller.
//
//   dst_argb    - pixels, transformed in place. Accessed with movdqa, so the
//                 pointer must be 16 byte aligned.
//   matrix_argb - three rows of 4 signed byte coefficients at byte offsets
//                 0, 4 and 8. Row k produces byte k of every output pixel:
//                 the 4 unsigned bytes of the pixel are multiplied by the 4
//                 coefficients, summed with saturation and shifted right by 7.
//   width       - number of pixels. The count is tested after each pass, so
//                 at least 8 pixels are always processed.
//
// Byte 3 (alpha) of every pixel is copied from the input unchanged.
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  asm volatile (
    // Broadcast each 4 byte coefficient row to all lanes of xmm2/xmm3/xmm4.
    "movd (%2),%%xmm2 \n"
    "movd 0x4(%2),%%xmm3 \n"
    "movd 0x8(%2),%%xmm4 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"

    // 8 pixel loop.
    ".p2align 4 \n"
    "1: \n"
    // Output byte 0 (row 0 coefficients) accumulates in xmm0.
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm6 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm6 \n"
    // Output byte 1 (row 1 coefficients) accumulates in xmm5.
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm5 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    // Horizontal add collapses the two partial sums of each pixel.
    "phaddsw %%xmm6,%%xmm0 \n"
    "phaddsw %%xmm1,%%xmm5 \n"
    "psraw $0x7,%%xmm0 \n"
    "psraw $0x7,%%xmm5 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    // xmm0 = 8 words holding (byte 0, byte 1) of each pixel.
    "punpcklbw %%xmm5,%%xmm0 \n"
    // Output byte 2 (row 2 coefficients) accumulates in xmm5.
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm5 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddsw %%xmm1,%%xmm5 \n"
    "psraw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    // Extract the original alpha bytes into xmm6.
    "movdqa (%0),%%xmm6 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "psrld $0x18,%%xmm6 \n"
    "psrld $0x18,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    // xmm5 = 8 words holding (byte 2, alpha); weave with xmm0 into pixels.
    "punpcklbw %%xmm6,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "punpckhwd %%xmm5,%%xmm1 \n"
    // The flags set by sub survive the stores and lea and drive the jg.
    "sub $0x8,%1 \n"
    "movdqa %%xmm0,(%0) \n"
    "movdqa %%xmm1,0x10(%0) \n"
    "lea 0x20(%0),%0 \n"
    "jg 1b \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "r"(matrix_argb)     // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4096
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes) per loop pass, in place.
// dst_argb must be aligned to 16 bytes (movdqa is used).
//
// For bytes 0..2 of each pixel the low 16 bits of the arguments are used:
//   v = ((v * scale) >> 16) * interval_size + interval_offset
// saturated to 0..255. The byte 3 (alpha) lane is computed from the upper
// 16 bits of each argument and then ORed with the original alpha, so alpha
// is preserved only when all three arguments fit in 16 bits.
//
// width is the pixel count; it is tested after each pass, so at least 4
// pixels are always processed.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "movd %2,%%xmm2 \n"
    "movd %3,%%xmm3 \n"
    "movd %4,%%xmm4 \n"
    // pshuflw 0x40 puts the low word in lanes 0..2 and the high word in
    // lane 3; pshufd 0x44 then repeats that pattern for the second pixel.
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshufd $0x44,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "pshufd $0x44,%%xmm3,%%xmm3 \n"
    "pshuflw $0x40,%%xmm4,%%xmm4 \n"
    "pshufd $0x44,%%xmm4,%%xmm4 \n"
    // xmm5 = 0 for byte to word expansion, xmm6 = 0xff000000 alpha mask.
    "pxor %%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "pslld $0x18,%%xmm6 \n"

    // 4 pixel loop.
    ".p2align 2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqa (%0),%%xmm1 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "pmullw %%xmm3,%%xmm0 \n"
    // xmm7 keeps the original alpha bytes.
    "movdqa (%0),%%xmm7 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "pand %%xmm6,%%xmm7 \n"
    "paddw %%xmm4,%%xmm0 \n"
    "paddw %%xmm4,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "sub $0x4,%1 \n"
    "movdqa %%xmm0,(%0) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
4149
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// src_argb and dst_argb must be aligned to 16 bytes (movdqa is used).
//
// value holds one scale byte per pixel byte. Both the pixel byte p and the
// matching value byte v are widened to 16 bits by byte duplication
// (x * 0x101); the output byte is ((p * 0x101) * (v * 0x101)) >> 24.
//
// width is the pixel count; it is tested after each pass, so at least 4
// pixels are always processed.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd %3,%%xmm2 \n"
    // %1 becomes (dst - src) so a single advancing pointer addresses both.
    "sub %0,%1 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm2 \n"

    // 4 pixel loop.
    ".p2align 2 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2
4188
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// All three pointers must be aligned to 16 bytes (movdqa is used).
//
// Each src_argb0 byte a is widened by duplication (a * 0x101), each
// src_argb1 byte b is zero extended, and the output byte is
// ((a * 0x101) * b) >> 16.
//
// width is the pixel count; it is tested after each pass, so at least 4
// pixels are always processed.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    // %1 and %2 become offsets relative to src_argb0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpckhbw %%xmm5,%%xmm3 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
4229
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// All three pointers must be aligned to 16 bytes (movdqa is used).
//
// Every byte is added with unsigned saturation (paddusb), so sums above 255
// clamp to 255.
//
// width is the pixel count; it is tested after each pass, so at least 4
// pixels are always processed.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // %1 and %2 become offsets relative to src_argb0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1),%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBADDROW_SSE2
4261
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// All three pointers must be aligned to 16 bytes (movdqa is used).
//
// Computes src_argb0 - src_argb1 per byte with unsigned saturation
// (psubusb), so negative differences clamp to 0.
//
// width is the pixel count; it is tested after each pass, so at least 4
// pixels are always processed.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // %1 and %2 become offsets relative to src_argb0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1),%%xmm1 \n"
    "psubusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
4293
#ifdef HAS_SOBELXROW_SSSE3
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
//
// For each of 8 pixels per pass this computes, with d(r) = r[x] - r[x + 2]:
//   dst_sobelx[x] = saturate(abs(d(src_y0) + 2 * d(src_y1) + d(src_y2)))
// Loads are 8 byte movq at offsets 0 and 2, so each pass reads 2 bytes
// beyond the 8 pixels it produces. No alignment is required.
//
// width is the pixel count; it is tested after each pass, so at least 8
// pixels are always processed.
void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    // %1, %2 and %3 become offsets relative to src_y0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "sub %0,%3 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    ".p2align 4 \n"
    "1: \n"
    // xmm0 = d(src_y0)
    "movq (%0),%%xmm0 \n"
    "movq 0x2(%0),%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    // xmm1 = d(src_y1)
    "movq (%0,%1,1),%%xmm1 \n"
    "movq 0x2(%0,%1,1),%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    // xmm2 = d(src_y2)
    "movq (%0,%2,1),%%xmm2 \n"
    "movq 0x2(%0,%2,1),%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    // Middle row is added twice for its weight of 2.
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pabsw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "sub $0x8,%4 \n"
    "movq %%xmm0,(%0,%3,1) \n"
    "lea 0x8(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELXROW_SSSE3
4347
#ifdef HAS_SOBELYROW_SSSE3
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
//
// For each of 8 pixels per pass this computes, with
// d(k) = src_y0[x + k] - src_y1[x + k]:
//   dst_sobely[x] = saturate(abs(d(0) + 2 * d(1) + d(2)))
// Loads are 8 byte movq at offsets 0, 1 and 2, so each pass reads 2 bytes
// beyond the 8 pixels it produces. No alignment is required.
//
// width is the pixel count; it is tested after each pass, so at least 8
// pixels are always processed.
void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
                     uint8* dst_sobely, int width) {
  asm volatile (
    // %1 and %2 become offsets relative to src_y0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    ".p2align 4 \n"
    "1: \n"
    // xmm0 = d(0)
    "movq (%0),%%xmm0 \n"
    "movq (%0,%1,1),%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    // xmm1 = d(1)
    "movq 0x1(%0),%%xmm1 \n"
    "movq 0x1(%0,%1,1),%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    // xmm2 = d(2)
    "movq 0x2(%0),%%xmm2 \n"
    "movq 0x2(%0,%1,1),%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    // Middle column is added twice for its weight of 2.
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pabsw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "sub $0x8,%3 \n"
    "movq %%xmm0,(%0,%2,1) \n"
    "lea 0x8(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELYROW_SSSE3
4399
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
//
// Sobel is the unsigned saturating byte sum of src_sobelx and src_sobely.
// Each pass reads 16 bytes from each source and writes 16 ARGB pixels
// (64 bytes). All three pointers must be 16 byte aligned (movdqa is used).
//
// width is the pixel count; it is tested after each pass, so at least 16
// pixels are always processed.
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    // %1 becomes an offset relative to src_sobelx.
    "sub %0,%1 \n"
    // xmm5 = 0xff000000 in every dword: the alpha byte.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"

    // 16 pixel loop.
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1,1),%%xmm1 \n"
    "lea 0x10(%0),%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    // Replicate each Sobel byte 4 times, then force alpha to 255.
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm2 \n"
    "punpckhbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm1 \n"
    "punpckhwd %%xmm2,%%xmm2 \n"
    "por %%xmm5,%%xmm1 \n"
    "por %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklwd %%xmm0,%%xmm3 \n"
    "punpckhwd %%xmm0,%%xmm0 \n"
    "por %%xmm5,%%xmm3 \n"
    "por %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movdqa %%xmm1,(%2) \n"
    "movdqa %%xmm2,0x10(%2) \n"
    "movdqa %%xmm3,0x20(%2) \n"
    "movdqa %%xmm0,0x30(%2) \n"
    "lea 0x40(%2),%2 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELROW_SSE2
4452
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
//
// Sobel is the unsigned saturating byte sum of src_sobelx and src_sobely.
// Each pass reads 16 bytes from each source and writes 16 ARGB pixels
// (64 bytes). All three pointers must be 16 byte aligned (movdqa is used).
//
// width is the pixel count; it is tested after each pass, so at least 16
// pixels are always processed.
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    // %1 becomes an offset relative to src_sobelx.
    "sub %0,%1 \n"
    // xmm5 = all ones: supplies the 255 alpha bytes.
    "pcmpeqb %%xmm5,%%xmm5 \n"

    // 16 pixel loop.
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1,1),%%xmm1 \n"
    "lea 0x10(%0),%0 \n"
    // xmm2 = Sobel = saturate(X + Y)
    "movdqa %%xmm0,%%xmm2 \n"
    "paddusb %%xmm1,%%xmm2 \n"
    // xmm3 / xmm0 = words of (Sobel X, 255)
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "punpckhbw %%xmm5,%%xmm0 \n"
    // xmm4 / xmm1 = words of (Sobel Y, Sobel)
    "movdqa %%xmm1,%%xmm4 \n"
    "punpcklbw %%xmm2,%%xmm4 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    // Weave the word pairs into (Y, Sobel, X, 255) pixels.
    "movdqa %%xmm4,%%xmm6 \n"
    "punpcklwd %%xmm3,%%xmm6 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "punpcklwd %%xmm0,%%xmm7 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "sub $0x10,%3 \n"
    "movdqa %%xmm6,(%2) \n"
    "movdqa %%xmm4,0x10(%2) \n"
    "movdqa %%xmm7,0x20(%2) \n"
    "movdqa %%xmm1,0x30(%2) \n"
    "lea 0x40(%2),%2 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_SOBELXYROW_SSE2
4504
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//
// Every input pixel is 4 bytes; each byte is widened to an int32, so one
// pixel produces 4 int32 (16 bytes) of output. xmm0 carries the running sum
// of the current row, and the matching entry of previous_cumsum is added to
// it before the result is stored to cumsum.
//
// The 4 pixel loop is used only when width >= 4 and cumsum is 16 byte
// aligned; otherwise, and for the remaining pixels, the 1 pixel loop with
// unaligned accesses is used. width <= 0 writes nothing.
//
// NOTE(review): the 4 pixel loop also reads previous_cumsum with movdqa, but
// only cumsum is tested for alignment. Confirm callers keep the two rows a
// multiple of 16 bytes apart.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    // %2 becomes (previous_cumsum - cumsum) so %1 addresses both rows.
    "sub %1,%2 \n"
    "pxor %%xmm0,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "test $0xf,%1 \n"
    "jne 49f \n"

    // 4 pixel loop
    ".p2align 2 \n"
    "40: \n"
    "movdqu (%0),%%xmm2 \n"
    "lea 0x10(%0),%0 \n"
    // Widen 16 bytes to 4 vectors of int32: xmm2, xmm3, xmm4, xmm5.
    "movdqa %%xmm2,%%xmm4 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "punpckhwd %%xmm1,%%xmm3 \n"
    "punpckhbw %%xmm1,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"
    "punpckhwd %%xmm1,%%xmm5 \n"
    // Advance the running sum one pixel at a time and add the row above.
    "paddd %%xmm2,%%xmm0 \n"
    "movdqa (%1,%2,1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "movdqa 0x10(%1,%2,1),%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"
    "movdqa 0x20(%1,%2,1),%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"
    "movdqa 0x30(%1,%2,1),%%xmm5 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqa %%xmm2,(%1) \n"
    "movdqa %%xmm3,0x10(%1) \n"
    "movdqa %%xmm4,0x20(%1) \n"
    "movdqa %%xmm5,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    // Undo the bias of 4 (leaving remaining - 1) for the 1 pixel loop.
    "add $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop
    ".p2align 2 \n"
    "10: \n"
    "movd (%0),%%xmm2 \n"
    "lea 0x4(%0),%0 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu (%1,%2,1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"

    "19: \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4585
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Converts box sums taken from a cumulative sum table into averaged pixels.
//
// For each output pixel (4 int32 per pixel in the tables, 4 bytes in dst):
//   sum = topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
//   dst = saturate(round_to_int(sum * (1 / area)))
// width is an offset in int32 units (the index register is scaled by 4).
// 1 / area comes from rcpss, which is an approximate reciprocal.
//
// topleft and botleft are read with movdqa and as SSE memory operands, so
// both pointers and both (pointer + width * 4) addresses must be 16 byte
// aligned. dst is written unaligned. count is the number of output pixels;
// count <= 0 writes nothing.
//
// count uses the "+rm" constraint and may therefore be a memory operand, so
// the instructions that update it carry an explicit 'l' size suffix.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  asm volatile (
    // xmm4 = approximate 1.0f / area in all 4 lanes.
    "movd %5,%%xmm4 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "rcpss %%xmm4,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "subl $0x4,%3 \n"
    "jl 49f \n"

    // 4 pixel loop
    ".p2align 2 \n"
    "40: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "psubd (%0,%4,4),%%xmm0 \n"
    "psubd 0x10(%0,%4,4),%%xmm1 \n"
    "psubd 0x20(%0,%4,4),%%xmm2 \n"
    "psubd 0x30(%0,%4,4),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"
    "psubd 0x10(%1),%%xmm1 \n"
    "psubd 0x20(%1),%%xmm2 \n"
    "psubd 0x30(%1),%%xmm3 \n"
    "paddd (%1,%4,4),%%xmm0 \n"
    "paddd 0x10(%1,%4,4),%%xmm1 \n"
    "paddd 0x20(%1,%4,4),%%xmm2 \n"
    "paddd 0x30(%1,%4,4),%%xmm3 \n"
    "lea 0x40(%1),%1 \n"
    // Scale by 1 / area in float, then narrow int32 -> int16 -> uint8.
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "subl $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    // Undo the bias of 4 (leaving remaining - 1) for the 1 pixel loop.
    "addl $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop
    ".p2align 2 \n"
    "10: \n"
    "movdqa (%0),%%xmm0 \n"
    "psubd (%0,%4,4),%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"
    "paddd (%1,%4,4),%%xmm0 \n"
    "lea 0x10(%1),%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "subl $0x1,%3 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)      // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004675
#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd %%xmm0,%1
//
// src_dudv holds 4 floats: the starting (u, v) coordinate followed by the
// per pixel step (du, dv). For every destination pixel the coordinate is
// truncated to integers (x, y), the byte offset x * 4 + y * src_argb_stride
// is formed with pmaddwd, and the 4 byte pixel at src_argb + offset is
// copied to dst_argb. x, y and the stride are handled as 16 bit words by
// packssdw / pmaddwd.
//
// On x86_64 the computed offset is masked with 0x0fffffff (see the TODO).
// width is the number of destination pixels; width <= 0 writes nothing.
//
// width uses the "+rm" constraint and may therefore be a memory operand, so
// the instructions that update it carry an explicit 'l' size suffix.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    // xmm2 = (u, v), xmm7 = (du, dv).
    "movq (%3),%%xmm2 \n"
    "movq 0x8(%3),%%xmm7 \n"
    // xmm5 low dword = words (4, stride): the pmaddwd multipliers.
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"
    "movd %1,%%xmm5 \n"
    "subl $0x4,%4 \n"
    "jl 49f \n"

    // Set up 4 pixel stepping: xmm2 = coordinates of pixels 0 and 1,
    // xmm3 = pixels 2 and 3, xmm4 = 4 * (du, dv) in both halves.
    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"

    // 4 pixel loop
    ".p2align 4 \n"
    "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "cvttps2dq %%xmm3,%%xmm1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    // xmm0 = 4 byte offsets, one per dword.
    "pmaddwd %%xmm5,%%xmm0 \n"
#if defined(__x86_64__)
    // One 64 bit move fetches two offsets; split them into %1 and %5.
    "movd %%xmm0,%1 \n"
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"
    "shr $32,%5 \n"
    "pshufd $0xEE,%%xmm0,%%xmm0 \n"
#else
    "movd %%xmm0,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
#endif
    "movd (%0,%1,1),%%xmm1 \n"
    "movd (%0,%5,1),%%xmm6 \n"
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1,(%2) \n"
#if defined(__x86_64__)
    "movd %%xmm0,%1 \n"
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"
    "shr $32,%5 \n"
#else
    "movd %%xmm0,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%5 \n"
#endif
    "movd (%0,%1,1),%%xmm0 \n"
    "movd (%0,%5,1),%%xmm6 \n"
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    "subl $0x4,%4 \n"
    "movq %%xmm0,0x08(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jge 40b \n"

    "49: \n"
    // Undo the bias of 4 (leaving remaining - 1) for the 1 pixel loop.
    "addl $0x3,%4 \n"
    "jl 19f \n"

    // 1 pixel loop
    ".p2align 4 \n"
    "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%1 \n"
#if defined(__x86_64__)
    "and $0x0fffffff,%1 \n"
#endif
    "movd (%0,%1,1),%%xmm0 \n"
    "subl $0x1,%4 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(src_argb),              // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),              // %2
    "+r"(src_dudv),              // %3
    "+rm"(width),                // %4
    "+r"(temp)                   // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
4785
fbarchard@google.comb5491752012-11-20 09:44:46 +00004786// Bilinear image filtering.
4787// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
4788void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004789 ptrdiff_t src_stride, int dst_width,
4790 int source_y_fraction) {
4791 asm volatile (
4792 "sub %1,%0 \n"
4793 "shr %3 \n"
4794 "cmp $0x0,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004795 "je 100f \n"
4796 "cmp $0x20,%3 \n"
4797 "je 75f \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004798 "cmp $0x40,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004799 "je 50f \n"
4800 "cmp $0x60,%3 \n"
4801 "je 25f \n"
4802
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004803 "movd %3,%%xmm0 \n"
4804 "neg %3 \n"
4805 "add $0x80,%3 \n"
4806 "movd %3,%%xmm5 \n"
4807 "punpcklbw %%xmm0,%%xmm5 \n"
4808 "punpcklwd %%xmm5,%%xmm5 \n"
4809 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004810
4811 // General purpose row blend.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004812 ".p2align 4 \n"
4813 "1: \n"
4814 "movdqa (%1),%%xmm0 \n"
4815 "movdqa (%1,%4,1),%%xmm2 \n"
4816 "movdqa %%xmm0,%%xmm1 \n"
4817 "punpcklbw %%xmm2,%%xmm0 \n"
4818 "punpckhbw %%xmm2,%%xmm1 \n"
4819 "pmaddubsw %%xmm5,%%xmm0 \n"
4820 "pmaddubsw %%xmm5,%%xmm1 \n"
4821 "psrlw $0x7,%%xmm0 \n"
4822 "psrlw $0x7,%%xmm1 \n"
4823 "packuswb %%xmm1,%%xmm0 \n"
4824 "sub $0x4,%2 \n"
4825 "movdqa %%xmm0,(%1,%0,1) \n"
4826 "lea 0x10(%1),%1 \n"
4827 "jg 1b \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004828 "jmp 99f \n"
4829
4830 // Blend 25 / 75.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004831 ".p2align 4 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004832 "25: \n"
4833 "movdqa (%1),%%xmm0 \n"
4834 "movdqa (%1,%4,1),%%xmm1 \n"
4835 "pavgb %%xmm1,%%xmm0 \n"
4836 "pavgb %%xmm1,%%xmm0 \n"
4837 "sub $0x4,%2 \n"
4838 "movdqa %%xmm0,(%1,%0,1) \n"
4839 "lea 0x10(%1),%1 \n"
4840 "jg 25b \n"
4841 "jmp 99f \n"
4842
4843 // Blend 50 / 50.
4844 ".p2align 4 \n"
4845 "50: \n"
4846 "movdqa (%1),%%xmm0 \n"
4847 "movdqa (%1,%4,1),%%xmm1 \n"
4848 "pavgb %%xmm1,%%xmm0 \n"
4849 "sub $0x4,%2 \n"
4850 "movdqa %%xmm0,(%1,%0,1) \n"
4851 "lea 0x10(%1),%1 \n"
4852 "jg 50b \n"
4853 "jmp 99f \n"
4854
4855 // Blend 75 / 25.
4856 ".p2align 4 \n"
4857 "75: \n"
4858 "movdqa (%1),%%xmm1 \n"
4859 "movdqa (%1,%4,1),%%xmm0 \n"
4860 "pavgb %%xmm1,%%xmm0 \n"
4861 "pavgb %%xmm1,%%xmm0 \n"
4862 "sub $0x4,%2 \n"
4863 "movdqa %%xmm0,(%1,%0,1) \n"
4864 "lea 0x10(%1),%1 \n"
4865 "jg 75b \n"
4866 "jmp 99f \n"
4867
4868 // Blend 100 / 0 - Copy row unchanged.
4869 ".p2align 4 \n"
4870 "100: \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004871 "movdqa (%1),%%xmm0 \n"
4872 "sub $0x4,%2 \n"
4873 "movdqa %%xmm0,(%1,%0,1) \n"
4874 "lea 0x10(%1),%1 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004875 "jg 100b \n"
4876
fbarchard@google.comb5491752012-11-20 09:44:46 +00004877 "99: \n"
fbarchard@google.comaf137b62013-02-05 22:42:56 +00004878 : "+r"(dst_argb), // %0
4879 "+r"(src_argb), // %1
fbarchard@google.comb5491752012-11-20 09:44:46 +00004880 "+r"(dst_width), // %2
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004881 "+r"(source_y_fraction) // %3
4882 : "r"(static_cast<intptr_t>(src_stride)) // %4
4883 : "memory", "cc"
4884#if defined(__SSE2__)
4885 , "xmm0", "xmm1", "xmm2", "xmm5"
4886#endif
4887 );
4888}
4889
fbarchard@google.comaf137b62013-02-05 22:42:56 +00004890// Bilinear image filtering.
4891// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
4892void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
4893 ptrdiff_t src_stride, int dst_width,
4894 int source_y_fraction) {
4895 asm volatile (
4896 "sub %1,%0 \n"
4897 "shr %3 \n"
4898 "cmp $0x0,%3 \n"
4899 "je 100f \n"
4900 "cmp $0x20,%3 \n"
4901 "je 75f \n"
4902 "cmp $0x40,%3 \n"
4903 "je 50f \n"
4904 "cmp $0x60,%3 \n"
4905 "je 25f \n"
4906
4907 "movd %3,%%xmm0 \n"
4908 "neg %3 \n"
4909 "add $0x80,%3 \n"
4910 "movd %3,%%xmm5 \n"
4911 "punpcklbw %%xmm0,%%xmm5 \n"
4912 "punpcklwd %%xmm5,%%xmm5 \n"
4913 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4914 "pxor %%xmm4,%%xmm4 \n"
4915
4916 // General purpose row blend.
4917 ".p2align 4 \n"
4918 "1: \n"
4919 "movdqa (%1),%%xmm0 \n"
4920 "movdqa (%1,%4,1),%%xmm2 \n"
4921 "movdqa %%xmm0,%%xmm1 \n"
4922 "movdqa %%xmm2,%%xmm3 \n"
4923 "punpcklbw %%xmm4,%%xmm2 \n"
4924 "punpckhbw %%xmm4,%%xmm3 \n"
4925 "punpcklbw %%xmm4,%%xmm0 \n"
4926 "punpckhbw %%xmm4,%%xmm1 \n"
4927 "psubw %%xmm0,%%xmm2 \n"
4928 "psubw %%xmm1,%%xmm3 \n"
4929 "paddw %%xmm2,%%xmm2 \n"
4930 "paddw %%xmm3,%%xmm3 \n"
4931 "pmulhw %%xmm5,%%xmm2 \n"
4932 "pmulhw %%xmm5,%%xmm3 \n"
4933 "paddw %%xmm2,%%xmm0 \n"
4934 "paddw %%xmm3,%%xmm1 \n"
4935 "packuswb %%xmm1,%%xmm0 \n"
4936 "sub $0x4,%2 \n"
4937 "movdqa %%xmm0,(%1,%0,1) \n"
4938 "lea 0x10(%1),%1 \n"
4939 "jg 1b \n"
4940 "jmp 99f \n"
4941
4942 // Blend 25 / 75.
4943 ".p2align 4 \n"
4944 "25: \n"
4945 "movdqa (%1),%%xmm0 \n"
4946 "movdqa (%1,%4,1),%%xmm1 \n"
4947 "pavgb %%xmm1,%%xmm0 \n"
4948 "pavgb %%xmm1,%%xmm0 \n"
4949 "sub $0x4,%2 \n"
4950 "movdqa %%xmm0,(%1,%0,1) \n"
4951 "lea 0x10(%1),%1 \n"
4952 "jg 25b \n"
4953 "jmp 99f \n"
4954
4955 // Blend 50 / 50.
4956 ".p2align 4 \n"
4957 "50: \n"
4958 "movdqa (%1),%%xmm0 \n"
4959 "movdqa (%1,%4,1),%%xmm1 \n"
4960 "pavgb %%xmm1,%%xmm0 \n"
4961 "sub $0x4,%2 \n"
4962 "movdqa %%xmm0,(%1,%0,1) \n"
4963 "lea 0x10(%1),%1 \n"
4964 "jg 50b \n"
4965 "jmp 99f \n"
4966
4967 // Blend 75 / 25.
4968 ".p2align 4 \n"
4969 "75: \n"
4970 "movdqa (%1),%%xmm1 \n"
4971 "movdqa (%1,%4,1),%%xmm0 \n"
4972 "pavgb %%xmm1,%%xmm0 \n"
4973 "pavgb %%xmm1,%%xmm0 \n"
4974 "sub $0x4,%2 \n"
4975 "movdqa %%xmm0,(%1,%0,1) \n"
4976 "lea 0x10(%1),%1 \n"
4977 "jg 75b \n"
4978 "jmp 99f \n"
4979
4980 // Blend 100 / 0 - Copy row unchanged.
4981 ".p2align 4 \n"
4982 "100: \n"
4983 "movdqa (%1),%%xmm0 \n"
4984 "sub $0x4,%2 \n"
4985 "movdqa %%xmm0,(%1,%0,1) \n"
4986 "lea 0x10(%1),%1 \n"
4987 "jg 100b \n"
4988
4989 "99: \n"
4990 : "+r"(dst_argb), // %0
4991 "+r"(src_argb), // %1
4992 "+r"(dst_width), // %2
4993 "+r"(source_y_fraction) // %3
4994 : "r"(static_cast<intptr_t>(src_stride)) // %4
4995 : "memory", "cc"
4996#if defined(__SSE2__)
4997 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4998#endif
4999 );
5000}
5001
fbarchard@google.come91bdac2012-10-09 21:09:33 +00005002void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
5003 uint8* dst_uv, int pix) {
5004 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00005005 "sub %0,%1 \n"
5006 ".p2align 4 \n"
5007 "1: \n"
5008 "movdqa (%0),%%xmm0 \n"
5009 "pavgb (%0,%3),%%xmm0 \n"
5010 "sub $0x10,%2 \n"
5011 "movdqa %%xmm0,(%0,%1) \n"
5012 "lea 0x10(%0),%0 \n"
5013 "jg 1b \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00005014 : "+r"(src_uv), // %0
5015 "+r"(dst_uv), // %1
5016 "+r"(pix) // %2
5017 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
5018 : "memory", "cc"
5019#if defined(__SSE2__)
5020 , "xmm0"
5021#endif
5022 );
5023}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00005024
5025void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
5026 uint32 selector, int pix) {
5027 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00005028 "movd %3,%%xmm5 \n"
5029 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00005030 ".p2align 4 \n"
5031 "1: \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00005032 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00005033 "movdqa 0x10(%0),%%xmm1 \n"
5034 "lea 0x20(%0),%0 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00005035 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00005036 "pshufb %%xmm5,%%xmm1 \n"
fbarchard@google.coma3be4702013-03-22 05:20:02 +00005037 "punpckldq %%xmm1,%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00005038 "sub $0x8,%2 \n"
5039 "movq %%xmm0,(%1) \n"
5040 "lea 0x8(%1),%1 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00005041 "jg 1b \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00005042 : "+r"(src_argb), // %0
5043 "+r"(dst_bayer), // %1
5044 "+r"(pix) // %2
5045 : "g"(selector) // %3
5046 : "memory", "cc"
5047#if defined(__SSE2__)
fbarchard@google.come8df16b2013-03-22 04:47:14 +00005048 , "xmm0", "xmm1", "xmm5"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00005049#endif
5050 );
5051}
fbarchard@google.com9de88672012-10-12 06:23:33 +00005052
fbarchard@google.com10965432013-03-08 23:22:32 +00005053// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5054void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5055 const uint8* shuffler, int pix) {
5056 asm volatile (
5057 "movdqa (%3),%%xmm5 \n"
5058 ".p2align 4 \n"
5059 "1: \n"
5060 "movdqa (%0),%%xmm0 \n"
5061 "movdqa 0x10(%0),%%xmm1 \n"
5062 "lea 0x20(%0),%0 \n"
5063 "pshufb %%xmm5,%%xmm0 \n"
5064 "pshufb %%xmm5,%%xmm1 \n"
5065 "sub $0x8,%2 \n"
5066 "movdqa %%xmm0,(%1) \n"
5067 "movdqa %%xmm1,0x10(%1) \n"
5068 "lea 0x20(%1),%1 \n"
5069 "jg 1b \n"
5070 : "+r"(src_argb), // %0
5071 "+r"(dst_argb), // %1
5072 "+r"(pix) // %2
5073 : "r"(shuffler) // %3
5074 : "memory", "cc"
5075#if defined(__SSE2__)
5076 , "xmm0", "xmm1", "xmm5"
5077#endif
5078 );
5079}
5080
5081void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
5082 const uint8* shuffler, int pix) {
5083 asm volatile (
5084 "movdqa (%3),%%xmm5 \n"
5085 ".p2align 4 \n"
5086 "1: \n"
5087 "movdqu (%0),%%xmm0 \n"
5088 "movdqu 0x10(%0),%%xmm1 \n"
5089 "lea 0x20(%0),%0 \n"
5090 "pshufb %%xmm5,%%xmm0 \n"
5091 "pshufb %%xmm5,%%xmm1 \n"
5092 "sub $0x8,%2 \n"
5093 "movdqu %%xmm0,(%1) \n"
5094 "movdqu %%xmm1,0x10(%1) \n"
5095 "lea 0x20(%1),%1 \n"
5096 "jg 1b \n"
5097 : "+r"(src_argb), // %0
5098 "+r"(dst_argb), // %1
5099 "+r"(pix) // %2
5100 : "r"(shuffler) // %3
5101 : "memory", "cc"
5102#if defined(__SSE2__)
5103 , "xmm0", "xmm1", "xmm5"
5104#endif
5105 );
5106}
5107
fbarchard@google.com9de88672012-10-12 06:23:33 +00005108void I422ToYUY2Row_SSE2(const uint8* src_y,
5109 const uint8* src_u,
5110 const uint8* src_v,
5111 uint8* dst_frame, int width) {
5112 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00005113 "sub %1,%2 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005114 ".p2align 4 \n"
5115 "1: \n"
5116 "movq (%1),%%xmm2 \n"
5117 "movq (%1,%2,1),%%xmm3 \n"
5118 "lea 0x8(%1),%1 \n"
5119 "punpcklbw %%xmm3,%%xmm2 \n"
5120 "movdqa (%0),%%xmm0 \n"
5121 "lea 0x10(%0),%0 \n"
fbarchard@google.comf8e90172013-04-02 21:18:12 +00005122 "movdqu %%xmm0,%%xmm1 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005123 "punpcklbw %%xmm2,%%xmm0 \n"
5124 "punpckhbw %%xmm2,%%xmm1 \n"
fbarchard@google.comf8e90172013-04-02 21:18:12 +00005125 "movdqu %%xmm0,(%3) \n"
5126 "movdqu %%xmm1,0x10(%3) \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005127 "lea 0x20(%3),%3 \n"
5128 "sub $0x10,%4 \n"
5129 "jg 1b \n"
5130 : "+r"(src_y), // %0
5131 "+r"(src_u), // %1
5132 "+r"(src_v), // %2
5133 "+r"(dst_frame), // %3
5134 "+rm"(width) // %4
5135 :
5136 : "memory", "cc"
5137#if defined(__SSE2__)
5138 , "xmm0", "xmm1", "xmm2", "xmm3"
5139#endif
5140 );
5141}
5142
5143void I422ToUYVYRow_SSE2(const uint8* src_y,
5144 const uint8* src_u,
5145 const uint8* src_v,
5146 uint8* dst_frame, int width) {
5147 asm volatile (
5148 "sub %1,%2 \n"
5149 ".p2align 4 \n"
5150 "1: \n"
5151 "movq (%1),%%xmm2 \n"
5152 "movq (%1,%2,1),%%xmm3 \n"
5153 "lea 0x8(%1),%1 \n"
5154 "punpcklbw %%xmm3,%%xmm2 \n"
fbarchard@google.comf8e90172013-04-02 21:18:12 +00005155 "movdqu (%0),%%xmm0 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005156 "movdqa %%xmm2,%%xmm1 \n"
5157 "lea 0x10(%0),%0 \n"
5158 "punpcklbw %%xmm0,%%xmm1 \n"
5159 "punpckhbw %%xmm0,%%xmm2 \n"
fbarchard@google.comf8e90172013-04-02 21:18:12 +00005160 "movdqu %%xmm1,(%3) \n"
5161 "movdqu %%xmm2,0x10(%3) \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005162 "lea 0x20(%3),%3 \n"
5163 "sub $0x10,%4 \n"
5164 "jg 1b \n"
5165 : "+r"(src_y), // %0
5166 "+r"(src_u), // %1
5167 "+r"(src_v), // %2
5168 "+r"(dst_frame), // %3
5169 "+rm"(width) // %4
5170 :
5171 : "memory", "cc"
5172#if defined(__SSE2__)
5173 , "xmm0", "xmm1", "xmm2", "xmm3"
5174#endif
5175 );
5176}
5177
fbarchard@google.com2d11d432012-02-16 02:50:39 +00005178#endif // defined(__x86_64__) || defined(__i386__)
5179
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00005180#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00005181} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00005182} // namespace libyuv
5183#endif