blob: 65416c0f79f866bbe30beb22f98666fdb3176e4e [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
fbarchard@google.comb0c97972012-08-08 19:04:24 +00002 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
fbarchard@google.com714050a2012-02-17 22:59:56 +000023// GCC 4.2 on OSX has link error when passing static or const to inline.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000025#ifdef __APPLE__
26#define CONST
27#else
28#define CONST static const
29#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000101// Shuffle table for converting RGBA to ARGB.
102CONST uvec8 kShuffleMaskRGBAToARGB = {
103 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
104};
105
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000106// Shuffle table for converting ARGB to RGBA.
107CONST uvec8 kShuffleMaskARGBToRGBA = {
108 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
109};
110
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000111// Shuffle table for converting ARGB to RGB24.
112CONST uvec8 kShuffleMaskARGBToRGB24 = {
113 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
114};
115
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000116// Shuffle table for converting ARGB to RAW.
117CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000118 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000119};
120
fbarchard@google.comb6149762011-11-07 21:58:52 +0000121void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000122 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000123 "pcmpeqb %%xmm5,%%xmm5 \n"
124 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000125 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000126 "1: \n"
127 "movq (%0),%%xmm0 \n"
128 "lea 0x8(%0),%0 \n"
129 "punpcklbw %%xmm0,%%xmm0 \n"
130 "movdqa %%xmm0,%%xmm1 \n"
131 "punpcklwd %%xmm0,%%xmm0 \n"
132 "punpckhwd %%xmm1,%%xmm1 \n"
133 "por %%xmm5,%%xmm0 \n"
134 "por %%xmm5,%%xmm1 \n"
135 "movdqa %%xmm0,(%1) \n"
136 "movdqa %%xmm1,0x10(%1) \n"
137 "lea 0x20(%1),%1 \n"
138 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000139 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000140 : "+r"(src_y), // %0
141 "+r"(dst_argb), // %1
142 "+r"(pix) // %2
143 :
144 : "memory", "cc"
145#if defined(__SSE2__)
146 , "xmm0", "xmm1", "xmm5"
147#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000148 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000149}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000150
151void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000152 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000153 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000154 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000155 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000156 "1: \n"
157 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000158 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000159 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000160 "movdqa %%xmm0,(%0,%1,1) \n"
161 "lea 0x10(%0),%0 \n"
162 "jg 1b \n"
163
fbarchard@google.comb6149762011-11-07 21:58:52 +0000164 : "+r"(src_abgr), // %0
165 "+r"(dst_argb), // %1
166 "+r"(pix) // %2
167 : "m"(kShuffleMaskABGRToARGB) // %3
168 : "memory", "cc"
169#if defined(__SSE2__)
170 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000171#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000172 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173}
174
175void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000176 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000177 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000178 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000179 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000180 "1: \n"
181 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000182 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000183 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000184 "movdqa %%xmm0,(%0,%1,1) \n"
185 "lea 0x10(%0),%0 \n"
186 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000187 : "+r"(src_bgra), // %0
188 "+r"(dst_argb), // %1
189 "+r"(pix) // %2
190 : "m"(kShuffleMaskBGRAToARGB) // %3
191 : "memory", "cc"
192#if defined(__SSE2__)
193 , "xmm0", "xmm5"
194#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000195 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000196}
197
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000198void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
199 asm volatile (
200 "movdqa %3,%%xmm5 \n"
201 "sub %0,%1 \n"
202 ".p2align 4 \n"
203 "1: \n"
204 "movdqa (%0),%%xmm0 \n"
205 "pshufb %%xmm5,%%xmm0 \n"
206 "sub $0x4,%2 \n"
207 "movdqa %%xmm0,(%0,%1,1) \n"
208 "lea 0x10(%0),%0 \n"
209 "jg 1b \n"
210
211 : "+r"(src_rgba), // %0
212 "+r"(dst_argb), // %1
213 "+r"(pix) // %2
214 : "m"(kShuffleMaskRGBAToARGB) // %3
215 : "memory", "cc"
216#if defined(__SSE2__)
217 , "xmm0", "xmm5"
218#endif
219 );
220}
221
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000222void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
223 asm volatile (
224 "movdqa %3,%%xmm5 \n"
225 "sub %0,%1 \n"
226 ".p2align 4 \n"
227 "1: \n"
228 "movdqa (%0),%%xmm0 \n"
229 "pshufb %%xmm5,%%xmm0 \n"
230 "sub $0x4,%2 \n"
231 "movdqa %%xmm0,(%0,%1,1) \n"
232 "lea 0x10(%0),%0 \n"
233 "jg 1b \n"
234
235 : "+r"(src_argb), // %0
236 "+r"(dst_rgba), // %1
237 "+r"(pix) // %2
238 : "m"(kShuffleMaskARGBToRGBA) // %3
239 : "memory", "cc"
240#if defined(__SSE2__)
241 , "xmm0", "xmm5"
242#endif
243 );
244}
245
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000246void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000247 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000248 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
249 "pslld $0x18,%%xmm5 \n"
250 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000251 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000252 "1: \n"
253 "movdqu (%0),%%xmm0 \n"
254 "movdqu 0x10(%0),%%xmm1 \n"
255 "movdqu 0x20(%0),%%xmm3 \n"
256 "lea 0x30(%0),%0 \n"
257 "movdqa %%xmm3,%%xmm2 \n"
258 "palignr $0x8,%%xmm1,%%xmm2 \n"
259 "pshufb %%xmm4,%%xmm2 \n"
260 "por %%xmm5,%%xmm2 \n"
261 "palignr $0xc,%%xmm0,%%xmm1 \n"
262 "pshufb %%xmm4,%%xmm0 \n"
263 "movdqa %%xmm2,0x20(%1) \n"
264 "por %%xmm5,%%xmm0 \n"
265 "pshufb %%xmm4,%%xmm1 \n"
266 "movdqa %%xmm0,(%1) \n"
267 "por %%xmm5,%%xmm1 \n"
268 "palignr $0x4,%%xmm3,%%xmm3 \n"
269 "pshufb %%xmm4,%%xmm3 \n"
270 "movdqa %%xmm1,0x10(%1) \n"
271 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000272 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000273 "movdqa %%xmm3,0x30(%1) \n"
274 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000275 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000276 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000277 "+r"(dst_argb), // %1
278 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000279 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000280 : "memory", "cc"
281#if defined(__SSE2__)
282 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
283#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000284 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000285}
286
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000287void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000288 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000289 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
290 "pslld $0x18,%%xmm5 \n"
291 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000292 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000293 "1: \n"
294 "movdqu (%0),%%xmm0 \n"
295 "movdqu 0x10(%0),%%xmm1 \n"
296 "movdqu 0x20(%0),%%xmm3 \n"
297 "lea 0x30(%0),%0 \n"
298 "movdqa %%xmm3,%%xmm2 \n"
299 "palignr $0x8,%%xmm1,%%xmm2 \n"
300 "pshufb %%xmm4,%%xmm2 \n"
301 "por %%xmm5,%%xmm2 \n"
302 "palignr $0xc,%%xmm0,%%xmm1 \n"
303 "pshufb %%xmm4,%%xmm0 \n"
304 "movdqa %%xmm2,0x20(%1) \n"
305 "por %%xmm5,%%xmm0 \n"
306 "pshufb %%xmm4,%%xmm1 \n"
307 "movdqa %%xmm0,(%1) \n"
308 "por %%xmm5,%%xmm1 \n"
309 "palignr $0x4,%%xmm3,%%xmm3 \n"
310 "pshufb %%xmm4,%%xmm3 \n"
311 "movdqa %%xmm1,0x10(%1) \n"
312 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000313 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000314 "movdqa %%xmm3,0x30(%1) \n"
315 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000316 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000317 : "+r"(src_raw), // %0
318 "+r"(dst_argb), // %1
319 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000320 : "m"(kShuffleMaskRAWToARGB) // %3
321 : "memory", "cc"
322#if defined(__SSE2__)
323 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
324#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000325 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000326}
327
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000328void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000329 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000330 "mov $0x1080108,%%eax \n"
331 "movd %%eax,%%xmm5 \n"
332 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000333 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000334 "movd %%eax,%%xmm6 \n"
335 "pshufd $0x0,%%xmm6,%%xmm6 \n"
336 "pcmpeqb %%xmm3,%%xmm3 \n"
337 "psllw $0xb,%%xmm3 \n"
338 "pcmpeqb %%xmm4,%%xmm4 \n"
339 "psllw $0xa,%%xmm4 \n"
340 "psrlw $0x5,%%xmm4 \n"
341 "pcmpeqb %%xmm7,%%xmm7 \n"
342 "psllw $0x8,%%xmm7 \n"
343 "sub %0,%1 \n"
344 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000345 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000346 "1: \n"
347 "movdqu (%0),%%xmm0 \n"
348 "movdqa %%xmm0,%%xmm1 \n"
349 "movdqa %%xmm0,%%xmm2 \n"
350 "pand %%xmm3,%%xmm1 \n"
351 "psllw $0xb,%%xmm2 \n"
352 "pmulhuw %%xmm5,%%xmm1 \n"
353 "pmulhuw %%xmm5,%%xmm2 \n"
354 "psllw $0x8,%%xmm1 \n"
355 "por %%xmm2,%%xmm1 \n"
356 "pand %%xmm4,%%xmm0 \n"
357 "pmulhuw %%xmm6,%%xmm0 \n"
358 "por %%xmm7,%%xmm0 \n"
359 "movdqa %%xmm1,%%xmm2 \n"
360 "punpcklbw %%xmm0,%%xmm1 \n"
361 "punpckhbw %%xmm0,%%xmm2 \n"
362 "movdqa %%xmm1,(%1,%0,2) \n"
363 "movdqa %%xmm2,0x10(%1,%0,2) \n"
364 "lea 0x10(%0),%0 \n"
365 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000366 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000367 : "+r"(src), // %0
368 "+r"(dst), // %1
369 "+r"(pix) // %2
370 :
371 : "memory", "cc", "eax"
372#if defined(__SSE2__)
373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
374#endif
375 );
376}
377
378void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000379 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000380 "mov $0x1080108,%%eax \n"
381 "movd %%eax,%%xmm5 \n"
382 "pshufd $0x0,%%xmm5,%%xmm5 \n"
383 "mov $0x42004200,%%eax \n"
384 "movd %%eax,%%xmm6 \n"
385 "pshufd $0x0,%%xmm6,%%xmm6 \n"
386 "pcmpeqb %%xmm3,%%xmm3 \n"
387 "psllw $0xb,%%xmm3 \n"
388 "movdqa %%xmm3,%%xmm4 \n"
389 "psrlw $0x6,%%xmm4 \n"
390 "pcmpeqb %%xmm7,%%xmm7 \n"
391 "psllw $0x8,%%xmm7 \n"
392 "sub %0,%1 \n"
393 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000394 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000395 "1: \n"
396 "movdqu (%0),%%xmm0 \n"
397 "movdqa %%xmm0,%%xmm1 \n"
398 "movdqa %%xmm0,%%xmm2 \n"
399 "psllw $0x1,%%xmm1 \n"
400 "psllw $0xb,%%xmm2 \n"
401 "pand %%xmm3,%%xmm1 \n"
402 "pmulhuw %%xmm5,%%xmm2 \n"
403 "pmulhuw %%xmm5,%%xmm1 \n"
404 "psllw $0x8,%%xmm1 \n"
405 "por %%xmm2,%%xmm1 \n"
406 "movdqa %%xmm0,%%xmm2 \n"
407 "pand %%xmm4,%%xmm0 \n"
408 "psraw $0x8,%%xmm2 \n"
409 "pmulhuw %%xmm6,%%xmm0 \n"
410 "pand %%xmm7,%%xmm2 \n"
411 "por %%xmm2,%%xmm0 \n"
412 "movdqa %%xmm1,%%xmm2 \n"
413 "punpcklbw %%xmm0,%%xmm1 \n"
414 "punpckhbw %%xmm0,%%xmm2 \n"
415 "movdqa %%xmm1,(%1,%0,2) \n"
416 "movdqa %%xmm2,0x10(%1,%0,2) \n"
417 "lea 0x10(%0),%0 \n"
418 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000419 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000420 : "+r"(src), // %0
421 "+r"(dst), // %1
422 "+r"(pix) // %2
423 :
424 : "memory", "cc", "eax"
425#if defined(__SSE2__)
426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
427#endif
428 );
429}
430
431void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000432 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000433 "mov $0xf0f0f0f,%%eax \n"
434 "movd %%eax,%%xmm4 \n"
435 "pshufd $0x0,%%xmm4,%%xmm4 \n"
436 "movdqa %%xmm4,%%xmm5 \n"
437 "pslld $0x4,%%xmm5 \n"
438 "sub %0,%1 \n"
439 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000440 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000441 "1: \n"
442 "movdqu (%0),%%xmm0 \n"
443 "movdqa %%xmm0,%%xmm2 \n"
444 "pand %%xmm4,%%xmm0 \n"
445 "pand %%xmm5,%%xmm2 \n"
446 "movdqa %%xmm0,%%xmm1 \n"
447 "movdqa %%xmm2,%%xmm3 \n"
448 "psllw $0x4,%%xmm1 \n"
449 "psrlw $0x4,%%xmm3 \n"
450 "por %%xmm1,%%xmm0 \n"
451 "por %%xmm3,%%xmm2 \n"
452 "movdqa %%xmm0,%%xmm1 \n"
453 "punpcklbw %%xmm2,%%xmm0 \n"
454 "punpckhbw %%xmm2,%%xmm1 \n"
455 "movdqa %%xmm0,(%1,%0,2) \n"
456 "movdqa %%xmm1,0x10(%1,%0,2) \n"
457 "lea 0x10(%0),%0 \n"
458 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000459 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000460 : "+r"(src), // %0
461 "+r"(dst), // %1
462 "+r"(pix) // %2
463 :
464 : "memory", "cc", "eax"
465#if defined(__SSE2__)
466 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
467#endif
468 );
469}
470
471void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000472 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000473 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000474 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000475 "1: \n"
476 "movdqa (%0),%%xmm0 \n"
477 "movdqa 0x10(%0),%%xmm1 \n"
478 "movdqa 0x20(%0),%%xmm2 \n"
479 "movdqa 0x30(%0),%%xmm3 \n"
480 "lea 0x40(%0),%0 \n"
481 "pshufb %%xmm6,%%xmm0 \n"
482 "pshufb %%xmm6,%%xmm1 \n"
483 "pshufb %%xmm6,%%xmm2 \n"
484 "pshufb %%xmm6,%%xmm3 \n"
485 "movdqa %%xmm1,%%xmm4 \n"
486 "psrldq $0x4,%%xmm1 \n"
487 "pslldq $0xc,%%xmm4 \n"
488 "movdqa %%xmm2,%%xmm5 \n"
489 "por %%xmm4,%%xmm0 \n"
490 "pslldq $0x8,%%xmm5 \n"
491 "movdqa %%xmm0,(%1) \n"
492 "por %%xmm5,%%xmm1 \n"
493 "psrldq $0x8,%%xmm2 \n"
494 "pslldq $0x4,%%xmm3 \n"
495 "por %%xmm3,%%xmm2 \n"
496 "movdqa %%xmm1,0x10(%1) \n"
497 "movdqa %%xmm2,0x20(%1) \n"
498 "lea 0x30(%1),%1 \n"
499 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000500 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000501 : "+r"(src), // %0
502 "+r"(dst), // %1
503 "+r"(pix) // %2
504 : "m"(kShuffleMaskARGBToRGB24) // %3
505 : "memory", "cc"
506#if defined(__SSE2__)
507 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
508#endif
509 );
510}
511
512void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000513 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000514 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000515 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000516 "1: \n"
517 "movdqa (%0),%%xmm0 \n"
518 "movdqa 0x10(%0),%%xmm1 \n"
519 "movdqa 0x20(%0),%%xmm2 \n"
520 "movdqa 0x30(%0),%%xmm3 \n"
521 "lea 0x40(%0),%0 \n"
522 "pshufb %%xmm6,%%xmm0 \n"
523 "pshufb %%xmm6,%%xmm1 \n"
524 "pshufb %%xmm6,%%xmm2 \n"
525 "pshufb %%xmm6,%%xmm3 \n"
526 "movdqa %%xmm1,%%xmm4 \n"
527 "psrldq $0x4,%%xmm1 \n"
528 "pslldq $0xc,%%xmm4 \n"
529 "movdqa %%xmm2,%%xmm5 \n"
530 "por %%xmm4,%%xmm0 \n"
531 "pslldq $0x8,%%xmm5 \n"
532 "movdqa %%xmm0,(%1) \n"
533 "por %%xmm5,%%xmm1 \n"
534 "psrldq $0x8,%%xmm2 \n"
535 "pslldq $0x4,%%xmm3 \n"
536 "por %%xmm3,%%xmm2 \n"
537 "movdqa %%xmm1,0x10(%1) \n"
538 "movdqa %%xmm2,0x20(%1) \n"
539 "lea 0x30(%1),%1 \n"
540 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000541 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000542 : "+r"(src), // %0
543 "+r"(dst), // %1
544 "+r"(pix) // %2
545 : "m"(kShuffleMaskARGBToRAW) // %3
546 : "memory", "cc"
547#if defined(__SSE2__)
548 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
549#endif
550 );
551}
552
553void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000554 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000555 "pcmpeqb %%xmm3,%%xmm3 \n"
556 "psrld $0x1b,%%xmm3 \n"
557 "pcmpeqb %%xmm4,%%xmm4 \n"
558 "psrld $0x1a,%%xmm4 \n"
559 "pslld $0x5,%%xmm4 \n"
560 "pcmpeqb %%xmm5,%%xmm5 \n"
561 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000562 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000563 "1: \n"
564 "movdqa (%0),%%xmm0 \n"
565 "movdqa %%xmm0,%%xmm1 \n"
566 "movdqa %%xmm0,%%xmm2 \n"
567 "pslld $0x8,%%xmm0 \n"
568 "psrld $0x3,%%xmm1 \n"
569 "psrld $0x5,%%xmm2 \n"
570 "psrad $0x10,%%xmm0 \n"
571 "pand %%xmm3,%%xmm1 \n"
572 "pand %%xmm4,%%xmm2 \n"
573 "pand %%xmm5,%%xmm0 \n"
574 "por %%xmm2,%%xmm1 \n"
575 "por %%xmm1,%%xmm0 \n"
576 "packssdw %%xmm0,%%xmm0 \n"
577 "lea 0x10(%0),%0 \n"
578 "movq %%xmm0,(%1) \n"
579 "lea 0x8(%1),%1 \n"
580 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000581 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000582 : "+r"(src), // %0
583 "+r"(dst), // %1
584 "+r"(pix) // %2
585 :
586 : "memory", "cc"
587#if defined(__SSE2__)
588 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
589#endif
590 );
591}
592
593void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000594 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000595 "pcmpeqb %%xmm4,%%xmm4 \n"
596 "psrld $0x1b,%%xmm4 \n"
597 "movdqa %%xmm4,%%xmm5 \n"
598 "pslld $0x5,%%xmm5 \n"
599 "movdqa %%xmm4,%%xmm6 \n"
600 "pslld $0xa,%%xmm6 \n"
601 "pcmpeqb %%xmm7,%%xmm7 \n"
602 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000603 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000604 "1: \n"
605 "movdqa (%0),%%xmm0 \n"
606 "movdqa %%xmm0,%%xmm1 \n"
607 "movdqa %%xmm0,%%xmm2 \n"
608 "movdqa %%xmm0,%%xmm3 \n"
609 "psrad $0x10,%%xmm0 \n"
610 "psrld $0x3,%%xmm1 \n"
611 "psrld $0x6,%%xmm2 \n"
612 "psrld $0x9,%%xmm3 \n"
613 "pand %%xmm7,%%xmm0 \n"
614 "pand %%xmm4,%%xmm1 \n"
615 "pand %%xmm5,%%xmm2 \n"
616 "pand %%xmm6,%%xmm3 \n"
617 "por %%xmm1,%%xmm0 \n"
618 "por %%xmm3,%%xmm2 \n"
619 "por %%xmm2,%%xmm0 \n"
620 "packssdw %%xmm0,%%xmm0 \n"
621 "lea 0x10(%0),%0 \n"
622 "movq %%xmm0,(%1) \n"
623 "lea 0x8(%1),%1 \n"
624 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000625 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000626 : "+r"(src), // %0
627 "+r"(dst), // %1
628 "+r"(pix) // %2
629 :
630 : "memory", "cc"
631#if defined(__SSE2__)
632 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
633#endif
634 );
635}
636
637void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000638 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000639 "pcmpeqb %%xmm4,%%xmm4 \n"
640 "psllw $0xc,%%xmm4 \n"
641 "movdqa %%xmm4,%%xmm3 \n"
642 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000643 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000644 "1: \n"
645 "movdqa (%0),%%xmm0 \n"
646 "movdqa %%xmm0,%%xmm1 \n"
647 "pand %%xmm3,%%xmm0 \n"
648 "pand %%xmm4,%%xmm1 \n"
649 "psrlq $0x4,%%xmm0 \n"
650 "psrlq $0x8,%%xmm1 \n"
651 "por %%xmm1,%%xmm0 \n"
652 "packuswb %%xmm0,%%xmm0 \n"
653 "lea 0x10(%0),%0 \n"
654 "movq %%xmm0,(%1) \n"
655 "lea 0x8(%1),%1 \n"
656 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000657 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000658 : "+r"(src), // %0
659 "+r"(dst), // %1
660 "+r"(pix) // %2
661 :
662 : "memory", "cc"
663#if defined(__SSE2__)
664 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
665#endif
666 );
667}
668
fbarchard@google.comb6149762011-11-07 21:58:52 +0000669void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000670 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000671 "movdqa %4,%%xmm5 \n"
672 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000673 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000674 "1: \n"
675 "movdqa (%0),%%xmm0 \n"
676 "movdqa 0x10(%0),%%xmm1 \n"
677 "movdqa 0x20(%0),%%xmm2 \n"
678 "movdqa 0x30(%0),%%xmm3 \n"
679 "pmaddubsw %%xmm4,%%xmm0 \n"
680 "pmaddubsw %%xmm4,%%xmm1 \n"
681 "pmaddubsw %%xmm4,%%xmm2 \n"
682 "pmaddubsw %%xmm4,%%xmm3 \n"
683 "lea 0x40(%0),%0 \n"
684 "phaddw %%xmm1,%%xmm0 \n"
685 "phaddw %%xmm3,%%xmm2 \n"
686 "psrlw $0x7,%%xmm0 \n"
687 "psrlw $0x7,%%xmm2 \n"
688 "packuswb %%xmm2,%%xmm0 \n"
689 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000690 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000691 "movdqa %%xmm0,(%1) \n"
692 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000693 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000694 : "+r"(src_argb), // %0
695 "+r"(dst_y), // %1
696 "+r"(pix) // %2
697 : "m"(kARGBToY), // %3
698 "m"(kAddY16) // %4
699 : "memory", "cc"
700#if defined(__SSE2__)
701 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
702#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000703 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000704}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000705
706void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000707 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000708 "movdqa %4,%%xmm5 \n"
709 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000710 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000711 "1: \n"
712 "movdqu (%0),%%xmm0 \n"
713 "movdqu 0x10(%0),%%xmm1 \n"
714 "movdqu 0x20(%0),%%xmm2 \n"
715 "movdqu 0x30(%0),%%xmm3 \n"
716 "pmaddubsw %%xmm4,%%xmm0 \n"
717 "pmaddubsw %%xmm4,%%xmm1 \n"
718 "pmaddubsw %%xmm4,%%xmm2 \n"
719 "pmaddubsw %%xmm4,%%xmm3 \n"
720 "lea 0x40(%0),%0 \n"
721 "phaddw %%xmm1,%%xmm0 \n"
722 "phaddw %%xmm3,%%xmm2 \n"
723 "psrlw $0x7,%%xmm0 \n"
724 "psrlw $0x7,%%xmm2 \n"
725 "packuswb %%xmm2,%%xmm0 \n"
726 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000727 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000728 "movdqu %%xmm0,(%1) \n"
729 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000730 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000731 : "+r"(src_argb), // %0
732 "+r"(dst_y), // %1
733 "+r"(pix) // %2
734 : "m"(kARGBToY), // %3
735 "m"(kAddY16) // %4
736 : "memory", "cc"
737#if defined(__SSE2__)
738 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
739#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000740 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000741}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000742
fbarchard@google.com714050a2012-02-17 22:59:56 +0000743// TODO(fbarchard): pass xmm constants to single block of assembly.
744// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
745// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
746// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
747// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000748void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
749 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000750 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000751 "movdqa %0,%%xmm4 \n"
752 "movdqa %1,%%xmm3 \n"
753 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000754 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000755 : "m"(kARGBToU), // %0
756 "m"(kARGBToV), // %1
757 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000758 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000759 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000760 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000761 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000762 "1: \n"
763 "movdqa (%0),%%xmm0 \n"
764 "movdqa 0x10(%0),%%xmm1 \n"
765 "movdqa 0x20(%0),%%xmm2 \n"
766 "movdqa 0x30(%0),%%xmm6 \n"
767 "pavgb (%0,%4,1),%%xmm0 \n"
768 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
769 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
770 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
771 "lea 0x40(%0),%0 \n"
772 "movdqa %%xmm0,%%xmm7 \n"
773 "shufps $0x88,%%xmm1,%%xmm0 \n"
774 "shufps $0xdd,%%xmm1,%%xmm7 \n"
775 "pavgb %%xmm7,%%xmm0 \n"
776 "movdqa %%xmm2,%%xmm7 \n"
777 "shufps $0x88,%%xmm6,%%xmm2 \n"
778 "shufps $0xdd,%%xmm6,%%xmm7 \n"
779 "pavgb %%xmm7,%%xmm2 \n"
780 "movdqa %%xmm0,%%xmm1 \n"
781 "movdqa %%xmm2,%%xmm6 \n"
782 "pmaddubsw %%xmm4,%%xmm0 \n"
783 "pmaddubsw %%xmm4,%%xmm2 \n"
784 "pmaddubsw %%xmm3,%%xmm1 \n"
785 "pmaddubsw %%xmm3,%%xmm6 \n"
786 "phaddw %%xmm2,%%xmm0 \n"
787 "phaddw %%xmm6,%%xmm1 \n"
788 "psraw $0x8,%%xmm0 \n"
789 "psraw $0x8,%%xmm1 \n"
790 "packsswb %%xmm1,%%xmm0 \n"
791 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000792 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000793 "movlps %%xmm0,(%1) \n"
794 "movhps %%xmm0,(%1,%2,1) \n"
795 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000796 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000797 : "+r"(src_argb0), // %0
798 "+r"(dst_u), // %1
799 "+r"(dst_v), // %2
800 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000801 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000802 : "memory", "cc"
803#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000804 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000805#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000806 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000807}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000808
809void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
810 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000811 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000812 "movdqa %0,%%xmm4 \n"
813 "movdqa %1,%%xmm3 \n"
814 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000815 :
816 : "m"(kARGBToU), // %0
817 "m"(kARGBToV), // %1
818 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000819 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000820 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000821 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000822 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000823 "1: \n"
824 "movdqu (%0),%%xmm0 \n"
825 "movdqu 0x10(%0),%%xmm1 \n"
826 "movdqu 0x20(%0),%%xmm2 \n"
827 "movdqu 0x30(%0),%%xmm6 \n"
828 "movdqu (%0,%4,1),%%xmm7 \n"
829 "pavgb %%xmm7,%%xmm0 \n"
830 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
831 "pavgb %%xmm7,%%xmm1 \n"
832 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
833 "pavgb %%xmm7,%%xmm2 \n"
834 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
835 "pavgb %%xmm7,%%xmm6 \n"
836 "lea 0x40(%0),%0 \n"
837 "movdqa %%xmm0,%%xmm7 \n"
838 "shufps $0x88,%%xmm1,%%xmm0 \n"
839 "shufps $0xdd,%%xmm1,%%xmm7 \n"
840 "pavgb %%xmm7,%%xmm0 \n"
841 "movdqa %%xmm2,%%xmm7 \n"
842 "shufps $0x88,%%xmm6,%%xmm2 \n"
843 "shufps $0xdd,%%xmm6,%%xmm7 \n"
844 "pavgb %%xmm7,%%xmm2 \n"
845 "movdqa %%xmm0,%%xmm1 \n"
846 "movdqa %%xmm2,%%xmm6 \n"
847 "pmaddubsw %%xmm4,%%xmm0 \n"
848 "pmaddubsw %%xmm4,%%xmm2 \n"
849 "pmaddubsw %%xmm3,%%xmm1 \n"
850 "pmaddubsw %%xmm3,%%xmm6 \n"
851 "phaddw %%xmm2,%%xmm0 \n"
852 "phaddw %%xmm6,%%xmm1 \n"
853 "psraw $0x8,%%xmm0 \n"
854 "psraw $0x8,%%xmm1 \n"
855 "packsswb %%xmm1,%%xmm0 \n"
856 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000857 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000858 "movlps %%xmm0,(%1) \n"
859 "movhps %%xmm0,(%1,%2,1) \n"
860 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000861 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000862 : "+r"(src_argb0), // %0
863 "+r"(dst_u), // %1
864 "+r"(dst_v), // %2
865 "+rm"(width) // %3
866 : "r"(static_cast<intptr_t>(src_stride_argb))
867 : "memory", "cc"
868#if defined(__SSE2__)
869 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
870#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000871 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000872}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000873
fbarchard@google.com714050a2012-02-17 22:59:56 +0000874void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000875 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000876 "movdqa %4,%%xmm5 \n"
877 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000878 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000879 "1: \n"
880 "movdqa (%0),%%xmm0 \n"
881 "movdqa 0x10(%0),%%xmm1 \n"
882 "movdqa 0x20(%0),%%xmm2 \n"
883 "movdqa 0x30(%0),%%xmm3 \n"
884 "pmaddubsw %%xmm4,%%xmm0 \n"
885 "pmaddubsw %%xmm4,%%xmm1 \n"
886 "pmaddubsw %%xmm4,%%xmm2 \n"
887 "pmaddubsw %%xmm4,%%xmm3 \n"
888 "lea 0x40(%0),%0 \n"
889 "phaddw %%xmm1,%%xmm0 \n"
890 "phaddw %%xmm3,%%xmm2 \n"
891 "psrlw $0x7,%%xmm0 \n"
892 "psrlw $0x7,%%xmm2 \n"
893 "packuswb %%xmm2,%%xmm0 \n"
894 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000895 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000896 "movdqa %%xmm0,(%1) \n"
897 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000898 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000899 : "+r"(src_bgra), // %0
900 "+r"(dst_y), // %1
901 "+r"(pix) // %2
902 : "m"(kBGRAToY), // %3
903 "m"(kAddY16) // %4
904 : "memory", "cc"
905#if defined(__SSE2__)
906 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000907#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000908 );
909}
910
911void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000912 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000913 "movdqa %4,%%xmm5 \n"
914 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000915 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000916 "1: \n"
917 "movdqu (%0),%%xmm0 \n"
918 "movdqu 0x10(%0),%%xmm1 \n"
919 "movdqu 0x20(%0),%%xmm2 \n"
920 "movdqu 0x30(%0),%%xmm3 \n"
921 "pmaddubsw %%xmm4,%%xmm0 \n"
922 "pmaddubsw %%xmm4,%%xmm1 \n"
923 "pmaddubsw %%xmm4,%%xmm2 \n"
924 "pmaddubsw %%xmm4,%%xmm3 \n"
925 "lea 0x40(%0),%0 \n"
926 "phaddw %%xmm1,%%xmm0 \n"
927 "phaddw %%xmm3,%%xmm2 \n"
928 "psrlw $0x7,%%xmm0 \n"
929 "psrlw $0x7,%%xmm2 \n"
930 "packuswb %%xmm2,%%xmm0 \n"
931 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000932 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000933 "movdqu %%xmm0,(%1) \n"
934 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000935 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000936 : "+r"(src_bgra), // %0
937 "+r"(dst_y), // %1
938 "+r"(pix) // %2
939 : "m"(kBGRAToY), // %3
940 "m"(kAddY16) // %4
941 : "memory", "cc"
942#if defined(__SSE2__)
943 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
944#endif
945 );
946}
947
948void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
949 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000950 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000951 "movdqa %0,%%xmm4 \n"
952 "movdqa %1,%%xmm3 \n"
953 "movdqa %2,%%xmm5 \n"
954 :
955 : "m"(kBGRAToU), // %0
956 "m"(kBGRAToV), // %1
957 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000958 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000959 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000960 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000961 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000962 "1: \n"
963 "movdqa (%0),%%xmm0 \n"
964 "movdqa 0x10(%0),%%xmm1 \n"
965 "movdqa 0x20(%0),%%xmm2 \n"
966 "movdqa 0x30(%0),%%xmm6 \n"
967 "pavgb (%0,%4,1),%%xmm0 \n"
968 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
969 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
970 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
971 "lea 0x40(%0),%0 \n"
972 "movdqa %%xmm0,%%xmm7 \n"
973 "shufps $0x88,%%xmm1,%%xmm0 \n"
974 "shufps $0xdd,%%xmm1,%%xmm7 \n"
975 "pavgb %%xmm7,%%xmm0 \n"
976 "movdqa %%xmm2,%%xmm7 \n"
977 "shufps $0x88,%%xmm6,%%xmm2 \n"
978 "shufps $0xdd,%%xmm6,%%xmm7 \n"
979 "pavgb %%xmm7,%%xmm2 \n"
980 "movdqa %%xmm0,%%xmm1 \n"
981 "movdqa %%xmm2,%%xmm6 \n"
982 "pmaddubsw %%xmm4,%%xmm0 \n"
983 "pmaddubsw %%xmm4,%%xmm2 \n"
984 "pmaddubsw %%xmm3,%%xmm1 \n"
985 "pmaddubsw %%xmm3,%%xmm6 \n"
986 "phaddw %%xmm2,%%xmm0 \n"
987 "phaddw %%xmm6,%%xmm1 \n"
988 "psraw $0x8,%%xmm0 \n"
989 "psraw $0x8,%%xmm1 \n"
990 "packsswb %%xmm1,%%xmm0 \n"
991 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000992 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000993 "movlps %%xmm0,(%1) \n"
994 "movhps %%xmm0,(%1,%2,1) \n"
995 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000996 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000997 : "+r"(src_bgra0), // %0
998 "+r"(dst_u), // %1
999 "+r"(dst_v), // %2
1000 "+rm"(width) // %3
1001 : "r"(static_cast<intptr_t>(src_stride_bgra))
1002 : "memory", "cc"
1003#if defined(__SSE2__)
1004 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1005#endif
1006 );
1007}
1008
1009void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1010 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001011 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001012 "movdqa %0,%%xmm4 \n"
1013 "movdqa %1,%%xmm3 \n"
1014 "movdqa %2,%%xmm5 \n"
1015 :
1016 : "m"(kBGRAToU), // %0
1017 "m"(kBGRAToV), // %1
1018 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001019 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001020 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001021 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001022 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001023 "1: \n"
1024 "movdqu (%0),%%xmm0 \n"
1025 "movdqu 0x10(%0),%%xmm1 \n"
1026 "movdqu 0x20(%0),%%xmm2 \n"
1027 "movdqu 0x30(%0),%%xmm6 \n"
1028 "movdqu (%0,%4,1),%%xmm7 \n"
1029 "pavgb %%xmm7,%%xmm0 \n"
1030 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1031 "pavgb %%xmm7,%%xmm1 \n"
1032 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1033 "pavgb %%xmm7,%%xmm2 \n"
1034 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1035 "pavgb %%xmm7,%%xmm6 \n"
1036 "lea 0x40(%0),%0 \n"
1037 "movdqa %%xmm0,%%xmm7 \n"
1038 "shufps $0x88,%%xmm1,%%xmm0 \n"
1039 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1040 "pavgb %%xmm7,%%xmm0 \n"
1041 "movdqa %%xmm2,%%xmm7 \n"
1042 "shufps $0x88,%%xmm6,%%xmm2 \n"
1043 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1044 "pavgb %%xmm7,%%xmm2 \n"
1045 "movdqa %%xmm0,%%xmm1 \n"
1046 "movdqa %%xmm2,%%xmm6 \n"
1047 "pmaddubsw %%xmm4,%%xmm0 \n"
1048 "pmaddubsw %%xmm4,%%xmm2 \n"
1049 "pmaddubsw %%xmm3,%%xmm1 \n"
1050 "pmaddubsw %%xmm3,%%xmm6 \n"
1051 "phaddw %%xmm2,%%xmm0 \n"
1052 "phaddw %%xmm6,%%xmm1 \n"
1053 "psraw $0x8,%%xmm0 \n"
1054 "psraw $0x8,%%xmm1 \n"
1055 "packsswb %%xmm1,%%xmm0 \n"
1056 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001057 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001058 "movlps %%xmm0,(%1) \n"
1059 "movhps %%xmm0,(%1,%2,1) \n"
1060 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001061 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001062 : "+r"(src_bgra0), // %0
1063 "+r"(dst_u), // %1
1064 "+r"(dst_v), // %2
1065 "+rm"(width) // %3
1066 : "r"(static_cast<intptr_t>(src_stride_bgra))
1067 : "memory", "cc"
1068#if defined(__SSE2__)
1069 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1070#endif
1071 );
1072}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001073
1074void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001075 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001076 "movdqa %4,%%xmm5 \n"
1077 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001078 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001079 "1: \n"
1080 "movdqa (%0),%%xmm0 \n"
1081 "movdqa 0x10(%0),%%xmm1 \n"
1082 "movdqa 0x20(%0),%%xmm2 \n"
1083 "movdqa 0x30(%0),%%xmm3 \n"
1084 "pmaddubsw %%xmm4,%%xmm0 \n"
1085 "pmaddubsw %%xmm4,%%xmm1 \n"
1086 "pmaddubsw %%xmm4,%%xmm2 \n"
1087 "pmaddubsw %%xmm4,%%xmm3 \n"
1088 "lea 0x40(%0),%0 \n"
1089 "phaddw %%xmm1,%%xmm0 \n"
1090 "phaddw %%xmm3,%%xmm2 \n"
1091 "psrlw $0x7,%%xmm0 \n"
1092 "psrlw $0x7,%%xmm2 \n"
1093 "packuswb %%xmm2,%%xmm0 \n"
1094 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001095 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001096 "movdqa %%xmm0,(%1) \n"
1097 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001098 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001099 : "+r"(src_abgr), // %0
1100 "+r"(dst_y), // %1
1101 "+r"(pix) // %2
1102 : "m"(kABGRToY), // %3
1103 "m"(kAddY16) // %4
1104 : "memory", "cc"
1105#if defined(__SSE2__)
1106 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1107#endif
1108 );
1109}
1110
1111void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001112 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001113 "movdqa %4,%%xmm5 \n"
1114 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001115 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001116 "1: \n"
1117 "movdqu (%0),%%xmm0 \n"
1118 "movdqu 0x10(%0),%%xmm1 \n"
1119 "movdqu 0x20(%0),%%xmm2 \n"
1120 "movdqu 0x30(%0),%%xmm3 \n"
1121 "pmaddubsw %%xmm4,%%xmm0 \n"
1122 "pmaddubsw %%xmm4,%%xmm1 \n"
1123 "pmaddubsw %%xmm4,%%xmm2 \n"
1124 "pmaddubsw %%xmm4,%%xmm3 \n"
1125 "lea 0x40(%0),%0 \n"
1126 "phaddw %%xmm1,%%xmm0 \n"
1127 "phaddw %%xmm3,%%xmm2 \n"
1128 "psrlw $0x7,%%xmm0 \n"
1129 "psrlw $0x7,%%xmm2 \n"
1130 "packuswb %%xmm2,%%xmm0 \n"
1131 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001132 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001133 "movdqu %%xmm0,(%1) \n"
1134 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001135 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001136 : "+r"(src_abgr), // %0
1137 "+r"(dst_y), // %1
1138 "+r"(pix) // %2
1139 : "m"(kABGRToY), // %3
1140 "m"(kAddY16) // %4
1141 : "memory", "cc"
1142#if defined(__SSE2__)
1143 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1144#endif
1145 );
1146}
1147
1148void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1149 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001150 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001151 "movdqa %0,%%xmm4 \n"
1152 "movdqa %1,%%xmm3 \n"
1153 "movdqa %2,%%xmm5 \n"
1154 :
1155 : "m"(kABGRToU), // %0
1156 "m"(kABGRToV), // %1
1157 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001158 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001159 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001160 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001161 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001162 "1: \n"
1163 "movdqa (%0),%%xmm0 \n"
1164 "movdqa 0x10(%0),%%xmm1 \n"
1165 "movdqa 0x20(%0),%%xmm2 \n"
1166 "movdqa 0x30(%0),%%xmm6 \n"
1167 "pavgb (%0,%4,1),%%xmm0 \n"
1168 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1169 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1170 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1171 "lea 0x40(%0),%0 \n"
1172 "movdqa %%xmm0,%%xmm7 \n"
1173 "shufps $0x88,%%xmm1,%%xmm0 \n"
1174 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1175 "pavgb %%xmm7,%%xmm0 \n"
1176 "movdqa %%xmm2,%%xmm7 \n"
1177 "shufps $0x88,%%xmm6,%%xmm2 \n"
1178 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1179 "pavgb %%xmm7,%%xmm2 \n"
1180 "movdqa %%xmm0,%%xmm1 \n"
1181 "movdqa %%xmm2,%%xmm6 \n"
1182 "pmaddubsw %%xmm4,%%xmm0 \n"
1183 "pmaddubsw %%xmm4,%%xmm2 \n"
1184 "pmaddubsw %%xmm3,%%xmm1 \n"
1185 "pmaddubsw %%xmm3,%%xmm6 \n"
1186 "phaddw %%xmm2,%%xmm0 \n"
1187 "phaddw %%xmm6,%%xmm1 \n"
1188 "psraw $0x8,%%xmm0 \n"
1189 "psraw $0x8,%%xmm1 \n"
1190 "packsswb %%xmm1,%%xmm0 \n"
1191 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001192 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001193 "movlps %%xmm0,(%1) \n"
1194 "movhps %%xmm0,(%1,%2,1) \n"
1195 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001196 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001197 : "+r"(src_abgr0), // %0
1198 "+r"(dst_u), // %1
1199 "+r"(dst_v), // %2
1200 "+rm"(width) // %3
1201 : "r"(static_cast<intptr_t>(src_stride_abgr))
1202 : "memory", "cc"
1203#if defined(__SSE2__)
1204 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1205#endif
1206 );
1207}
1208
1209void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1210 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001211 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001212 "movdqa %0,%%xmm4 \n"
1213 "movdqa %1,%%xmm3 \n"
1214 "movdqa %2,%%xmm5 \n"
1215 :
1216 : "m"(kABGRToU), // %0
1217 "m"(kABGRToV), // %1
1218 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001219 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001220 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001221 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001222 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001223 "1: \n"
1224 "movdqu (%0),%%xmm0 \n"
1225 "movdqu 0x10(%0),%%xmm1 \n"
1226 "movdqu 0x20(%0),%%xmm2 \n"
1227 "movdqu 0x30(%0),%%xmm6 \n"
1228 "movdqu (%0,%4,1),%%xmm7 \n"
1229 "pavgb %%xmm7,%%xmm0 \n"
1230 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1231 "pavgb %%xmm7,%%xmm1 \n"
1232 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1233 "pavgb %%xmm7,%%xmm2 \n"
1234 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1235 "pavgb %%xmm7,%%xmm6 \n"
1236 "lea 0x40(%0),%0 \n"
1237 "movdqa %%xmm0,%%xmm7 \n"
1238 "shufps $0x88,%%xmm1,%%xmm0 \n"
1239 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1240 "pavgb %%xmm7,%%xmm0 \n"
1241 "movdqa %%xmm2,%%xmm7 \n"
1242 "shufps $0x88,%%xmm6,%%xmm2 \n"
1243 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1244 "pavgb %%xmm7,%%xmm2 \n"
1245 "movdqa %%xmm0,%%xmm1 \n"
1246 "movdqa %%xmm2,%%xmm6 \n"
1247 "pmaddubsw %%xmm4,%%xmm0 \n"
1248 "pmaddubsw %%xmm4,%%xmm2 \n"
1249 "pmaddubsw %%xmm3,%%xmm1 \n"
1250 "pmaddubsw %%xmm3,%%xmm6 \n"
1251 "phaddw %%xmm2,%%xmm0 \n"
1252 "phaddw %%xmm6,%%xmm1 \n"
1253 "psraw $0x8,%%xmm0 \n"
1254 "psraw $0x8,%%xmm1 \n"
1255 "packsswb %%xmm1,%%xmm0 \n"
1256 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001257 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001258 "movlps %%xmm0,(%1) \n"
1259 "movhps %%xmm0,(%1,%2,1) \n"
1260 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001261 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001262 : "+r"(src_abgr0), // %0
1263 "+r"(dst_u), // %1
1264 "+r"(dst_v), // %2
1265 "+rm"(width) // %3
1266 : "r"(static_cast<intptr_t>(src_stride_abgr))
1267 : "memory", "cc"
1268#if defined(__SSE2__)
1269 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1270#endif
1271 );
1272}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001273#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001274
fbarchard@google.come214fe32012-06-04 23:47:11 +00001275#ifdef HAS_I422TOARGBROW_SSSE3
// Fixed-point (scaled by 64) chroma-to-RGB coefficients, stored as int8.
// 2.018 * 64 = 129 does not fit in int8, so UB is clamped to 127.
#define UB 127 /* min(127,static_cast<int8>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the per-channel coefficient sum multiplied by 128.
// Parenthesized so each macro expands to a single value in any expression.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

// Fixed-point (scaled by 64) luma gain.
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001290
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001291struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001292 vec8 kUVToB; // 0
1293 vec8 kUVToG; // 16
1294 vec8 kUVToR; // 32
1295 vec16 kUVBiasB; // 48
1296 vec16 kUVBiasG; // 64
1297 vec16 kUVBiasR; // 80
1298 vec16 kYSub16; // 96
1299 vec16 kYToRgb; // 112
1300 vec8 kVUToB; // 128
1301 vec8 kVUToG; // 144
1302 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001303} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001304 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1305 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1306 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1307 { BB, BB, BB, BB, BB, BB, BB, BB },
1308 { BG, BG, BG, BG, BG, BG, BG, BG },
1309 { BR, BR, BR, BR, BR, BR, BR, BR },
1310 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001311 { YG, YG, YG, YG, YG, YG, YG, YG },
1312 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1313 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1314 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001315};
1316
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001317
// Read 8 UV from 444.
// Loads 8 bytes from u_buf and 8 bytes from u_buf + v_buf (v_buf holds the
// V-minus-U pointer difference), advances u_buf by 8 and interleaves the two
// into 8 U,V byte pairs in xmm0. Clobbers xmm1.
#define READYUV444 \
    "movq       (%[u_buf]),%%xmm0              \n" \
    "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n" \
    "lea        0x8(%[u_buf]),%[u_buf]         \n" \
    "punpcklbw  %%xmm1,%%xmm0                  \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001324
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 bytes from u_buf and 4 bytes from u_buf + v_buf (v_buf holds the
// V-minus-U pointer difference), advances u_buf by 4, interleaves them into
// U,V byte pairs and duplicates each pair, leaving 8 pairs in xmm0.
// Clobbers xmm1.
#define READYUV422 \
    "movd       (%[u_buf]),%%xmm0              \n" \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n" \
    "lea        0x4(%[u_buf]),%[u_buf]         \n" \
    "punpcklbw  %%xmm1,%%xmm0                  \n" \
    "punpcklwd  %%xmm0,%%xmm0                  \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001332
// Read 2 UV from 411, upsample to 8 UV.
// Interleaves U and V bytes into pairs, then duplicates each of the first
// two pairs four times, leaving 8 pairs in xmm0. u_buf advances by 2 and
// v_buf holds the V-minus-U pointer difference. Clobbers xmm1.
// NOTE(review): movd loads 4 bytes from each plane although only the first 2
// are used; confirm callers tolerate the 2-byte over-read at the end of a row.
#define READYUV411 \
    "movd       (%[u_buf]),%%xmm0              \n" \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n" \
    "lea        0x2(%[u_buf]),%[u_buf]         \n" \
    "punpcklbw  %%xmm1,%%xmm0                  \n" \
    "punpcklwd  %%xmm0,%%xmm0                  \n" \
    "punpckldq  %%xmm0,%%xmm0                  \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001341
// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 bytes (4 already-interleaved byte pairs) from uv_buf, advances
// uv_buf by 8 and duplicates each pair, leaving 8 pairs in xmm0.
#define READNV12 \
    "movq       (%[uv_buf]),%%xmm0             \n" \
    "lea        0x8(%[uv_buf]),%[uv_buf]       \n" \
    "punpcklwd  %%xmm0,%%xmm0                  \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001347
// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 interleaved U,V byte pairs, xmm4 = zero,
//      %[y_buf] = luma pointer, %[kYuvConstants] = constant table base.
// Out: xmm0, xmm1, xmm2 = the results for the kUVToB, kUVToG and kUVToR
//      tables respectively, packed to unsigned bytes; %[y_buf] advanced by 8.
// Clobbers xmm3.
#define YUVTORGB \
    "movdqa     %%xmm0,%%xmm1                  \n" \
    "movdqa     %%xmm0,%%xmm2                  \n" \
    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n" \
    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n" \
    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n" \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n" \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n" \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n" \
    "movq       (%[y_buf]),%%xmm3              \n" \
    "lea        0x8(%[y_buf]),%[y_buf]         \n" \
    "punpcklbw  %%xmm4,%%xmm3                  \n" \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n" \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n" \
    "paddsw     %%xmm3,%%xmm0                  \n" \
    "paddsw     %%xmm3,%%xmm1                  \n" \
    "paddsw     %%xmm3,%%xmm2                  \n" \
    "psraw      $0x6,%%xmm0                    \n" \
    "psraw      $0x6,%%xmm1                    \n" \
    "psraw      $0x6,%%xmm2                    \n" \
    "packuswb   %%xmm0,%%xmm0                  \n" \
    "packuswb   %%xmm1,%%xmm1                  \n" \
    "packuswb   %%xmm2,%%xmm2                  \n"
1372
// Convert 8 pixels: 8 VU and 8 Y.
// Same sequence as YUVTORGB, but the coefficient multiply uses the kVUToB,
// kVUToG and kVUToR tables (displacements 128, 144, 160) for V,U byte order.
// In:  xmm0 = 8 interleaved V,U byte pairs, xmm4 = zero,
//      %[y_buf] = luma pointer, %[kYuvConstants] = constant table base.
// Out: xmm0, xmm1, xmm2 = the results for the kVUToB, kVUToG and kVUToR
//      tables respectively, packed to unsigned bytes; %[y_buf] advanced by 8.
// Clobbers xmm3.
#define YVUTORGB \
    "movdqa     %%xmm0,%%xmm1                  \n" \
    "movdqa     %%xmm0,%%xmm2                  \n" \
    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n" \
    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n" \
    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n" \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n" \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n" \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n" \
    "movq       (%[y_buf]),%%xmm3              \n" \
    "lea        0x8(%[y_buf]),%[y_buf]         \n" \
    "punpcklbw  %%xmm4,%%xmm3                  \n" \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n" \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n" \
    "paddsw     %%xmm3,%%xmm0                  \n" \
    "paddsw     %%xmm3,%%xmm1                  \n" \
    "paddsw     %%xmm3,%%xmm2                  \n" \
    "psraw      $0x6,%%xmm0                    \n" \
    "psraw      $0x6,%%xmm1                    \n" \
    "psraw      $0x6,%%xmm2                    \n" \
    "packuswb   %%xmm0,%%xmm0                  \n" \
    "packuswb   %%xmm1,%%xmm1                  \n" \
    "packuswb   %%xmm2,%%xmm2                  \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001397
1398void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001399 const uint8* u_buf,
1400 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001401 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001402 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001403 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001404 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001405 "pcmpeqb %%xmm5,%%xmm5 \n"
1406 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001407 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001408 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001409 READYUV444
1410 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001411 "punpcklbw %%xmm1,%%xmm0 \n"
1412 "punpcklbw %%xmm5,%%xmm2 \n"
1413 "movdqa %%xmm0,%%xmm1 \n"
1414 "punpcklwd %%xmm2,%%xmm0 \n"
1415 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001416 "movdqa %%xmm0,(%[argb_buf]) \n"
1417 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1418 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1419 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001420 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001421 : [y_buf]"+r"(y_buf), // %[y_buf]
1422 [u_buf]"+r"(u_buf), // %[u_buf]
1423 [v_buf]"+r"(v_buf), // %[v_buf]
1424 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1425 [width]"+rm"(width) // %[width]
1426 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001427 : "memory", "cc"
1428#if defined(__SSE2__)
1429 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1430#endif
1431 );
1432}
1433
fbarchard@google.come214fe32012-06-04 23:47:11 +00001434void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001435 const uint8* u_buf,
1436 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001437 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001438 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001439 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001440 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001441 "pcmpeqb %%xmm5,%%xmm5 \n"
1442 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001443 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001444 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001445 READYUV422
1446 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001447 "punpcklbw %%xmm1,%%xmm0 \n"
1448 "punpcklbw %%xmm5,%%xmm2 \n"
1449 "movdqa %%xmm0,%%xmm1 \n"
1450 "punpcklwd %%xmm2,%%xmm0 \n"
1451 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001452 "movdqa %%xmm0,(%[argb_buf]) \n"
1453 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1454 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1455 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001456 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001457 : [y_buf]"+r"(y_buf), // %[y_buf]
1458 [u_buf]"+r"(u_buf), // %[u_buf]
1459 [v_buf]"+r"(v_buf), // %[v_buf]
1460 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1461 [width]"+rm"(width) // %[width]
1462 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001463 : "memory", "cc"
1464#if defined(__SSE2__)
1465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1466#endif
1467 );
1468}
1469
1470void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1471 const uint8* u_buf,
1472 const uint8* v_buf,
1473 uint8* argb_buf,
1474 int width) {
1475 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001476 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001477 "pcmpeqb %%xmm5,%%xmm5 \n"
1478 "pxor %%xmm4,%%xmm4 \n"
1479 ".p2align 4 \n"
1480 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001481 READYUV411
1482 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001483 "punpcklbw %%xmm1,%%xmm0 \n"
1484 "punpcklbw %%xmm5,%%xmm2 \n"
1485 "movdqa %%xmm0,%%xmm1 \n"
1486 "punpcklwd %%xmm2,%%xmm0 \n"
1487 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001488 "movdqa %%xmm0,(%[argb_buf]) \n"
1489 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1490 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1491 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001492 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001493 : [y_buf]"+r"(y_buf), // %[y_buf]
1494 [u_buf]"+r"(u_buf), // %[u_buf]
1495 [v_buf]"+r"(v_buf), // %[v_buf]
1496 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1497 [width]"+rm"(width) // %[width]
1498 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1499 : "memory", "cc"
1500#if defined(__SSE2__)
1501 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1502#endif
1503 );
1504}
1505
1506void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1507 const uint8* uv_buf,
1508 uint8* argb_buf,
1509 int width) {
1510 asm volatile (
1511 "pcmpeqb %%xmm5,%%xmm5 \n"
1512 "pxor %%xmm4,%%xmm4 \n"
1513 ".p2align 4 \n"
1514 "1: \n"
1515 READNV12
1516 YUVTORGB
1517 "punpcklbw %%xmm1,%%xmm0 \n"
1518 "punpcklbw %%xmm5,%%xmm2 \n"
1519 "movdqa %%xmm0,%%xmm1 \n"
1520 "punpcklwd %%xmm2,%%xmm0 \n"
1521 "punpckhwd %%xmm2,%%xmm1 \n"
1522 "movdqa %%xmm0,(%[argb_buf]) \n"
1523 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1524 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1525 "sub $0x8,%[width] \n"
1526 "jg 1b \n"
1527 : [y_buf]"+r"(y_buf), // %[y_buf]
1528 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1529 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1530 [width]"+rm"(width) // %[width]
1531 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1532 : "memory", "cc"
1533#if defined(__SSE2__)
1534 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1535#endif
1536 );
1537}
1538
1539void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1540 const uint8* vu_buf,
1541 uint8* argb_buf,
1542 int width) {
1543 asm volatile (
1544 "pcmpeqb %%xmm5,%%xmm5 \n"
1545 "pxor %%xmm4,%%xmm4 \n"
1546 ".p2align 4 \n"
1547 "1: \n"
1548 READNV12
1549 YVUTORGB
1550 "punpcklbw %%xmm1,%%xmm0 \n"
1551 "punpcklbw %%xmm5,%%xmm2 \n"
1552 "movdqa %%xmm0,%%xmm1 \n"
1553 "punpcklwd %%xmm2,%%xmm0 \n"
1554 "punpckhwd %%xmm2,%%xmm1 \n"
1555 "movdqa %%xmm0,(%[argb_buf]) \n"
1556 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1557 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1558 "sub $0x8,%[width] \n"
1559 "jg 1b \n"
1560 : [y_buf]"+r"(y_buf), // %[y_buf]
1561 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1562 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1563 [width]"+rm"(width) // %[width]
1564 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001565 : "memory", "cc"
1566#if defined(__SSE2__)
1567 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1568#endif
1569 );
1570}
1571
1572void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1573 const uint8* u_buf,
1574 const uint8* v_buf,
1575 uint8* argb_buf,
1576 int width) {
1577 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001578 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001579 "pcmpeqb %%xmm5,%%xmm5 \n"
1580 "pxor %%xmm4,%%xmm4 \n"
1581 ".p2align 4 \n"
1582 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001583 READYUV444
1584 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001585 "punpcklbw %%xmm1,%%xmm0 \n"
1586 "punpcklbw %%xmm5,%%xmm2 \n"
1587 "movdqa %%xmm0,%%xmm1 \n"
1588 "punpcklwd %%xmm2,%%xmm0 \n"
1589 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001590 "movdqu %%xmm0,(%[argb_buf]) \n"
1591 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1592 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1593 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001594 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001595 : [y_buf]"+r"(y_buf), // %[y_buf]
1596 [u_buf]"+r"(u_buf), // %[u_buf]
1597 [v_buf]"+r"(v_buf), // %[v_buf]
1598 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1599 [width]"+rm"(width) // %[width]
1600 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001601 : "memory", "cc"
1602#if defined(__SSE2__)
1603 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1604#endif
1605 );
1606}
1607
1608void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1609 const uint8* u_buf,
1610 const uint8* v_buf,
1611 uint8* argb_buf,
1612 int width) {
1613 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001614 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001615 "pcmpeqb %%xmm5,%%xmm5 \n"
1616 "pxor %%xmm4,%%xmm4 \n"
1617 ".p2align 4 \n"
1618 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001619 READYUV422
1620 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001621 "punpcklbw %%xmm1,%%xmm0 \n"
1622 "punpcklbw %%xmm5,%%xmm2 \n"
1623 "movdqa %%xmm0,%%xmm1 \n"
1624 "punpcklwd %%xmm2,%%xmm0 \n"
1625 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001626 "movdqu %%xmm0,(%[argb_buf]) \n"
1627 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1628 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1629 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001630 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001631 : [y_buf]"+r"(y_buf), // %[y_buf]
1632 [u_buf]"+r"(u_buf), // %[u_buf]
1633 [v_buf]"+r"(v_buf), // %[v_buf]
1634 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1635 [width]"+rm"(width) // %[width]
1636 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001637 : "memory", "cc"
1638#if defined(__SSE2__)
1639 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1640#endif
1641 );
1642}
1643
1644void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1645 const uint8* u_buf,
1646 const uint8* v_buf,
1647 uint8* argb_buf,
1648 int width) {
1649 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001650 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001651 "pcmpeqb %%xmm5,%%xmm5 \n"
1652 "pxor %%xmm4,%%xmm4 \n"
1653 ".p2align 4 \n"
1654 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001655 READYUV411
1656 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001657 "punpcklbw %%xmm1,%%xmm0 \n"
1658 "punpcklbw %%xmm5,%%xmm2 \n"
1659 "movdqa %%xmm0,%%xmm1 \n"
1660 "punpcklwd %%xmm2,%%xmm0 \n"
1661 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001662 "movdqu %%xmm0,(%[argb_buf]) \n"
1663 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1664 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1665 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001666 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001667 : [y_buf]"+r"(y_buf), // %[y_buf]
1668 [u_buf]"+r"(u_buf), // %[u_buf]
1669 [v_buf]"+r"(v_buf), // %[v_buf]
1670 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1671 [width]"+rm"(width) // %[width]
1672 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1673 : "memory", "cc"
1674#if defined(__SSE2__)
1675 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1676#endif
1677 );
1678}
1679
1680void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1681 const uint8* uv_buf,
1682 uint8* argb_buf,
1683 int width) {
1684 asm volatile (
1685 "pcmpeqb %%xmm5,%%xmm5 \n"
1686 "pxor %%xmm4,%%xmm4 \n"
1687 ".p2align 4 \n"
1688 "1: \n"
1689 READNV12
1690 YUVTORGB
1691 "punpcklbw %%xmm1,%%xmm0 \n"
1692 "punpcklbw %%xmm5,%%xmm2 \n"
1693 "movdqa %%xmm0,%%xmm1 \n"
1694 "punpcklwd %%xmm2,%%xmm0 \n"
1695 "punpckhwd %%xmm2,%%xmm1 \n"
1696 "movdqu %%xmm0,(%[argb_buf]) \n"
1697 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1698 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1699 "sub $0x8,%[width] \n"
1700 "jg 1b \n"
1701 : [y_buf]"+r"(y_buf), // %[y_buf]
1702 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1703 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1704 [width]"+rm"(width) // %[width]
1705 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1706 : "memory", "cc"
1707#if defined(__SSE2__)
1708 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1709#endif
1710 );
1711}
1712
1713void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1714 const uint8* vu_buf,
1715 uint8* argb_buf,
1716 int width) {
1717 asm volatile (
1718 "pcmpeqb %%xmm5,%%xmm5 \n"
1719 "pxor %%xmm4,%%xmm4 \n"
1720 ".p2align 4 \n"
1721 "1: \n"
1722 READNV12
1723 YVUTORGB
1724 "punpcklbw %%xmm1,%%xmm0 \n"
1725 "punpcklbw %%xmm5,%%xmm2 \n"
1726 "movdqa %%xmm0,%%xmm1 \n"
1727 "punpcklwd %%xmm2,%%xmm0 \n"
1728 "punpckhwd %%xmm2,%%xmm1 \n"
1729 "movdqu %%xmm0,(%[argb_buf]) \n"
1730 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1731 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1732 "sub $0x8,%[width] \n"
1733 "jg 1b \n"
1734 : [y_buf]"+r"(y_buf), // %[y_buf]
1735 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1736 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1737 [width]"+rm"(width) // %[width]
1738 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001739 : "memory", "cc"
1740#if defined(__SSE2__)
1741 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1742#endif
1743 );
1744}
1745
1746void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1747 const uint8* u_buf,
1748 const uint8* v_buf,
1749 uint8* bgra_buf,
1750 int width) {
1751 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001752 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001753 "pcmpeqb %%xmm5,%%xmm5 \n"
1754 "pxor %%xmm4,%%xmm4 \n"
1755 ".p2align 4 \n"
1756 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001757 READYUV422
1758 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001759 "pcmpeqb %%xmm5,%%xmm5 \n"
1760 "punpcklbw %%xmm0,%%xmm1 \n"
1761 "punpcklbw %%xmm2,%%xmm5 \n"
1762 "movdqa %%xmm5,%%xmm0 \n"
1763 "punpcklwd %%xmm1,%%xmm5 \n"
1764 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001765 "movdqa %%xmm5,(%[argb_buf]) \n"
1766 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1767 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1768 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001769 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001770 : [y_buf]"+r"(y_buf), // %[y_buf]
1771 [u_buf]"+r"(u_buf), // %[u_buf]
1772 [v_buf]"+r"(v_buf), // %[v_buf]
1773 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1774 [width]"+rm"(width) // %[width]
1775 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001776 : "memory", "cc"
1777#if defined(__SSE2__)
1778 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1779#endif
1780 );
1781}
1782
fbarchard@google.come214fe32012-06-04 23:47:11 +00001783void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001784 const uint8* u_buf,
1785 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001786 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001787 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001788 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001789 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001790 "pcmpeqb %%xmm5,%%xmm5 \n"
1791 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001792 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001793 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001794 READYUV422
1795 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001796 "punpcklbw %%xmm1,%%xmm2 \n"
1797 "punpcklbw %%xmm5,%%xmm0 \n"
1798 "movdqa %%xmm2,%%xmm1 \n"
1799 "punpcklwd %%xmm0,%%xmm2 \n"
1800 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001801 "movdqa %%xmm2,(%[argb_buf]) \n"
1802 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1803 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1804 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001805 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001806 : [y_buf]"+r"(y_buf), // %[y_buf]
1807 [u_buf]"+r"(u_buf), // %[u_buf]
1808 [v_buf]"+r"(v_buf), // %[v_buf]
1809 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1810 [width]"+rm"(width) // %[width]
1811 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001812 : "memory", "cc"
1813#if defined(__SSE2__)
1814 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1815#endif
1816 );
1817}
1818
fbarchard@google.come214fe32012-06-04 23:47:11 +00001819void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001820 const uint8* u_buf,
1821 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001822 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001823 int width) {
1824 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001825 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001826 "pcmpeqb %%xmm5,%%xmm5 \n"
1827 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001828 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001829 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001830 READYUV422
1831 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001832 "pcmpeqb %%xmm5,%%xmm5 \n"
1833 "punpcklbw %%xmm0,%%xmm1 \n"
1834 "punpcklbw %%xmm2,%%xmm5 \n"
1835 "movdqa %%xmm5,%%xmm0 \n"
1836 "punpcklwd %%xmm1,%%xmm5 \n"
1837 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001838 "movdqu %%xmm5,(%[argb_buf]) \n"
1839 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
1840 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1841 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001842 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001843 : [y_buf]"+r"(y_buf), // %[y_buf]
1844 [u_buf]"+r"(u_buf), // %[u_buf]
1845 [v_buf]"+r"(v_buf), // %[v_buf]
1846 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1847 [width]"+rm"(width) // %[width]
1848 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001849 : "memory", "cc"
1850#if defined(__SSE2__)
1851 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1852#endif
1853 );
1854}
1855
fbarchard@google.come214fe32012-06-04 23:47:11 +00001856void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001857 const uint8* u_buf,
1858 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001859 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001860 int width) {
1861 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001862 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001863 "pcmpeqb %%xmm5,%%xmm5 \n"
1864 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001865 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001866 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001867 READYUV422
1868 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001869 "punpcklbw %%xmm1,%%xmm2 \n"
1870 "punpcklbw %%xmm5,%%xmm0 \n"
1871 "movdqa %%xmm2,%%xmm1 \n"
1872 "punpcklwd %%xmm0,%%xmm2 \n"
1873 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001874 "movdqu %%xmm2,(%[argb_buf]) \n"
1875 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1876 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1877 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001878 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001879 : [y_buf]"+r"(y_buf), // %[y_buf]
1880 [u_buf]"+r"(u_buf), // %[u_buf]
1881 [v_buf]"+r"(v_buf), // %[v_buf]
1882 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1883 [width]"+rm"(width) // %[width]
1884 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001885 : "memory", "cc"
1886#if defined(__SSE2__)
1887 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1888#endif
1889 );
1890}
fbarchard@google.come214fe32012-06-04 23:47:11 +00001891#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001892
#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of 8-bit Y samples into 4-byte pixels whose first three bytes
// all carry the scaled luma value and whose fourth byte is 0xff.
//   y_buf:   source; 8 bytes are read per loop pass.
//   rgb_buf: destination; 32 bytes are written per pass with movdqa, so it
//            must be 16-byte aligned.
//   width:   pixel count; reduced by 8 per pass.
// The loop body runs before width is tested, so at least 8 pixels are always
// processed and a width that is not a multiple of 8 is effectively rounded up.
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  asm volatile (
    // xmm4 = 0xff000000 in every dword: the fourth-byte (alpha) mask.
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    // xmm3 = 0x1000 in every word: the bias of 16 in 8.8 fixed point.
    "mov $0x10001000,%%eax \n"
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    // xmm2 = 0x012a (298) in every word: the luma scale factor.
    "mov $0x012a012a,%%eax \n"
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    ".p2align 4 \n"
  "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    // Duplicating each byte yields y * 0x0101 per word; the subtraction
    // saturates at zero and pmulhuw keeps the high 16 bits of the product.
    "punpcklbw %%xmm0,%%xmm0 \n"
    "psubusw %%xmm3,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    // Replicate each value into all four bytes of its pixel, then force the
    // fourth byte to 0xff.
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "por %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,16(%1) \n"
    "lea 32(%1),%1 \n"

    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001940
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Writes the bytes of src to dst in reverse order, 16 bytes per loop pass.
// Reading starts at the last 16 bytes of src and walks backwards while dst is
// written forwards.
//   src:   source row; loaded with movdqa, so every 16-byte chunk addressed
//          (src + width - 16, src + width - 32, ...) must be 16-byte aligned.
//   dst:   destination row; stored with movdqa, so it must be 16-byte aligned.
//   width: byte count; reduced by 16 per pass.
// The loop body runs before width is tested, so at least 16 bytes are always
// processed; width is expected to be a positive multiple of 16.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size so it can serve as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    // xmm5 = byte-reversal shuffle mask.
    "movdqa %3,%%xmm5 \n"
    // Bias src so that (src, width) addresses the last unread 16-byte chunk.
    "lea -0x10(%0),%0 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0,%2),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    // The flags set here are still intact at the jg: the store and lea that
    // follow do not modify them.
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif  // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001971
#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of src to dst in reverse order, 16 bytes per loop pass,
// using only SSE2 shifts and shuffles. Loads and stores use movdqu, so neither
// pointer needs a particular alignment.
//   src:   source row; read backwards starting from its last 16 bytes.
//   dst:   destination row; written forwards.
//   width: byte count; reduced by 16 per pass.
// The loop body runs before width is tested, so at least 16 bytes are always
// processed; width is expected to be a positive multiple of 16.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size so it can serve as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    // Bias src so that (src, width) addresses the last unread 16-byte chunk.
    "lea -0x10(%0),%0 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0,%2),%%xmm0 \n"
    // Swap the two bytes inside every 16-bit word.
    "movdqa %%xmm0,%%xmm1 \n"
    "psllw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    // Reverse the four words in each 64-bit half, then swap the halves.
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufd $0x4e,%%xmm0,%%xmm0 \n"
    // The flags set here are still intact at the jg: the store and lea that
    // follow do not modify them.
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002002
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// The low 8 result bytes are the even source bytes in reverse order and the
// high 8 result bytes are the odd source bytes in reverse order.
CONST uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

// Mirrors a row of byte-interleaved pairs while splitting it into two planes:
// the even source bytes go to dst_u and the odd source bytes to dst_v, both in
// reversed order. Each loop pass consumes 16 source bytes and emits 8 bytes
// per destination.
//   src:   interleaved source; read backwards with movdqa starting at
//          src + 2 * width - 16, so those addresses must be 16-byte aligned.
//   dst_u: destination for the even bytes (8-byte stores).
//   dst_v: destination for the odd bytes (8-byte stores).
//   width: number of pairs; reduced by 8 per pass.
// The loop body runs before width is tested, so at least 8 pairs are always
// processed; width is expected to be a positive multiple of 8.
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  // Widened to pointer size so it can serve as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    // xmm1 = de-interleaving, reversing shuffle mask.
    "movdqa %4,%%xmm1 \n"
    // Point src at the last 16 bytes of the row.
    "lea -16(%0,%3,2),%0 \n"
    // Turn dst_v into an offset relative to dst_u.
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "lea -16(%0),%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    // The flags set here are still intact at the jg: the stores and lea that
    // follow do not modify them.
    "sub $8,%3 \n"
    "movlpd %%xmm0,(%1) \n"
    "movhpd %%xmm0,(%1,%2) \n"
    "lea 8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002037
#ifdef HAS_ARGBMIRRORROW_SSSE3
// Shuffle table for reversing the order of the four 4-byte pixels in a
// register while keeping the byte order inside each pixel.
CONST uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

// Writes the 4-byte pixels of src to dst in reverse pixel order, 4 pixels
// (16 bytes) per loop pass.
//   src:   source row; read backwards with movdqa starting at
//          src + 4 * width - 16, so those addresses must be 16-byte aligned.
//   dst:   destination row; stored with movdqa, so it must be 16-byte aligned.
//   width: pixel count; reduced by 4 per pass.
// The loop body runs before width is tested, so at least 4 pixels are always
// processed; width is expected to be a positive multiple of 4.
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size so it can serve as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    // xmm5 = pixel-reversal shuffle mask.
    "movdqa %3,%%xmm5 \n"
    // Bias src so that (src, width, 4) addresses the last unread 16 bytes.
    "lea -0x10(%0),%0 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0,%2,4),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    // The flags set here are still intact at the jg: the store and lea that
    // follow do not modify them.
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMIRRORROW_SSSE3
2068
#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of byte pairs: even source bytes go to dst_u and odd
// source bytes to dst_v. Each loop pass consumes 32 source bytes and emits
// 16 bytes per destination. All loads and stores use movdqa, so src_uv, dst_u
// and dst_v must be 16-byte aligned.
//   src_uv: interleaved source.
//   dst_u:  destination for the even bytes.
//   dst_v:  destination for the odd bytes.
//   pix:    number of pairs; reduced by 16 per pass.
// The loop body runs before pix is tested, so at least 16 pairs are always
// processed and a pix that is not a multiple of 16 is effectively rounded up.
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff in every word: keeps the even (low) byte of each pair.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset relative to dst_u.
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    // Even bytes -> xmm0.
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // Odd bytes -> xmm2.
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm2,(%1,%2) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002105
#ifdef HAS_COPYROW_SSE2
// Copies a row from src to dst, 32 bytes per loop pass. Loads and stores use
// movdqa, so both src and dst must be 16-byte aligned.
//   src:   source row.
//   dst:   destination row.
//   count: byte count; reduced by 32 per pass.
// The loop body runs before count is tested, so at least 32 bytes are always
// copied and a count that is not a multiple of 32 is effectively rounded up.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    // Turn dst into an offset relative to src so one pointer drives both.
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa %%xmm0,(%0,%1) \n"
    "movdqa %%xmm1,0x10(%0,%1) \n"
    "lea 0x20(%0),%0 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_COPYROW_SSE2
2130
#ifdef HAS_COPYROW_X86
// Copies a row from src to dst with `rep movsl`, i.e. in 4-byte units.
//   src:   source row (bound to the string-source register).
//   dst:   destination row (bound to the string-destination register).
//   width: byte count. It is divided by 4 before the copy, so the trailing
//          width % 4 bytes are not copied.
// NOTE(review): relies on the direction flag being clear on entry, as the
// platform ABI is expected to guarantee - confirm.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    // Convert the byte count into a dword count for rep movsl.
    "shr $0x2,%2 \n"
    "rep movsl \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
#endif  // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002145
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002146#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002147void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002148 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002149 "pcmpeqb %%xmm5,%%xmm5 \n"
2150 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002151 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002152 "1: \n"
2153 "movdqa (%0),%%xmm0 \n"
2154 "movdqa 0x10(%0),%%xmm1 \n"
2155 "lea 0x20(%0),%0 \n"
2156 "pand %%xmm5,%%xmm0 \n"
2157 "pand %%xmm5,%%xmm1 \n"
2158 "packuswb %%xmm1,%%xmm0 \n"
2159 "movdqa %%xmm0,(%1) \n"
2160 "lea 0x10(%1),%1 \n"
2161 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002162 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002163 : "+r"(src_yuy2), // %0
2164 "+r"(dst_y), // %1
2165 "+r"(pix) // %2
2166 :
2167 : "memory", "cc"
2168#if defined(__SSE2__)
2169 , "xmm0", "xmm1", "xmm5"
2170#endif
2171 );
2172}
2173
2174void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002175 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002176 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002177 "pcmpeqb %%xmm5,%%xmm5 \n"
2178 "psrlw $0x8,%%xmm5 \n"
2179 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002180 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002181 "1: \n"
2182 "movdqa (%0),%%xmm0 \n"
2183 "movdqa 0x10(%0),%%xmm1 \n"
2184 "movdqa (%0,%4,1),%%xmm2 \n"
2185 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2186 "lea 0x20(%0),%0 \n"
2187 "pavgb %%xmm2,%%xmm0 \n"
2188 "pavgb %%xmm3,%%xmm1 \n"
2189 "psrlw $0x8,%%xmm0 \n"
2190 "psrlw $0x8,%%xmm1 \n"
2191 "packuswb %%xmm1,%%xmm0 \n"
2192 "movdqa %%xmm0,%%xmm1 \n"
2193 "pand %%xmm5,%%xmm0 \n"
2194 "packuswb %%xmm0,%%xmm0 \n"
2195 "psrlw $0x8,%%xmm1 \n"
2196 "packuswb %%xmm1,%%xmm1 \n"
2197 "movq %%xmm0,(%1) \n"
2198 "movq %%xmm1,(%1,%2) \n"
2199 "lea 0x8(%1),%1 \n"
2200 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002201 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002202 : "+r"(src_yuy2), // %0
2203 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002204 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002205 "+r"(pix) // %3
2206 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2207 : "memory", "cc"
2208#if defined(__SSE2__)
2209 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2210#endif
2211 );
2212}
2213
fbarchard@google.comc704f782012-08-30 19:53:48 +00002214void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2215 uint8* dst_u, uint8* dst_v, int pix) {
2216 asm volatile (
2217 "pcmpeqb %%xmm5,%%xmm5 \n"
2218 "psrlw $0x8,%%xmm5 \n"
2219 "sub %1,%2 \n"
2220 ".p2align 4 \n"
2221 "1: \n"
2222 "movdqa (%0),%%xmm0 \n"
2223 "movdqa 0x10(%0),%%xmm1 \n"
2224 "lea 0x20(%0),%0 \n"
2225 "psrlw $0x8,%%xmm0 \n"
2226 "psrlw $0x8,%%xmm1 \n"
2227 "packuswb %%xmm1,%%xmm0 \n"
2228 "movdqa %%xmm0,%%xmm1 \n"
2229 "pand %%xmm5,%%xmm0 \n"
2230 "packuswb %%xmm0,%%xmm0 \n"
2231 "psrlw $0x8,%%xmm1 \n"
2232 "packuswb %%xmm1,%%xmm1 \n"
2233 "movq %%xmm0,(%1) \n"
2234 "movq %%xmm1,(%1,%2) \n"
2235 "lea 0x8(%1),%1 \n"
2236 "sub $0x10,%3 \n"
2237 "jg 1b \n"
2238 : "+r"(src_yuy2), // %0
2239 "+r"(dst_u), // %1
2240 "+r"(dst_v), // %2
2241 "+r"(pix) // %3
2242 :
2243 : "memory", "cc"
2244#if defined(__SSE2__)
2245 , "xmm0", "xmm1", "xmm5"
2246#endif
2247 );
2248}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002249
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002250void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2251 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002252 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002253 "pcmpeqb %%xmm5,%%xmm5 \n"
2254 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002255 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002256 "1: \n"
2257 "movdqu (%0),%%xmm0 \n"
2258 "movdqu 0x10(%0),%%xmm1 \n"
2259 "lea 0x20(%0),%0 \n"
2260 "pand %%xmm5,%%xmm0 \n"
2261 "pand %%xmm5,%%xmm1 \n"
2262 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002263 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002264 "movdqu %%xmm0,(%1) \n"
2265 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002266 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002267 : "+r"(src_yuy2), // %0
2268 "+r"(dst_y), // %1
2269 "+r"(pix) // %2
2270 :
2271 : "memory", "cc"
2272#if defined(__SSE2__)
2273 , "xmm0", "xmm1", "xmm5"
2274#endif
2275 );
2276}
2277
2278void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2279 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002280 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002281 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002282 "pcmpeqb %%xmm5,%%xmm5 \n"
2283 "psrlw $0x8,%%xmm5 \n"
2284 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002285 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002286 "1: \n"
2287 "movdqu (%0),%%xmm0 \n"
2288 "movdqu 0x10(%0),%%xmm1 \n"
2289 "movdqu (%0,%4,1),%%xmm2 \n"
2290 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2291 "lea 0x20(%0),%0 \n"
2292 "pavgb %%xmm2,%%xmm0 \n"
2293 "pavgb %%xmm3,%%xmm1 \n"
2294 "psrlw $0x8,%%xmm0 \n"
2295 "psrlw $0x8,%%xmm1 \n"
2296 "packuswb %%xmm1,%%xmm0 \n"
2297 "movdqa %%xmm0,%%xmm1 \n"
2298 "pand %%xmm5,%%xmm0 \n"
2299 "packuswb %%xmm0,%%xmm0 \n"
2300 "psrlw $0x8,%%xmm1 \n"
2301 "packuswb %%xmm1,%%xmm1 \n"
2302 "movq %%xmm0,(%1) \n"
2303 "movq %%xmm1,(%1,%2) \n"
2304 "lea 0x8(%1),%1 \n"
2305 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002306 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002307 : "+r"(src_yuy2), // %0
2308 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002309 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002310 "+r"(pix) // %3
2311 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2312 : "memory", "cc"
2313#if defined(__SSE2__)
2314 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2315#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002316 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002317}
2318
fbarchard@google.comc704f782012-08-30 19:53:48 +00002319void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2320 uint8* dst_u, uint8* dst_v, int pix) {
2321 asm volatile (
2322 "pcmpeqb %%xmm5,%%xmm5 \n"
2323 "psrlw $0x8,%%xmm5 \n"
2324 "sub %1,%2 \n"
2325 ".p2align 4 \n"
2326 "1: \n"
2327 "movdqu (%0),%%xmm0 \n"
2328 "movdqu 0x10(%0),%%xmm1 \n"
2329 "lea 0x20(%0),%0 \n"
2330 "psrlw $0x8,%%xmm0 \n"
2331 "psrlw $0x8,%%xmm1 \n"
2332 "packuswb %%xmm1,%%xmm0 \n"
2333 "movdqa %%xmm0,%%xmm1 \n"
2334 "pand %%xmm5,%%xmm0 \n"
2335 "packuswb %%xmm0,%%xmm0 \n"
2336 "psrlw $0x8,%%xmm1 \n"
2337 "packuswb %%xmm1,%%xmm1 \n"
2338 "movq %%xmm0,(%1) \n"
2339 "movq %%xmm1,(%1,%2) \n"
2340 "lea 0x8(%1),%1 \n"
2341 "sub $0x10,%3 \n"
2342 "jg 1b \n"
2343 : "+r"(src_yuy2), // %0
2344 "+r"(dst_u), // %1
2345 "+r"(dst_v), // %2
2346 "+r"(pix) // %3
2347 :
2348 : "memory", "cc"
2349#if defined(__SSE2__)
2350 , "xmm0", "xmm1", "xmm5"
2351#endif
2352 );
2353}
2354
// Extracts the luma of one packed UYVY row into dst_y.
// Each loop iteration reads 32 source bytes and writes 16 bytes (pix is
// decremented by 16). Both the loads and the store use movdqa, so src_uyvy
// and dst_y must be 16-byte aligned. The loop is bottom-tested (sub / jg), so
// it always runs at least once.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002355void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002356 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002357 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002358 "1: \n"
2359 "movdqa (%0),%%xmm0 \n"
2360 "movdqa 0x10(%0),%%xmm1 \n"
2361 "lea 0x20(%0),%0 \n"
// Keep the high byte of every 16-bit word (Y in U Y0 V Y1 order) and pack.
2362 "psrlw $0x8,%%xmm0 \n"
2363 "psrlw $0x8,%%xmm1 \n"
2364 "packuswb %%xmm1,%%xmm0 \n"
// The movdqa and lea that follow do not modify EFLAGS, so jg tests this sub.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002365 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002366 "movdqa %%xmm0,(%1) \n"
2367 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002368 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002369 : "+r"(src_uyvy), // %0
2370 "+r"(dst_y), // %1
2371 "+r"(pix) // %2
2372 :
2373 : "memory", "cc"
2374#if defined(__SSE2__)
2375 , "xmm0", "xmm1"
2376#endif
2377 );
2378}
2379
// Produces one U row and one V row from two adjacent packed UYVY rows.
// The row at src_uyvy and the row stride_uyvy bytes after it are averaged
// byte-wise (pavgb) before the chroma bytes are separated.
// Each loop iteration reads 32 bytes from each row (pix is decremented by 16)
// and writes 8 bytes to dst_u and 8 bytes to dst_v. Loads use movdqa, so both
// source rows must be 16-byte aligned. The loop always runs at least once.
2380void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002381 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002382 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (low-byte mask).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002383 "pcmpeqb %%xmm5,%%xmm5 \n"
2384 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so a single pointer update serves both.
2385 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002386 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002387 "1: \n"
2388 "movdqa (%0),%%xmm0 \n"
2389 "movdqa 0x10(%0),%%xmm1 \n"
2390 "movdqa (%0,%4,1),%%xmm2 \n"
2391 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2392 "lea 0x20(%0),%0 \n"
// Average the two rows, then keep the low byte of every word (U and V in
// U Y0 V Y1 order) and pack.
2393 "pavgb %%xmm2,%%xmm0 \n"
2394 "pavgb %%xmm3,%%xmm1 \n"
2395 "pand %%xmm5,%%xmm0 \n"
2396 "pand %%xmm5,%%xmm1 \n"
2397 "packuswb %%xmm1,%%xmm0 \n"
// Even bytes of the packed result go to dst_u, odd bytes to dst_v.
2398 "movdqa %%xmm0,%%xmm1 \n"
2399 "pand %%xmm5,%%xmm0 \n"
2400 "packuswb %%xmm0,%%xmm0 \n"
2401 "psrlw $0x8,%%xmm1 \n"
2402 "packuswb %%xmm1,%%xmm1 \n"
2403 "movq %%xmm0,(%1) \n"
2404 "movq %%xmm1,(%1,%2) \n"
2405 "lea 0x8(%1),%1 \n"
2406 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002407 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002408 : "+r"(src_uyvy), // %0
2409 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002410 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002411 "+r"(pix) // %3
2412 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2413 : "memory", "cc"
2414#if defined(__SSE2__)
2415 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2416#endif
2417 );
2418}
2419
// Splits the chroma of a single packed UYVY row into separate U and V rows
// (no second row is read, so there is no vertical averaging).
// Each loop iteration reads 32 source bytes (pix is decremented by 16) and
// writes 8 bytes to dst_u and 8 bytes to dst_v. Loads use movdqa, so src_uyvy
// must be 16-byte aligned. The loop always runs at least once.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002420void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2421 uint8* dst_u, uint8* dst_v, int pix) {
2422 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (low-byte mask).
2423 "pcmpeqb %%xmm5,%%xmm5 \n"
2424 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so a single pointer update serves both.
2425 "sub %1,%2 \n"
2426 ".p2align 4 \n"
2427 "1: \n"
2428 "movdqa (%0),%%xmm0 \n"
2429 "movdqa 0x10(%0),%%xmm1 \n"
2430 "lea 0x20(%0),%0 \n"
// Keep the low byte of every word (U and V in U Y0 V Y1 order) and pack.
2431 "pand %%xmm5,%%xmm0 \n"
2432 "pand %%xmm5,%%xmm1 \n"
2433 "packuswb %%xmm1,%%xmm0 \n"
// Even bytes of the packed result go to dst_u, odd bytes to dst_v.
2434 "movdqa %%xmm0,%%xmm1 \n"
2435 "pand %%xmm5,%%xmm0 \n"
2436 "packuswb %%xmm0,%%xmm0 \n"
2437 "psrlw $0x8,%%xmm1 \n"
2438 "packuswb %%xmm1,%%xmm1 \n"
2439 "movq %%xmm0,(%1) \n"
2440 "movq %%xmm1,(%1,%2) \n"
2441 "lea 0x8(%1),%1 \n"
2442 "sub $0x10,%3 \n"
2443 "jg 1b \n"
2444 : "+r"(src_uyvy), // %0
2445 "+r"(dst_u), // %1
2446 "+r"(dst_v), // %2
2447 "+r"(pix) // %3
2448 :
2449 : "memory", "cc"
2450#if defined(__SSE2__)
2451 , "xmm0", "xmm1", "xmm5"
2452#endif
2453 );
2454}
2455
// Extracts the luma of one packed UYVY row into dst_y, with no alignment
// requirement: both the loads and the store use movdqu.
// Each loop iteration reads 32 source bytes and writes 16 bytes (pix is
// decremented by 16). The loop is bottom-tested (sub / jg), so it always runs
// at least once.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002456void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2457 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002458 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002459 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002460 "1: \n"
2461 "movdqu (%0),%%xmm0 \n"
2462 "movdqu 0x10(%0),%%xmm1 \n"
2463 "lea 0x20(%0),%0 \n"
// Keep the high byte of every 16-bit word (Y in U Y0 V Y1 order) and pack.
2464 "psrlw $0x8,%%xmm0 \n"
2465 "psrlw $0x8,%%xmm1 \n"
2466 "packuswb %%xmm1,%%xmm0 \n"
// The movdqu and lea that follow do not modify EFLAGS, so jg tests this sub.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002467 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002468 "movdqu %%xmm0,(%1) \n"
2469 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002470 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002471 : "+r"(src_uyvy), // %0
2472 "+r"(dst_y), // %1
2473 "+r"(pix) // %2
2474 :
2475 : "memory", "cc"
2476#if defined(__SSE2__)
2477 , "xmm0", "xmm1"
2478#endif
2479 );
2480}
2481
// Produces one U row and one V row from two adjacent packed UYVY rows, with
// no alignment requirement on the source (all loads use movdqu).
// The row at src_uyvy and the row stride_uyvy bytes after it are averaged
// byte-wise (pavgb) before the chroma bytes are separated.
// Each loop iteration reads 32 bytes from each row (pix is decremented by 16)
// and writes 8 bytes to dst_u and 8 bytes to dst_v. The loop always runs at
// least once.
2482void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002483 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002484 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (low-byte mask).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002485 "pcmpeqb %%xmm5,%%xmm5 \n"
2486 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so a single pointer update serves both.
2487 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002488 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002489 "1: \n"
2490 "movdqu (%0),%%xmm0 \n"
2491 "movdqu 0x10(%0),%%xmm1 \n"
2492 "movdqu (%0,%4,1),%%xmm2 \n"
2493 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2494 "lea 0x20(%0),%0 \n"
// Average the two rows, then keep the low byte of every word (U and V in
// U Y0 V Y1 order) and pack.
2495 "pavgb %%xmm2,%%xmm0 \n"
2496 "pavgb %%xmm3,%%xmm1 \n"
2497 "pand %%xmm5,%%xmm0 \n"
2498 "pand %%xmm5,%%xmm1 \n"
2499 "packuswb %%xmm1,%%xmm0 \n"
// Even bytes of the packed result go to dst_u, odd bytes to dst_v.
2500 "movdqa %%xmm0,%%xmm1 \n"
2501 "pand %%xmm5,%%xmm0 \n"
2502 "packuswb %%xmm0,%%xmm0 \n"
2503 "psrlw $0x8,%%xmm1 \n"
2504 "packuswb %%xmm1,%%xmm1 \n"
2505 "movq %%xmm0,(%1) \n"
2506 "movq %%xmm1,(%1,%2) \n"
2507 "lea 0x8(%1),%1 \n"
2508 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002509 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002510 : "+r"(src_uyvy), // %0
2511 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002512 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002513 "+r"(pix) // %3
2514 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2515 : "memory", "cc"
2516#if defined(__SSE2__)
2517 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2518#endif
2519 );
2520}
fbarchard@google.comc704f782012-08-30 19:53:48 +00002521
// Splits the chroma of a single packed UYVY row into separate U and V rows
// (no vertical averaging), with no alignment requirement on the source (loads
// use movdqu).
// Each loop iteration reads 32 source bytes (pix is decremented by 16) and
// writes 8 bytes to dst_u and 8 bytes to dst_v. The loop always runs at least
// once.
2522void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
2523 uint8* dst_u, uint8* dst_v, int pix) {
2524 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (low-byte mask).
2525 "pcmpeqb %%xmm5,%%xmm5 \n"
2526 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so a single pointer update serves both.
2527 "sub %1,%2 \n"
2528 ".p2align 4 \n"
2529 "1: \n"
2530 "movdqu (%0),%%xmm0 \n"
2531 "movdqu 0x10(%0),%%xmm1 \n"
2532 "lea 0x20(%0),%0 \n"
// Keep the low byte of every word (U and V in U Y0 V Y1 order) and pack.
2533 "pand %%xmm5,%%xmm0 \n"
2534 "pand %%xmm5,%%xmm1 \n"
2535 "packuswb %%xmm1,%%xmm0 \n"
// Even bytes of the packed result go to dst_u, odd bytes to dst_v.
2536 "movdqa %%xmm0,%%xmm1 \n"
2537 "pand %%xmm5,%%xmm0 \n"
2538 "packuswb %%xmm0,%%xmm0 \n"
2539 "psrlw $0x8,%%xmm1 \n"
2540 "packuswb %%xmm1,%%xmm1 \n"
2541 "movq %%xmm0,(%1) \n"
2542 "movq %%xmm1,(%1,%2) \n"
2543 "lea 0x8(%1),%1 \n"
2544 "sub $0x10,%3 \n"
2545 "jg 1b \n"
2546 : "+r"(src_uyvy), // %0
2547 "+r"(dst_u), // %1
2548 "+r"(dst_v), // %2
2549 "+r"(pix) // %3
2550 :
2551 : "memory", "cc"
2552#if defined(__SSE2__)
2553 , "xmm0", "xmm1", "xmm5"
2554#endif
2555 );
2556}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002557#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002558
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002559#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002560// Blend 4 pixels at a time in the main loop.
// Blends src_argb0 over src_argb1 and writes `width` ARGB pixels to dst_argb.
// For the B, G and R bytes: dst = src0 + ((src1 * (256 - alpha0)) >> 8), added
// with unsigned saturation (paddusb); the destination alpha byte ends up 0xff.
// Structure: a 1-pixel loop runs until dst_argb is 16-byte aligned, then a
// 4-pixel loop (unaligned loads, aligned store), then a 1-pixel loop for the
// remainder. A width below 1 writes nothing.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002561void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
2562 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002563 asm volatile (
// Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
// xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha mask).
2564 "pcmpeqb %%xmm7,%%xmm7 \n"
2565 "psrlw $0xf,%%xmm7 \n"
2566 "pcmpeqb %%xmm6,%%xmm6 \n"
2567 "psrlw $0x8,%%xmm6 \n"
2568 "pcmpeqb %%xmm5,%%xmm5 \n"
2569 "psllw $0x8,%%xmm5 \n"
2570 "pcmpeqb %%xmm4,%%xmm4 \n"
2571 "pslld $0x18,%%xmm4 \n"
// width is kept biased by -1: exactly one pixel jumps to the tail loop,
// fewer than one pixel exits immediately.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002572 "sub $0x1,%3 \n"
2573 "je 91f \n"
2574 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002575
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002576 // 1 pixel loop until destination pointer is aligned.
2577 "10: \n"
2578 "test $0xf,%2 \n"
2579 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002580 "movd (%0),%%xmm3 \n"
2581 "lea 0x4(%0),%0 \n"
2582 "movdqa %%xmm3,%%xmm0 \n"
// Invert alpha (a ^ 0xff = 255 - a), move it to the low byte of its word and
// replicate it to both words of the pixel; paddw xmm7 then makes it 256 - a.
2583 "pxor %%xmm4,%%xmm3 \n"
2584 "movd (%1),%%xmm2 \n"
2585 "psrlw $0x8,%%xmm3 \n"
2586 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2587 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
// xmm2 carries the even bytes (B, R) of src1, xmm1 the odd bytes (G, A).
2588 "pand %%xmm6,%%xmm2 \n"
2589 "paddw %%xmm7,%%xmm3 \n"
2590 "pmullw %%xmm3,%%xmm2 \n"
2591 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002592 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002593 "psrlw $0x8,%%xmm1 \n"
// Force the alpha byte of the src0 copy to 0xff before accumulating.
2594 "por %%xmm4,%%xmm0 \n"
2595 "pmullw %%xmm3,%%xmm1 \n"
2596 "psrlw $0x8,%%xmm2 \n"
2597 "paddusb %%xmm2,%%xmm0 \n"
2598 "pand %%xmm5,%%xmm1 \n"
2599 "paddusb %%xmm1,%%xmm0 \n"
2600 "sub $0x1,%3 \n"
2601 "movd %%xmm0,(%2) \n"
2602 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002603 "jge 10b \n"
2604
2605 "19: \n"
// Undo the -1 bias and pre-subtract 4; negative means fewer than 4 pixels left.
fbarchard@google.comee220882012-06-14 00:07:56 +00002606 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002607 "jl 49f \n"
2608
fbarchard@google.com794fe122012-06-15 01:05:01 +00002609 // 4 pixel loop.
// Same arithmetic as the 1 pixel loop on 16 bytes; sources are loaded with
// movdqu, the destination is stored with movdqa (aligned by the loop above).
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002610 ".p2align 2 \n"
2611 "41: \n"
2612 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002613 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002614 "movdqa %%xmm3,%%xmm0 \n"
2615 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002616 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002617 "psrlw $0x8,%%xmm3 \n"
2618 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2619 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002620 "pand %%xmm6,%%xmm2 \n"
2621 "paddw %%xmm7,%%xmm3 \n"
2622 "pmullw %%xmm3,%%xmm2 \n"
2623 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002624 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002625 "psrlw $0x8,%%xmm1 \n"
2626 "por %%xmm4,%%xmm0 \n"
2627 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002628 "psrlw $0x8,%%xmm2 \n"
2629 "paddusb %%xmm2,%%xmm0 \n"
2630 "pand %%xmm5,%%xmm1 \n"
2631 "paddusb %%xmm1,%%xmm0 \n"
2632 "sub $0x4,%3 \n"
2633 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002634 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002635 "jge 41b \n"
2636
2637 "49: \n"
// Restore the -1 biased count; negative means no pixels remain.
2638 "add $0x3,%3 \n"
2639 "jl 99f \n"
2640
fbarchard@google.com794fe122012-06-15 01:05:01 +00002641 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002642 "91: \n"
2643 "movd (%0),%%xmm3 \n"
2644 "lea 0x4(%0),%0 \n"
2645 "movdqa %%xmm3,%%xmm0 \n"
2646 "pxor %%xmm4,%%xmm3 \n"
2647 "movd (%1),%%xmm2 \n"
2648 "psrlw $0x8,%%xmm3 \n"
2649 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2650 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2651 "pand %%xmm6,%%xmm2 \n"
2652 "paddw %%xmm7,%%xmm3 \n"
2653 "pmullw %%xmm3,%%xmm2 \n"
2654 "movd (%1),%%xmm1 \n"
2655 "lea 0x4(%1),%1 \n"
2656 "psrlw $0x8,%%xmm1 \n"
2657 "por %%xmm4,%%xmm0 \n"
2658 "pmullw %%xmm3,%%xmm1 \n"
2659 "psrlw $0x8,%%xmm2 \n"
2660 "paddusb %%xmm2,%%xmm0 \n"
2661 "pand %%xmm5,%%xmm1 \n"
2662 "paddusb %%xmm1,%%xmm0 \n"
2663 "sub $0x1,%3 \n"
2664 "movd %%xmm0,(%2) \n"
2665 "lea 0x4(%2),%2 \n"
2666 "jge 91b \n"
2667 "99: \n"
2668 : "+r"(src_argb0), // %0
2669 "+r"(src_argb1), // %1
2670 "+r"(dst_argb), // %2
2671 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00002672 :
2673 : "memory", "cc"
2674#if defined(__SSE2__)
2675 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2676#endif
2677 );
2678}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002679#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002680
fbarchard@google.com96af8702012-04-06 18:22:27 +00002681#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002682// Shuffle table for isolating alpha.
// Used as a pshufb control: for each of the 4 pixels the alpha byte (index 3,
// 7, 11, 15) is copied into the low byte of both of that pixel's 16-bit words;
// the 0x80 entries zero the high byte of each word.
fbarchard@google.com96af8702012-04-06 18:22:27 +00002683CONST uvec8 kShuffleAlpha = {
2684 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2685 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2686};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002687
2688// Blend 4 pixels at a time in the main loops.
2689// 1 pixel at a time is used to align the destination and for the remainder.
2690
2691// Same as SSE2, but replaces
2692// psrlw xmm3, 8 // alpha
2693// pshufhw xmm3, xmm3,0F5h // 8 alpha words
2694// pshuflw xmm3, xmm3,0F5h
2695// with..
2696// pshufb xmm3, kShuffleAlpha // alpha
2697
// Blends src_argb0 over src_argb1 and writes `width` ARGB pixels to dst_argb.
// For the B, G and R bytes: dst = src0 + ((src1 * (256 - alpha0)) >> 8), added
// with unsigned saturation (paddusb); the destination alpha byte ends up 0xff.
// After the destination is 16-byte aligned, the 4-pixel work is done by loop
// 40 (movdqa loads) when both sources are also 16-byte aligned, otherwise by
// loop 41 (movdqu loads). A width below 1 writes nothing.
2698void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2699 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002700 asm volatile (
// Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
// xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha mask).
2701 "pcmpeqb %%xmm7,%%xmm7 \n"
2702 "psrlw $0xf,%%xmm7 \n"
2703 "pcmpeqb %%xmm6,%%xmm6 \n"
2704 "psrlw $0x8,%%xmm6 \n"
2705 "pcmpeqb %%xmm5,%%xmm5 \n"
2706 "psllw $0x8,%%xmm5 \n"
2707 "pcmpeqb %%xmm4,%%xmm4 \n"
2708 "pslld $0x18,%%xmm4 \n"
// width is kept biased by -1: exactly one pixel jumps to the tail loop,
// fewer than one pixel exits immediately.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002709 "sub $0x1,%3 \n"
2710 "je 91f \n"
2711 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002712
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002713 // 1 pixel loop until destination pointer is aligned.
2714 "10: \n"
2715 "test $0xf,%2 \n"
2716 "je 19f \n"
2717 "movd (%0),%%xmm3 \n"
2718 "lea 0x4(%0),%0 \n"
2719 "movdqa %%xmm3,%%xmm0 \n"
// Invert alpha, spread it to both words of the pixel with pshufb, then
// paddw xmm7 turns 255 - a into 256 - a.
2720 "pxor %%xmm4,%%xmm3 \n"
2721 "movd (%1),%%xmm2 \n"
2722 "pshufb %4,%%xmm3 \n"
2723 "pand %%xmm6,%%xmm2 \n"
2724 "paddw %%xmm7,%%xmm3 \n"
2725 "pmullw %%xmm3,%%xmm2 \n"
2726 "movd (%1),%%xmm1 \n"
2727 "lea 0x4(%1),%1 \n"
2728 "psrlw $0x8,%%xmm1 \n"
// Force the alpha byte of the src0 copy to 0xff before accumulating.
2729 "por %%xmm4,%%xmm0 \n"
2730 "pmullw %%xmm3,%%xmm1 \n"
2731 "psrlw $0x8,%%xmm2 \n"
2732 "paddusb %%xmm2,%%xmm0 \n"
2733 "pand %%xmm5,%%xmm1 \n"
2734 "paddusb %%xmm1,%%xmm0 \n"
2735 "sub $0x1,%3 \n"
2736 "movd %%xmm0,(%2) \n"
2737 "lea 0x4(%2),%2 \n"
2738 "jge 10b \n"
2739
2740 "19: \n"
// Undo the -1 bias and pre-subtract 4; negative means fewer than 4 pixels left.
fbarchard@google.comee220882012-06-14 00:07:56 +00002741 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002742 "jl 49f \n"
// Take the unaligned loop unless both source pointers are 16-byte aligned.
fbarchard@google.comf877e712012-08-15 00:51:24 +00002743 "test $0xf,%0 \n"
2744 "jne 41f \n"
2745 "test $0xf,%1 \n"
2746 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002747
fbarchard@google.com794fe122012-06-15 01:05:01 +00002748 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002749 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002750 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00002751 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002752 "lea 0x10(%0),%0 \n"
2753 "movdqa %%xmm3,%%xmm0 \n"
2754 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00002755 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002756 "pshufb %4,%%xmm3 \n"
2757 "pand %%xmm6,%%xmm2 \n"
2758 "paddw %%xmm7,%%xmm3 \n"
2759 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00002760 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002761 "lea 0x10(%1),%1 \n"
2762 "psrlw $0x8,%%xmm1 \n"
2763 "por %%xmm4,%%xmm0 \n"
2764 "pmullw %%xmm3,%%xmm1 \n"
2765 "psrlw $0x8,%%xmm2 \n"
2766 "paddusb %%xmm2,%%xmm0 \n"
2767 "pand %%xmm5,%%xmm1 \n"
2768 "paddusb %%xmm1,%%xmm0 \n"
2769 "sub $0x4,%3 \n"
2770 "movdqa %%xmm0,(%2) \n"
2771 "lea 0x10(%2),%2 \n"
2772 "jge 40b \n"
2773 "jmp 49f \n"
2774
2775 // 4 pixel unaligned loop.
2776 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002777 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002778 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002779 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002780 "movdqa %%xmm3,%%xmm0 \n"
2781 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002782 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002783 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002784 "pand %%xmm6,%%xmm2 \n"
2785 "paddw %%xmm7,%%xmm3 \n"
2786 "pmullw %%xmm3,%%xmm2 \n"
2787 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002788 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002789 "psrlw $0x8,%%xmm1 \n"
2790 "por %%xmm4,%%xmm0 \n"
2791 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002792 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002793 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002794 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002795 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002796 "sub $0x4,%3 \n"
2797 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002798 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002799 "jge 41b \n"
2800
2801 "49: \n"
// Restore the -1 biased count; negative means no pixels remain.
2802 "add $0x3,%3 \n"
2803 "jl 99f \n"
2804
fbarchard@google.com794fe122012-06-15 01:05:01 +00002805 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002806 "91: \n"
2807 "movd (%0),%%xmm3 \n"
2808 "lea 0x4(%0),%0 \n"
2809 "movdqa %%xmm3,%%xmm0 \n"
2810 "pxor %%xmm4,%%xmm3 \n"
2811 "movd (%1),%%xmm2 \n"
2812 "pshufb %4,%%xmm3 \n"
2813 "pand %%xmm6,%%xmm2 \n"
2814 "paddw %%xmm7,%%xmm3 \n"
2815 "pmullw %%xmm3,%%xmm2 \n"
2816 "movd (%1),%%xmm1 \n"
2817 "lea 0x4(%1),%1 \n"
2818 "psrlw $0x8,%%xmm1 \n"
2819 "por %%xmm4,%%xmm0 \n"
2820 "pmullw %%xmm3,%%xmm1 \n"
2821 "psrlw $0x8,%%xmm2 \n"
2822 "paddusb %%xmm2,%%xmm0 \n"
2823 "pand %%xmm5,%%xmm1 \n"
2824 "paddusb %%xmm1,%%xmm0 \n"
2825 "sub $0x1,%3 \n"
2826 "movd %%xmm0,(%2) \n"
2827 "lea 0x4(%2),%2 \n"
2828 "jge 91b \n"
2829 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002830 : "+r"(src_argb0), // %0
2831 "+r"(src_argb1), // %1
2832 "+r"(dst_argb), // %2
2833 "+r"(width) // %3
2834 : "m"(kShuffleAlpha) // %4
2835 : "memory", "cc"
2836#if defined(__SSE2__)
2837 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2838#endif
2839 );
2840}
2841#endif // HAS_ARGBBLENDROW_SSSE3
2842
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002843#ifdef HAS_ARGBATTENUATE_SSE2
2844// Attenuate 4 pixels at a time.
2845// aligned to 16 bytes
// Multiplies the B, G and R bytes of each ARGB pixel by that pixel's alpha and
// copies the alpha byte through unchanged. Each colour byte c and the alpha a
// are widened to c * 0x0101 and a * 0x0101, multiplied with pmulhuw (high 16
// bits of the product) and shifted right by 8.
// Loads and stores use movdqa, so src_argb and dst_argb must be 16-byte
// aligned. The loop is bottom-tested (sub $4 / jg), so it always runs at
// least once and rounds width up to a multiple of 4.
// Note: xmm3 is in the clobber list although the asm body does not use it.
2846void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2847 asm volatile (
// Turn dst_argb into an offset from src_argb; stores address (%0,%1,1).
2848 "sub %0,%1 \n"
// xmm4 = 0xff000000 per dword (alpha mask), xmm5 = 0x00ffffff (colour mask).
2849 "pcmpeqb %%xmm4,%%xmm4 \n"
2850 "pslld $0x18,%%xmm4 \n"
2851 "pcmpeqb %%xmm5,%%xmm5 \n"
2852 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002853
fbarchard@google.com81b804e2012-06-20 02:15:01 +00002854 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002855 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002856 "1: \n"
// Pixels 0-1: duplicate each byte into a word, broadcast the alpha word
// (word 3 of each pixel) across the pixel, multiply.
2857 "movdqa (%0),%%xmm0 \n"
2858 "punpcklbw %%xmm0,%%xmm0 \n"
2859 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2860 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2861 "pmulhuw %%xmm2,%%xmm0 \n"
// Pixels 2-3: same on the high half.
2862 "movdqa (%0),%%xmm1 \n"
2863 "punpckhbw %%xmm1,%%xmm1 \n"
2864 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2865 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2866 "pmulhuw %%xmm2,%%xmm1 \n"
// xmm2 = original alpha bytes; they replace the multiplied alpha lane below.
fbarchard@google.com810cd912012-04-20 20:15:27 +00002867 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002868 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002869 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002870 "psrlw $0x8,%%xmm1 \n"
2871 "packuswb %%xmm1,%%xmm0 \n"
2872 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002873 "por %%xmm2,%%xmm0 \n"
// The movdqa and lea that follow do not modify EFLAGS, so jg tests this sub.
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002874 "sub $0x4,%2 \n"
2875 "movdqa %%xmm0,(%0,%1,1) \n"
2876 "lea 0x10(%0),%0 \n"
2877 "jg 1b \n"
2878 : "+r"(src_argb), // %0
2879 "+r"(dst_argb), // %1
2880 "+r"(width) // %2
2881 :
2882 : "memory", "cc"
2883#if defined(__SSE2__)
2884 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2885#endif
2886 );
2887}
2888#endif // HAS_ARGBATTENUATE_SSE2
2889
fbarchard@google.comeeac2902012-07-18 18:54:32 +00002890#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00002891// Shuffle table duplicating alpha
// pshufb controls: for each pixel the alpha byte fills both bytes of the
// first three 16-bit words (the B, G, R lanes); the 128u entries zero the
// fourth word (the alpha lane). kShuffleAlpha0 covers pixels 0 and 1 (alpha
// bytes 3 and 7), kShuffleAlpha1 covers pixels 2 and 3 (alpha bytes 11 and 15).
2892CONST uvec8 kShuffleAlpha0 = {
2893 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2894};
2895CONST uvec8 kShuffleAlpha1 = {
2896 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2897 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2898};
2899// Attenuate 4 pixels at a time.
2900// aligned to 16 bytes
// Multiplies the B, G and R bytes of each ARGB pixel by that pixel's alpha and
// copies the alpha byte through unchanged. The alpha multiplier words are
// built with pshufb (kShuffleAlpha0 / kShuffleAlpha1) instead of the
// pshufhw / pshuflw pairs used by the SSE2 version.
// Loads and stores use movdqa, so src_argb and dst_argb must be 16-byte
// aligned. The loop is bottom-tested (sub $4 / jg), so it always runs at
// least once and rounds width up to a multiple of 4.
2901void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2902 asm volatile (
// Turn dst_argb into an offset from src_argb; stores address (%0,%1,1).
2903 "sub %0,%1 \n"
// xmm3 = 0xff000000 per dword (alpha mask); xmm4 / xmm5 = shuffle controls.
2904 "pcmpeqb %%xmm3,%%xmm3 \n"
2905 "pslld $0x18,%%xmm3 \n"
2906 "movdqa %3,%%xmm4 \n"
2907 "movdqa %4,%%xmm5 \n"
2908
fbarchard@google.com81b804e2012-06-20 02:15:01 +00002909 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002910 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002911 "1: \n"
// Pixels 0-1: xmm0 = alpha words (zero in the alpha lane), xmm1 = each source
// byte duplicated into a word; pmulhuw keeps the high 16 bits of the product.
2912 "movdqa (%0),%%xmm0 \n"
2913 "pshufb %%xmm4,%%xmm0 \n"
2914 "movdqa (%0),%%xmm1 \n"
2915 "punpcklbw %%xmm1,%%xmm1 \n"
2916 "pmulhuw %%xmm1,%%xmm0 \n"
// Pixels 2-3: same on the high half.
2917 "movdqa (%0),%%xmm1 \n"
2918 "pshufb %%xmm5,%%xmm1 \n"
2919 "movdqa (%0),%%xmm2 \n"
2920 "punpckhbw %%xmm2,%%xmm2 \n"
2921 "pmulhuw %%xmm2,%%xmm1 \n"
// xmm2 = original alpha bytes; the multiplied alpha lane is zero, so a plain
// OR restores alpha.
2922 "movdqa (%0),%%xmm2 \n"
2923 "pand %%xmm3,%%xmm2 \n"
2924 "psrlw $0x8,%%xmm0 \n"
2925 "psrlw $0x8,%%xmm1 \n"
2926 "packuswb %%xmm1,%%xmm0 \n"
2927 "por %%xmm2,%%xmm0 \n"
// The movdqa and lea that follow do not modify EFLAGS, so jg tests this sub.
2928 "sub $0x4,%2 \n"
2929 "movdqa %%xmm0,(%0,%1,1) \n"
2930 "lea 0x10(%0),%0 \n"
2931 "jg 1b \n"
2932 : "+r"(src_argb), // %0
2933 "+r"(dst_argb), // %1
2934 "+r"(width) // %2
2935 : "m"(kShuffleAlpha0), // %3
2936 "m"(kShuffleAlpha1) // %4
2937 : "memory", "cc"
2938#if defined(__SSE2__)
2939 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2940#endif
2941 );
2942}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00002943#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00002944
fbarchard@google.comeeac2902012-07-18 18:54:32 +00002945#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00002946// Unattenuate 4 pixels at a time.
2947// aligned to 16 bytes
// For each ARGB pixel, reads the alpha byte, fetches the 32-bit entry
// fixed_invtbl8[alpha], and multiplies the B, G and R lanes (each byte
// duplicated into a word) by the low 16 bits of that entry with pmulhuw.
// The result is packed with unsigned saturation and the original alpha byte
// is copied through.
// NOTE(review): fixed_invtbl8 is defined outside this chunk; presumably it
// holds fixed-point reciprocals of alpha - confirm against its definition.
// Loads and stores use movdqa, so src_argb and dst_argb must be 16-byte
// aligned. The loop is bottom-tested (sub $4 / jg), so it always runs at
// least once and rounds width up to a multiple of 4.
// Note: xmm5 is in the clobber list although the asm body does not use it.
2948void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2949 int width) {
// Scratch general-purpose register that receives each pixel's alpha (%3).
2950 uintptr_t alpha = 0;
2951 asm volatile (
// Turn dst_argb into an offset from src_argb; stores address (%0,%1,1).
2952 "sub %0,%1 \n"
// xmm4 = 0xff000000 per dword (alpha mask).
2953 "pcmpeqb %%xmm4,%%xmm4 \n"
2954 "pslld $0x18,%%xmm4 \n"
2955
fbarchard@google.com81b804e2012-06-20 02:15:01 +00002956 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002957 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002958 "1: \n"
// Pixels 0-1: table lookups for alpha bytes 3 and 7.
2959 "movdqa (%0),%%xmm0 \n"
2960 "movzb 0x3(%0),%3 \n"
2961 "punpcklbw %%xmm0,%%xmm0 \n"
2962 "movd 0x0(%4,%3,4),%%xmm2 \n"
2963 "movzb 0x7(%0),%3 \n"
2964 "movd 0x0(%4,%3,4),%%xmm3 \n"
// pshuflw $0xc0 copies word 0 into words 0-2 and leaves word 3 zero, so the
// alpha lane is multiplied by 0.
2965 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2966 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2967 "movlhps %%xmm3,%%xmm2 \n"
2968 "pmulhuw %%xmm2,%%xmm0 \n"
// Pixels 2-3: table lookups for alpha bytes 11 and 15.
2969 "movdqa (%0),%%xmm1 \n"
2970 "movzb 0xb(%0),%3 \n"
2971 "punpckhbw %%xmm1,%%xmm1 \n"
2972 "movd 0x0(%4,%3,4),%%xmm2 \n"
2973 "movzb 0xf(%0),%3 \n"
2974 "movd 0x0(%4,%3,4),%%xmm3 \n"
2975 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2976 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2977 "movlhps %%xmm3,%%xmm2 \n"
2978 "pmulhuw %%xmm2,%%xmm1 \n"
// Pack with saturation and OR the original alpha bytes back in.
2979 "movdqa (%0),%%xmm2 \n"
2980 "pand %%xmm4,%%xmm2 \n"
2981 "packuswb %%xmm1,%%xmm0 \n"
2982 "por %%xmm2,%%xmm0 \n"
// The movdqa and lea that follow do not modify EFLAGS, so jg tests this sub.
2983 "sub $0x4,%2 \n"
2984 "movdqa %%xmm0,(%0,%1,1) \n"
2985 "lea 0x10(%0),%0 \n"
2986 "jg 1b \n"
2987 : "+r"(src_argb), // %0
2988 "+r"(dst_argb), // %1
2989 "+r"(width), // %2
2990 "+r"(alpha) // %3
2991 : "r"(fixed_invtbl8) // %4
2992 : "memory", "cc"
2993#if defined(__SSE2__)
2994 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2995#endif
2996 );
2997}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00002998#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00002999
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003000#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003001// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// Stored as 7-bit fixed point in B, G, R, A byte order (14 + 76 + 38 = 128);
// the alpha coefficient is 0. The consumer shifts the sum right by 7.
3002CONST vec8 kARGBToGray = {
3003 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3004};
3005
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003006// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// For every pixel: gray = (14 * B + 76 * G + 38 * R) >> 7 using kARGBToGray;
// the output pixel is (gray, gray, gray, A) with A copied from the source.
// Loads and stores use movdqa, so src_argb and dst_argb must be 16-byte
// aligned. The loop is bottom-tested (sub $8 / jg), so it always runs at
// least once and rounds width up to a multiple of 8.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003007void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003008 asm volatile (
// xmm4 = gray coefficients; dst_argb becomes an offset from src_argb.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003009 "movdqa %3,%%xmm4 \n"
3010 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003011
3012 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003013 ".p2align 4 \n"
3014 "1: \n"
// Weighted sum per pixel: pmaddubsw yields two partial sums per pixel,
// phaddw adds them, then >> 7 and pack to 8 gray bytes.
3015 "movdqa (%0),%%xmm0 \n"
3016 "movdqa 0x10(%0),%%xmm1 \n"
3017 "pmaddubsw %%xmm4,%%xmm0 \n"
3018 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003019 "phaddw %%xmm1,%%xmm0 \n"
3020 "psrlw $0x7,%%xmm0 \n"
3021 "packuswb %%xmm0,%%xmm0 \n"
// Extract the 8 source alpha bytes into xmm2.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003022 "movdqa (%0),%%xmm2 \n"
3023 "movdqa 0x10(%0),%%xmm3 \n"
3024 "psrld $0x18,%%xmm2 \n"
3025 "psrld $0x18,%%xmm3 \n"
3026 "packuswb %%xmm3,%%xmm2 \n"
3027 "packuswb %%xmm2,%%xmm2 \n"
// Interleave (gray, gray) with (gray, alpha) to rebuild 4-byte pixels.
3028 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003029 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003030 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003031 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003032 "punpcklwd %%xmm3,%%xmm0 \n"
3033 "punpckhwd %%xmm3,%%xmm1 \n"
// The movdqa and lea that follow do not modify EFLAGS, so jg tests this sub.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003034 "sub $0x8,%2 \n"
3035 "movdqa %%xmm0,(%0,%1,1) \n"
3036 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003037 "lea 0x20(%0),%0 \n"
3038 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003039 : "+r"(src_argb), // %0
3040 "+r"(dst_argb), // %1
3041 "+r"(width) // %2
3042 : "m"(kARGBToGray) // %3
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003043 : "memory", "cc"
3044#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00003045 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003046#endif
3047 );
3048}
3049#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003050
3051#ifdef HAS_ARGBSEPIAROW_SSSE3
3052// b = (r * 35 + g * 68 + b * 17) >> 7
3053// g = (r * 45 + g * 88 + b * 22) >> 7
3054// r = (r * 50 + g * 98 + b * 24) >> 7
3055// Constant for ARGB color to sepia tone
// Each table repeats one row of coefficients in B, G, R, A byte order for
// four pixels; the alpha coefficient is 0.
3056CONST vec8 kARGBToSepiaB = {
3057 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3058};
3059
3060CONST vec8 kARGBToSepiaG = {
3061 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3062};
3063
3064CONST vec8 kARGBToSepiaR = {
3065 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3066};
3067
fbarchard@google.come442dc42012-06-18 17:37:09 +00003068// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place on dst_argb. New B, G and R are each a weighted sum of the
// source B, G, R (kARGBToSepiaB / G / R) shifted right by 7 and packed with
// unsigned saturation; alpha is copied from the source pixel.
// Loads and stores use movdqa, so dst_argb must be 16-byte aligned. The loop
// is bottom-tested (sub $8 / jg), so it always runs at least once and rounds
// width up to a multiple of 8.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003069void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3070 asm volatile (
// xmm2 / xmm3 / xmm4 = coefficients for the new B / G / R.
3071 "movdqa %2,%%xmm2 \n"
3072 "movdqa %3,%%xmm3 \n"
3073 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003074
3075 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003076 ".p2align 4 \n"
3077 "1: \n"
// New B for 8 pixels -> low 8 bytes of xmm0.
3078 "movdqa (%0),%%xmm0 \n"
3079 "movdqa 0x10(%0),%%xmm6 \n"
3080 "pmaddubsw %%xmm2,%%xmm0 \n"
3081 "pmaddubsw %%xmm2,%%xmm6 \n"
3082 "phaddw %%xmm6,%%xmm0 \n"
3083 "psrlw $0x7,%%xmm0 \n"
3084 "packuswb %%xmm0,%%xmm0 \n"
// New G for 8 pixels, interleaved with B into (B, G) byte pairs.
3085 "movdqa (%0),%%xmm5 \n"
3086 "movdqa 0x10(%0),%%xmm1 \n"
3087 "pmaddubsw %%xmm3,%%xmm5 \n"
3088 "pmaddubsw %%xmm3,%%xmm1 \n"
3089 "phaddw %%xmm1,%%xmm5 \n"
3090 "psrlw $0x7,%%xmm5 \n"
3091 "packuswb %%xmm5,%%xmm5 \n"
3092 "punpcklbw %%xmm5,%%xmm0 \n"
// New R for 8 pixels.
3093 "movdqa (%0),%%xmm5 \n"
3094 "movdqa 0x10(%0),%%xmm1 \n"
3095 "pmaddubsw %%xmm4,%%xmm5 \n"
3096 "pmaddubsw %%xmm4,%%xmm1 \n"
3097 "phaddw %%xmm1,%%xmm5 \n"
3098 "psrlw $0x7,%%xmm5 \n"
3099 "packuswb %%xmm5,%%xmm5 \n"
// Source alpha for 8 pixels, interleaved with R into (R, A) byte pairs.
3100 "movdqa (%0),%%xmm6 \n"
3101 "movdqa 0x10(%0),%%xmm1 \n"
3102 "psrld $0x18,%%xmm6 \n"
3103 "psrld $0x18,%%xmm1 \n"
3104 "packuswb %%xmm1,%%xmm6 \n"
3105 "packuswb %%xmm6,%%xmm6 \n"
3106 "punpcklbw %%xmm6,%%xmm5 \n"
// Merge (B, G) and (R, A) pairs into 4-byte pixels and store in place.
3107 "movdqa %%xmm0,%%xmm1 \n"
3108 "punpcklwd %%xmm5,%%xmm0 \n"
3109 "punpckhwd %%xmm5,%%xmm1 \n"
// The movdqa and lea that follow do not modify EFLAGS, so jg tests this sub.
3110 "sub $0x8,%1 \n"
3111 "movdqa %%xmm0,(%0) \n"
3112 "movdqa %%xmm1,0x10(%0) \n"
3113 "lea 0x20(%0),%0 \n"
3114 "jg 1b \n"
3115 : "+r"(dst_argb), // %0
3116 "+r"(width) // %1
3117 : "m"(kARGBToSepiaB), // %2
3118 "m"(kARGBToSepiaG), // %3
3119 "m"(kARGBToSepiaR) // %4
3120 : "memory", "cc"
3121#if defined(__SSE2__)
3122 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3123 #endif
3124 );
3125}
3126#endif // HAS_ARGBSEPIAROW_SSSE3
3127
fbarchard@google.come442dc42012-06-18 17:37:09 +00003128#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3129// Transform 8 ARGB pixels (32 bytes) with color matrix.
3130// Same as Sepia except matrix is provided.
// Works in place on dst_argb. Only the first 12 bytes of matrix_argb are
// read: bytes 0-3 weight the source (B, G, R, A) for the new B, bytes 4-7 for
// the new G, bytes 8-11 for the new R. Coefficients are signed; sums use
// saturating adds (phaddsw), an arithmetic shift right by 7, and an unsigned
// saturating pack. Alpha is copied from the source pixel.
// Loads and stores use movdqa, so dst_argb must be 16-byte aligned. The loop
// is bottom-tested (sub $8 / jg), so it always runs at least once and rounds
// width up to a multiple of 8.
3131void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3132 int width) {
3133 asm volatile (
// Load the three 4-byte coefficient rows and broadcast each to all 4 dwords.
3134 "movd (%2),%%xmm2 \n"
3135 "movd 0x4(%2),%%xmm3 \n"
3136 "movd 0x8(%2),%%xmm4 \n"
3137 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3138 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3139 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003140
3141 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003142 ".p2align 4 \n"
3143 "1: \n"
// New B (xmm0) and new G (xmm5), then interleave into (B, G) byte pairs.
3144 "movdqa (%0),%%xmm0 \n"
3145 "movdqa 0x10(%0),%%xmm6 \n"
3146 "pmaddubsw %%xmm2,%%xmm0 \n"
3147 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003148 "movdqa (%0),%%xmm5 \n"
3149 "movdqa 0x10(%0),%%xmm1 \n"
3150 "pmaddubsw %%xmm3,%%xmm5 \n"
3151 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003152 "phaddsw %%xmm6,%%xmm0 \n"
3153 "phaddsw %%xmm1,%%xmm5 \n"
3154 "psraw $0x7,%%xmm0 \n"
3155 "psraw $0x7,%%xmm5 \n"
3156 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003157 "packuswb %%xmm5,%%xmm5 \n"
3158 "punpcklbw %%xmm5,%%xmm0 \n"
// New R for 8 pixels.
3159 "movdqa (%0),%%xmm5 \n"
3160 "movdqa 0x10(%0),%%xmm1 \n"
3161 "pmaddubsw %%xmm4,%%xmm5 \n"
3162 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003163 "phaddsw %%xmm1,%%xmm5 \n"
3164 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003165 "packuswb %%xmm5,%%xmm5 \n"
// Source alpha for 8 pixels, interleaved with R into (R, A) byte pairs.
3166 "movdqa (%0),%%xmm6 \n"
3167 "movdqa 0x10(%0),%%xmm1 \n"
3168 "psrld $0x18,%%xmm6 \n"
3169 "psrld $0x18,%%xmm1 \n"
3170 "packuswb %%xmm1,%%xmm6 \n"
3171 "packuswb %%xmm6,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003172 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003173 "punpcklbw %%xmm6,%%xmm5 \n"
// Merge (B, G) and (R, A) pairs into 4-byte pixels and store in place.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003174 "punpcklwd %%xmm5,%%xmm0 \n"
3175 "punpckhwd %%xmm5,%%xmm1 \n"
// The movdqa and lea that follow do not modify EFLAGS, so jg tests this sub.
3176 "sub $0x8,%1 \n"
3177 "movdqa %%xmm0,(%0) \n"
3178 "movdqa %%xmm1,0x10(%0) \n"
3179 "lea 0x20(%0),%0 \n"
3180 "jg 1b \n"
3181 : "+r"(dst_argb), // %0
3182 "+r"(width) // %1
3183 : "r"(matrix_argb) // %2
3184 : "memory", "cc"
3185#if defined(__SSE2__)
3186 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3187#endif
3188 );
3189}
3190#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3191
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003192#ifdef HAS_ARGBQUANTIZEROW_SSE2
3193// Quantize 4 ARGB pixels (16 bytes).
3194// aligned to 16 bytes
// Works in place on dst_argb. For the B, G and R bytes:
//   v = ((v * scale) >> 16) * interval_size + interval_offset
// using the low 16 bits of each parameter, packed with unsigned saturation.
// The original alpha byte is OR-ed back into the result.
// NOTE(review): the alpha lane is computed from the upper 16 bits of each
// parameter, so alpha is preserved exactly only when those bits are zero -
// presumably callers pass values below 65536; confirm.
// Loads and stores use movdqa, so dst_argb must be 16-byte aligned. The loop
// is bottom-tested (sub $4 / jg), so it always runs at least once and rounds
// width up to a multiple of 4.
3195void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3196 int interval_offset, int width) {
3197 asm volatile (
// Broadcast each parameter: per pixel, words 0-2 get its low 16 bits and
// word 3 its high 16 bits (pshuflw $0x40), duplicated to both halves.
3198 "movd %2,%%xmm2 \n"
3199 "movd %3,%%xmm3 \n"
3200 "movd %4,%%xmm4 \n"
3201 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3202 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3203 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3204 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3205 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3206 "pshufd $0x44,%%xmm4,%%xmm4 \n"
// xmm5 = 0 (for zero-extending bytes), xmm6 = 0xff000000 per dword (alpha mask).
3207 "pxor %%xmm5,%%xmm5 \n"
3208 "pcmpeqb %%xmm6,%%xmm6 \n"
3209 "pslld $0x18,%%xmm6 \n"
3210
3211 // 4 pixel loop.
3212 ".p2align 2 \n"
3213 "1: \n"
// Pixels 0-1 in xmm0, pixels 2-3 in xmm1: widen bytes to words, multiply by
// scale keeping the high 16 bits.
3214 "movdqa (%0),%%xmm0 \n"
3215 "punpcklbw %%xmm5,%%xmm0 \n"
3216 "pmulhuw %%xmm2,%%xmm0 \n"
3217 "movdqa (%0),%%xmm1 \n"
3218 "punpckhbw %%xmm5,%%xmm1 \n"
3219 "pmulhuw %%xmm2,%%xmm1 \n"
// Multiply by interval_size, add interval_offset; xmm7 keeps the source alpha.
3220 "pmullw %%xmm3,%%xmm0 \n"
3221 "movdqa (%0),%%xmm7 \n"
3222 "pmullw %%xmm3,%%xmm1 \n"
3223 "pand %%xmm6,%%xmm7 \n"
3224 "paddw %%xmm4,%%xmm0 \n"
3225 "paddw %%xmm4,%%xmm1 \n"
3226 "packuswb %%xmm1,%%xmm0 \n"
3227 "por %%xmm7,%%xmm0 \n"
// The movdqa and lea that follow do not modify EFLAGS, so jg tests this sub.
3228 "sub $0x4,%1 \n"
3229 "movdqa %%xmm0,(%0) \n"
3230 "lea 0x10(%0),%0 \n"
3231 "jg 1b \n"
3232 : "+r"(dst_argb), // %0
3233 "+r"(width) // %1
3234 : "r"(scale), // %2
3235 "r"(interval_size), // %3
3236 "r"(interval_offset) // %4
3237 : "memory", "cc"
3238#if defined(__SSE2__)
3239 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3240#endif
3241 );
3242}
3243#endif // HAS_ARGBQUANTIZEROW_SSE2
3244
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//
// Each source pixel is 4 bytes; each byte is widened to an int32 and added to
// a running 4 x int32 sum (one lane per byte of the pixel). For pixel x the
// 16 bytes written to cumsum are:
//   cumsum[x] = previous_cumsum[x] + (row[0] + ... + row[x])
// width is counted in pixels.
//
// A 4-pixel path is taken while at least 4 pixels remain and cumsum is
// 16-byte aligned; everything else goes through the 1-pixel path, which uses
// unaligned accesses.
// NOTE(review): the 4-pixel path only tests cumsum for alignment but reads
// previous_cumsum with movdqa, so previous_cumsum appears to need the same
// 16-byte alignment as cumsum - confirm against callers.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    "sub %1,%2 \n"                     // %2 = previous_cumsum - cumsum, so
                                       // (%1,%2,1) addresses previous_cumsum
    "pxor %%xmm0,%%xmm0 \n"            // xmm0 = running sum
    "pxor %%xmm1,%%xmm1 \n"            // xmm1 = 0, used to widen
    "sub $0x4,%3 \n"
    "jl 49f \n"                        // fewer than 4 pixels
    "test $0xf,%1 \n"
    "jne 49f \n"                       // cumsum not 16-byte aligned

    // 4 pixel loop.
    ".p2align 2 \n"
  "40: \n"
    "movdqu (%0),%%xmm2 \n"            // 4 source pixels
    "lea 0x10(%0),%0 \n"
    "movdqa %%xmm2,%%xmm4 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"       // pixels 0,1 as words
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"       // xmm2 = pixel 0 as dwords
    "punpckhwd %%xmm1,%%xmm3 \n"       // xmm3 = pixel 1 as dwords
    "punpckhbw %%xmm1,%%xmm4 \n"       // pixels 2,3 as words
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"       // xmm4 = pixel 2 as dwords
    "punpckhwd %%xmm1,%%xmm5 \n"       // xmm5 = pixel 3 as dwords
    "paddd %%xmm2,%%xmm0 \n"           // sum += pixel 0
    "movdqa (%1,%2,1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"           // previous_cumsum[0] + sum
    "paddd %%xmm3,%%xmm0 \n"           // sum += pixel 1
    "movdqa 0x10(%1,%2,1),%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"           // sum += pixel 2
    "movdqa 0x20(%1,%2,1),%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"           // sum += pixel 3
    "movdqa 0x30(%1,%2,1),%%xmm5 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqa %%xmm2,(%1) \n"
    "movdqa %%xmm3,0x10(%1) \n"
    "movdqa %%xmm4,0x20(%1) \n"
    "movdqa %%xmm5,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

  "49: \n"
    "add $0x3,%3 \n"                   // %3 = remaining pixels - 1
    "jl 19f \n"

    // 1 pixel loop.
    ".p2align 2 \n"
  "10: \n"
    "movd (%0),%%xmm2 \n"              // 1 source pixel
    "lea 0x4(%0),%0 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"       // widen 4 bytes to 4 dwords
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu (%1,%2,1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"

  "19: \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
3325
#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts box sums taken from a cumulative-sum table into averaged pixels.
//
// For each of count output pixels, with every table entry being 4 x int32:
//   sum = topleft[x] - topleft[x + width] - botleft[x] + botleft[x + width]
//   dst[x] = sum * (1 / area)
// where width is the offset, in table entries scaled by 4 bytes as the asm
// addresses it ((%0,%4,4)), to the right-hand edge of the box. The result is
// converted to int32 with cvtps2dq and packed to bytes with saturation.
//
// 1 / area is computed once with rcpss, which is an approximate reciprocal,
// so results are not guaranteed to match an exact integer division.
//
// topleft and botleft are read with movdqa and as memory operands of
// psubd/paddd, all of which require 16-byte aligned addresses; dst is written
// with unaligned stores.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count) {
  asm volatile (
    "movd %5,%%xmm4 \n"                // xmm4 = area
    "cvtdq2ps %%xmm4,%%xmm4 \n"        // to float
    "rcpss %%xmm4,%%xmm4 \n"           // approximate 1 / area
    "pshufd $0x0,%%xmm4,%%xmm4 \n"     // broadcast to all 4 lanes
    "sub $0x4,%3 \n"
    "jl 49f \n"                        // fewer than 4 pixels

    // 4 pixel loop.
    ".p2align 2 \n"
  "40: \n"
    "movdqa (%0),%%xmm0 \n"            // topleft[x .. x+3]
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "psubd (%0,%4,4),%%xmm0 \n"        // - topleft at the right edge
    "psubd 0x10(%0,%4,4),%%xmm1 \n"
    "psubd 0x20(%0,%4,4),%%xmm2 \n"
    "psubd 0x30(%0,%4,4),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"             // - botleft[x .. x+3]
    "psubd 0x10(%1),%%xmm1 \n"
    "psubd 0x20(%1),%%xmm2 \n"
    "psubd 0x30(%1),%%xmm3 \n"
    "paddd (%1,%4,4),%%xmm0 \n"        // + botleft at the right edge
    "paddd 0x10(%1,%4,4),%%xmm1 \n"
    "paddd 0x20(%1,%4,4),%%xmm2 \n"
    "paddd 0x30(%1,%4,4),%%xmm3 \n"
    "lea 0x40(%1),%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"           // sum * (1 / area)
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"        // 16 bytes = 4 output pixels
    "movdqu %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

  "49: \n"
    "add $0x3,%3 \n"                   // %3 = remaining pixels - 1
    "jl 19f \n"

    // 1 pixel loop.
    ".p2align 2 \n"
  "10: \n"
    "movdqa (%0),%%xmm0 \n"
    "psubd (%0,%4,4),%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"
    "paddd (%1,%4,4),%%xmm0 \n"
    "lea 0x10(%1),%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,(%2) \n"              // store a single 4-byte pixel
    "lea 0x4(%2),%2 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
  "19: \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)      // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
//
// value holds one 8-bit multiplier per byte of a pixel. Both the pixel bytes
// and the multiplier bytes are widened by duplicating each byte into a 16-bit
// word (b -> b * 0x101); the words are multiplied keeping the high 16 bits
// (pmulhuw) and shifted right by a further 8 bits before being repacked to
// bytes.
//
// src_argb and dst_argb must both be 16-byte aligned (movdqa). The loop body
// always executes at least once and handles 4 pixels per pass until width
// drops to zero or below.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd %3,%%xmm2 \n"                // xmm2 = value
    "sub %0,%1 \n"                     // %1 = dst - src, so (%0,%1,1) is dst
    "punpcklbw %%xmm2,%%xmm2 \n"       // each multiplier byte -> word
    "punpcklqdq %%xmm2,%%xmm2 \n"      // replicate for the second pixel

    // 4 pixel loop.
    ".p2align 2 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"       // pixels 0,1: each byte -> word
    "punpckhbw %%xmm1,%%xmm1 \n"       // pixels 2,3: each byte -> word
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"                   // sets the flags consumed by jg below
    "movdqa %%xmm0,(%0,%1,1) \n"       // movdqa/lea leave the flags intact
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADE_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003452
#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd  %%xmm0,%1
//
// uv_dudv points at 4 floats: the starting source coordinate (u, v) followed
// by the per-pixel step (du, dv). For every destination pixel the current
// (u, v) is truncated toward zero, packed to signed 16-bit with saturation,
// and turned into a byte offset u * 4 + v * src_argb_stride with pmaddwd.
// The 4-byte pixel at src_argb + offset is copied to dst_argb and (u, v) is
// advanced by (du, dv).
//
// Limits that follow from the instruction sequence:
//  - src_argb_stride is used as a signed 16-bit pmaddwd operand.
//  - On x86_64 the first offset of each pair is masked with 0x0fffffff and the
//    second is obtained with a logical shift, so offsets are treated as
//    non-negative and, for the masked one, limited to 28 bits.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  // The stride operand doubles as a scratch register for the first offset of
  // each pair; temp receives the second offset.
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    "movq (%3),%%xmm2 \n"              // xmm2 = (u, v)
    "movq 0x8(%3),%%xmm7 \n"           // xmm7 = (du, dv)
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"                   // %1 = (stride << 16) + 4
    "movd %1,%%xmm5 \n"                // low dword = word pair (4, stride)
    "sub $0x4,%4 \n"
    "jl 49f \n"                        // fewer than 4 pixels

    "pshufd $0x44,%%xmm7,%%xmm7 \n"    // xmm7 = (du, dv, du, dv)
    "pshufd $0x0,%%xmm5,%%xmm5 \n"     // broadcast (4, stride) to all lanes
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"         // xmm2 = coordinates of pixels 0,1
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"           // 2 * (du, dv)
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"           // xmm3 = coordinates of pixels 2,3
    "addps %%xmm4,%%xmm4 \n"           // xmm4 = 4 * (du, dv), the loop step

    // 4 pixel loop.
    ".p2align 4 \n"
  "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "cvttps2dq %%xmm3,%%xmm1 \n"
    "packssdw %%xmm1,%%xmm0 \n"        // 4 (u, v) pairs as int16
    "pmaddwd %%xmm5,%%xmm0 \n"         // 4 byte offsets: u * 4 + v * stride
#if defined(__x86_64__)
    "movd %%xmm0,%1 \n"                // offsets 0 and 1 in one 64-bit move
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"            // %1 = offset 0
    "shr $32,%5 \n"                    // %5 = offset 1
    "pshufd $0xEE,%%xmm0,%%xmm0 \n"    // bring offsets 2,3 to the low half
#else
    "movd %%xmm0,%1 \n"                // %1 = offset 0
    "pshufd $0x39,%%xmm0,%%xmm0 \n"    // rotate next offset into lane 0
    "movd %%xmm0,%5 \n"                // %5 = offset 1
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
#endif
    "movd (%0,%1,1),%%xmm1 \n"         // fetch pixel 0
    "movd (%0,%5,1),%%xmm6 \n"         // fetch pixel 1
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"           // advance pixels 0,1 by 4 steps
    "movq %%xmm1,(%2) \n"
#if defined(__x86_64__)
    "movd %%xmm0,%1 \n"                // offsets 2 and 3
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"
    "shr $32,%5 \n"
#else
    "movd %%xmm0,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%5 \n"
#endif
    "movd (%0,%1,1),%%xmm0 \n"         // fetch pixel 2
    "movd (%0,%5,1),%%xmm6 \n"         // fetch pixel 3
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"           // advance pixels 2,3 by 4 steps
    "sub $0x4,%4 \n"                   // sets the flags consumed by jge below
    "movq %%xmm0,0x08(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jge 40b \n"

  "49: \n"
    "add $0x3,%4 \n"                   // %4 = remaining pixels - 1
    "jl 19f \n"

    // 1 pixel loop.
    ".p2align 4 \n"
  "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"         // only the low dword is used here
    "addps %%xmm7,%%xmm2 \n"           // advance by one step
    "movd %%xmm0,%1 \n"
#if defined(__x86_64__)
    "and $0x0fffffff,%1 \n"            // drop the upper half of the 64-bit move
#endif
    "movd (%0,%1,1),%%xmm0 \n"
    "sub $0x1,%4 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 10b \n"
  "19: \n"
  : "+r"(src_argb),              // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),              // %2
    "+r"(uv_dudv),               // %3
    "+rm"(width),                // %4
    "+r"(temp)                   // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
3563
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00003564// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
3565void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
3566 ptrdiff_t src_stride, int dst_width,
3567 int source_y_fraction) {
3568 asm volatile (
3569 "sub %1,%0 \n"
3570 "shr %3 \n"
3571 "cmp $0x0,%3 \n"
3572 "je 2f \n"
3573 "cmp $0x40,%3 \n"
3574 "je 3f \n"
3575 "movd %3,%%xmm0 \n"
3576 "neg %3 \n"
3577 "add $0x80,%3 \n"
3578 "movd %3,%%xmm5 \n"
3579 "punpcklbw %%xmm0,%%xmm5 \n"
3580 "punpcklwd %%xmm5,%%xmm5 \n"
3581 "pshufd $0x0,%%xmm5,%%xmm5 \n"
3582 ".p2align 4 \n"
3583 "1: \n"
3584 "movdqa (%1),%%xmm0 \n"
3585 "movdqa (%1,%4,1),%%xmm2 \n"
3586 "movdqa %%xmm0,%%xmm1 \n"
3587 "punpcklbw %%xmm2,%%xmm0 \n"
3588 "punpckhbw %%xmm2,%%xmm1 \n"
3589 "pmaddubsw %%xmm5,%%xmm0 \n"
3590 "pmaddubsw %%xmm5,%%xmm1 \n"
3591 "psrlw $0x7,%%xmm0 \n"
3592 "psrlw $0x7,%%xmm1 \n"
3593 "packuswb %%xmm1,%%xmm0 \n"
3594 "sub $0x4,%2 \n"
3595 "movdqa %%xmm0,(%1,%0,1) \n"
3596 "lea 0x10(%1),%1 \n"
3597 "jg 1b \n"
3598 "jmp 4f \n"
3599 ".p2align 4 \n"
3600 "2: \n"
3601 "movdqa (%1),%%xmm0 \n"
3602 "sub $0x4,%2 \n"
3603 "movdqa %%xmm0,(%1,%0,1) \n"
3604 "lea 0x10(%1),%1 \n"
3605 "jg 2b \n"
3606 "jmp 4f \n"
3607 ".p2align 4 \n"
3608 "3: \n"
3609 "movdqa (%1),%%xmm0 \n"
3610 "pavgb (%1,%4,1),%%xmm0 \n"
3611 "sub $0x4,%2 \n"
3612 "movdqa %%xmm0,(%1,%0,1) \n"
3613 "lea 0x10(%1),%1 \n"
3614 "jg 3b \n"
3615 "4: \n"
3616 ".p2align 4 \n"
3617 : "+r"(dst_ptr), // %0
3618 "+r"(src_ptr), // %1
3619 "+r"(dst_width), // %2
3620 "+r"(source_y_fraction) // %3
3621 : "r"(static_cast<intptr_t>(src_stride)) // %4
3622 : "memory", "cc"
3623#if defined(__SSE2__)
3624 , "xmm0", "xmm1", "xmm2", "xmm5"
3625#endif
3626 );
3627}
3628
fbarchard@google.com2d11d432012-02-16 02:50:39 +00003629#endif // defined(__x86_64__) || defined(__i386__)
3630
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003631#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003632} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003633} // namespace libyuv
3634#endif