blob: 33149dada99d7132095622043f6892bda11e6dc6 [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
fbarchard@google.comb0c97972012-08-08 19:04:24 +00002 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
fbarchard@google.com714050a2012-02-17 22:59:56 +000023// GCC 4.2 on OSX has link error when passing static or const to inline.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000025#ifdef __APPLE__
26#define CONST
27#else
28#define CONST static const
29#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
// Signed 8-bit weights, one per source byte, repeated for the four pixels held
// in a 16-byte vector.  ARGBToYRow_SSSE3 applies this table with
// pmaddubsw/phaddw, shifts each per-pixel sum right by 7 and then adds
// kAddY16.  The fourth weight is 0, so the fourth byte of every pixel does not
// contribute.
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
// U weights for the same byte layout.  ARGBToUVRow_SSSE3 shifts the weighted
// sum right by 8 (arithmetic) and adds kAddUV128.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
// V weights for the same byte layout.
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
// These three tables hold the same weights as the ARGB tables with the byte
// order of each pixel reversed (the zero weight comes first).
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
// These three tables hold the same weights as the ARGB tables with the first
// and third byte of each pixel swapped.
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
// Bias of 16 added to every output byte (paddb) after the Y weights have been
// applied and the result packed to bytes.
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
// Bias of 128 added to every output byte (paddb) after the U/V weights have
// been applied and the result packed to signed bytes.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
// The tables below are pshufb control vectors: byte i of the result is copied
// from the source byte whose index is stored at position i.  An index with the
// high bit set (128u) produces a zero byte instead.
//
// In this table every fourth index (12..15) is only a placeholder: the
// routine that uses it ORs 0xff into that byte afterwards.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
// Same as the RGB24 table, but the three bytes of each pixel are reversed.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
// Swaps bytes 0 and 2 of every 4-byte pixel; bytes 1 and 3 stay in place.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
// Reverses the four bytes of every pixel.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000101// Shuffle table for converting RGBA to ARGB.
// Rotates each pixel by one byte: source bytes 1,2,3,0 become bytes 0,1,2,3.
102CONST uvec8 kShuffleMaskRGBAToARGB = {
103 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
104};
105
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000106// Shuffle table for converting ARGB to RGBA.
// Inverse rotation: source bytes 3,0,1,2 become bytes 0,1,2,3.
107CONST uvec8 kShuffleMaskARGBToRGBA = {
108 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
109};
110
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000111// Shuffle table for converting ARGB to RGB24.
// Packs the first three bytes of each of four pixels into the low 12 bytes;
// the top four result bytes are zeroed (index 128u).
112CONST uvec8 kShuffleMaskARGBToRGB24 = {
113 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
114};
115
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000116// Shuffle table for converting ARGB to RAW.
// Same packing as the RGB24 table with the three kept bytes of each pixel
// reversed.
117CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000118 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000119};
120
// Expands a row of single bytes at src_y into 4-byte pixels at dst_argb.
// Each source byte Y is written as the bytes Y, Y, Y, 0xff (memory order).
//   src_y    - source bytes; read 8 at a time with movq (no alignment needed).
//   dst_argb - destination; written with movdqa, so it must be 16-byte aligned.
//   pix      - pixel count.  The loop handles 8 pixels per pass, always runs
//              at least once and repeats while the remaining count is
//              positive, so a count that is not a multiple of 8 is rounded up.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000121void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000122 asm volatile (
// xmm5 = 0xff000000 in every dword (all ones shifted left by 24 bits).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000123 "pcmpeqb %%xmm5,%%xmm5 \n"
124 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000125 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000126 "1: \n"
127 "movq (%0),%%xmm0 \n"
128 "lea 0x8(%0),%0 \n"
// Duplicate every byte into a word, then every word into a dword: Y -> YYYY.
129 "punpcklbw %%xmm0,%%xmm0 \n"
130 "movdqa %%xmm0,%%xmm1 \n"
131 "punpcklwd %%xmm0,%%xmm0 \n"
132 "punpckhwd %%xmm1,%%xmm1 \n"
// Force the top byte of every pixel to 0xff.
133 "por %%xmm5,%%xmm0 \n"
134 "por %%xmm5,%%xmm1 \n"
135 "movdqa %%xmm0,(%1) \n"
136 "movdqa %%xmm1,0x10(%1) \n"
137 "lea 0x20(%1),%1 \n"
138 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000139 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000140 : "+r"(src_y), // %0
141 "+r"(dst_argb), // %1
142 "+r"(pix) // %2
143 :
144 : "memory", "cc"
145#if defined(__SSE2__)
146 , "xmm0", "xmm1", "xmm5"
147#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000148 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000149}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000150
// Converts a row of 4-byte pixels by permuting the bytes of each pixel with
// kShuffleMaskABGRToARGB (bytes 0 and 2 of every pixel are swapped).
//   src_abgr - source pixels; loaded with movdqa (must be 16-byte aligned).
//   dst_argb - destination; stored with movdqa (must be 16-byte aligned).
//   pix      - pixel count.  4 pixels (16 bytes) are handled per pass; the loop
//              always runs once and repeats while the remaining count is
//              positive.
151void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000152 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000153 "movdqa %3,%%xmm5 \n"
// Turn %1 into (dst - src) so that advancing %0 alone moves both rows.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000154 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000155 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000156 "1: \n"
157 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000158 "pshufb %%xmm5,%%xmm0 \n"
// The jg below tests the flags set by this sub; the movdqa and lea in between
// do not modify flags.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000159 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000160 "movdqa %%xmm0,(%0,%1,1) \n"
161 "lea 0x10(%0),%0 \n"
162 "jg 1b \n"
163
fbarchard@google.comb6149762011-11-07 21:58:52 +0000164 : "+r"(src_abgr), // %0
165 "+r"(dst_argb), // %1
166 "+r"(pix) // %2
167 : "m"(kShuffleMaskABGRToARGB) // %3
168 : "memory", "cc"
169#if defined(__SSE2__)
170 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000171#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000172 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173}
174
// Converts a row of 4-byte pixels by permuting the bytes of each pixel with
// kShuffleMaskBGRAToARGB (the four bytes of every pixel are reversed).
//   src_bgra - source pixels; loaded with movdqa (must be 16-byte aligned).
//   dst_argb - destination; stored with movdqa (must be 16-byte aligned).
//   pix      - pixel count.  4 pixels (16 bytes) are handled per pass; the loop
//              always runs once and repeats while the remaining count is
//              positive.
175void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000176 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000177 "movdqa %3,%%xmm5 \n"
// Turn %1 into (dst - src) so that advancing %0 alone moves both rows.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000178 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000179 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000180 "1: \n"
181 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000182 "pshufb %%xmm5,%%xmm0 \n"
// The jg below tests the flags set by this sub; the movdqa and lea in between
// do not modify flags.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000183 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000184 "movdqa %%xmm0,(%0,%1,1) \n"
185 "lea 0x10(%0),%0 \n"
186 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000187 : "+r"(src_bgra), // %0
188 "+r"(dst_argb), // %1
189 "+r"(pix) // %2
190 : "m"(kShuffleMaskBGRAToARGB) // %3
191 : "memory", "cc"
192#if defined(__SSE2__)
193 , "xmm0", "xmm5"
194#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000195 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000196}
197
// Converts a row of 4-byte pixels by permuting the bytes of each pixel with
// kShuffleMaskRGBAToARGB (source bytes 1,2,3,0 become bytes 0,1,2,3).
//   src_rgba - source pixels; loaded with movdqa (must be 16-byte aligned).
//   dst_argb - destination; stored with movdqa (must be 16-byte aligned).
//   pix      - pixel count.  4 pixels (16 bytes) are handled per pass; the loop
//              always runs once and repeats while the remaining count is
//              positive.
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000198void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
199 asm volatile (
200 "movdqa %3,%%xmm5 \n"
// Turn %1 into (dst - src) so that advancing %0 alone moves both rows.
201 "sub %0,%1 \n"
202 ".p2align 4 \n"
203 "1: \n"
204 "movdqa (%0),%%xmm0 \n"
205 "pshufb %%xmm5,%%xmm0 \n"
// The jg below tests the flags set by this sub; the movdqa and lea in between
// do not modify flags.
206 "sub $0x4,%2 \n"
207 "movdqa %%xmm0,(%0,%1,1) \n"
208 "lea 0x10(%0),%0 \n"
209 "jg 1b \n"
210
211 : "+r"(src_rgba), // %0
212 "+r"(dst_argb), // %1
213 "+r"(pix) // %2
214 : "m"(kShuffleMaskRGBAToARGB) // %3
215 : "memory", "cc"
216#if defined(__SSE2__)
217 , "xmm0", "xmm5"
218#endif
219 );
220}
221
// Converts a row of 4-byte pixels by permuting the bytes of each pixel with
// kShuffleMaskARGBToRGBA (source bytes 3,0,1,2 become bytes 0,1,2,3).
//   src_argb - source pixels; loaded with movdqa (must be 16-byte aligned).
//   dst_rgba - destination; stored with movdqa (must be 16-byte aligned).
//   pix      - pixel count.  4 pixels (16 bytes) are handled per pass; the loop
//              always runs once and repeats while the remaining count is
//              positive.
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000222void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
223 asm volatile (
224 "movdqa %3,%%xmm5 \n"
// Turn %1 into (dst - src) so that advancing %0 alone moves both rows.
225 "sub %0,%1 \n"
226 ".p2align 4 \n"
227 "1: \n"
228 "movdqa (%0),%%xmm0 \n"
229 "pshufb %%xmm5,%%xmm0 \n"
// The jg below tests the flags set by this sub; the movdqa and lea in between
// do not modify flags.
230 "sub $0x4,%2 \n"
231 "movdqa %%xmm0,(%0,%1,1) \n"
232 "lea 0x10(%0),%0 \n"
233 "jg 1b \n"
234
235 : "+r"(src_argb), // %0
236 "+r"(dst_rgba), // %1
237 "+r"(pix) // %2
238 : "m"(kShuffleMaskARGBToRGBA) // %3
239 : "memory", "cc"
240#if defined(__SSE2__)
241 , "xmm0", "xmm5"
242#endif
243 );
244}
245
// Expands a row of 3-byte pixels into 4-byte pixels.  The three source bytes
// keep their order (per kShuffleMaskRGB24ToARGB) and the added fourth byte of
// every pixel is set to 0xff.
//   src_rgb24 - source; read with movdqu, so no alignment is required.
//   dst_argb  - destination; written with movdqa (must be 16-byte aligned).
//   pix       - pixel count.  16 pixels (48 bytes in, 64 bytes out) are handled
//               per pass; the loop always runs once and repeats while the
//               remaining count is positive.
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000246void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000247 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000248 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
249 "pslld $0x18,%%xmm5 \n"
250 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000251 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000252 "1: \n"
253 "movdqu (%0),%%xmm0 \n"
254 "movdqu 0x10(%0),%%xmm1 \n"
255 "movdqu 0x20(%0),%%xmm3 \n"
256 "lea 0x30(%0),%0 \n"
// Each palignr brings one 12-byte group (4 source pixels) down to the low end
// of a register; pshufb then spreads it to 4 bytes per pixel and por sets the
// fourth byte.  xmm2 receives source bytes 24..39 (pixels 8..11).
257 "movdqa %%xmm3,%%xmm2 \n"
258 "palignr $0x8,%%xmm1,%%xmm2 \n"
259 "pshufb %%xmm4,%%xmm2 \n"
260 "por %%xmm5,%%xmm2 \n"
// xmm1 receives source bytes 12..27 (pixels 4..7); xmm0 already starts at
// pixel 0.
261 "palignr $0xc,%%xmm0,%%xmm1 \n"
262 "pshufb %%xmm4,%%xmm0 \n"
263 "movdqa %%xmm2,0x20(%1) \n"
264 "por %%xmm5,%%xmm0 \n"
265 "pshufb %%xmm4,%%xmm1 \n"
266 "movdqa %%xmm0,(%1) \n"
267 "por %%xmm5,%%xmm1 \n"
// Rotate xmm3 right by 4 bytes so source bytes 36..47 (pixels 12..15) sit in
// its low 12 bytes.
268 "palignr $0x4,%%xmm3,%%xmm3 \n"
269 "pshufb %%xmm4,%%xmm3 \n"
270 "movdqa %%xmm1,0x10(%1) \n"
271 "por %%xmm5,%%xmm3 \n"
// The jg below tests the flags set by this sub; movdqa and lea leave them
// untouched.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000272 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000273 "movdqa %%xmm3,0x30(%1) \n"
274 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000275 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000276 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000277 "+r"(dst_argb), // %1
278 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000279 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000280 : "memory", "cc"
281#if defined(__SSE2__)
282 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
283#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000284 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000285}
286
// Expands a row of 3-byte pixels into 4-byte pixels.  The three source bytes
// of every pixel are reversed (per kShuffleMaskRAWToARGB) and the added fourth
// byte is set to 0xff.  The instruction sequence is the same as
// RGB24ToARGBRow_SSSE3; only the shuffle table differs.
//   src_raw  - source; read with movdqu, so no alignment is required.
//   dst_argb - destination; written with movdqa (must be 16-byte aligned).
//   pix      - pixel count.  16 pixels (48 bytes in, 64 bytes out) are handled
//              per pass; the loop always runs once and repeats while the
//              remaining count is positive.
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000287void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000288 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000289 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
290 "pslld $0x18,%%xmm5 \n"
291 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000292 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000293 "1: \n"
294 "movdqu (%0),%%xmm0 \n"
295 "movdqu 0x10(%0),%%xmm1 \n"
296 "movdqu 0x20(%0),%%xmm3 \n"
297 "lea 0x30(%0),%0 \n"
// xmm2 receives source bytes 24..39 (pixels 8..11) in its low 12 bytes.
298 "movdqa %%xmm3,%%xmm2 \n"
299 "palignr $0x8,%%xmm1,%%xmm2 \n"
300 "pshufb %%xmm4,%%xmm2 \n"
301 "por %%xmm5,%%xmm2 \n"
// xmm1 receives source bytes 12..27 (pixels 4..7).
302 "palignr $0xc,%%xmm0,%%xmm1 \n"
303 "pshufb %%xmm4,%%xmm0 \n"
304 "movdqa %%xmm2,0x20(%1) \n"
305 "por %%xmm5,%%xmm0 \n"
306 "pshufb %%xmm4,%%xmm1 \n"
307 "movdqa %%xmm0,(%1) \n"
308 "por %%xmm5,%%xmm1 \n"
// Rotate xmm3 right by 4 bytes so source bytes 36..47 (pixels 12..15) sit in
// its low 12 bytes.
309 "palignr $0x4,%%xmm3,%%xmm3 \n"
310 "pshufb %%xmm4,%%xmm3 \n"
311 "movdqa %%xmm1,0x10(%1) \n"
312 "por %%xmm5,%%xmm3 \n"
// The jg below tests the flags set by this sub; movdqa and lea leave them
// untouched.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000313 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000314 "movdqa %%xmm3,0x30(%1) \n"
315 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000316 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000317 : "+r"(src_raw), // %0
318 "+r"(dst_argb), // %1
319 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000320 : "m"(kShuffleMaskRAWToARGB) // %3
321 : "memory", "cc"
322#if defined(__SSE2__)
323 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
324#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000325 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000326}
327
// Expands a row of 16-bit pixels (5-bit, 6-bit and 5-bit fields) into 4-byte
// pixels.  For each 16-bit pixel the output bytes are, in memory order: the
// low 5-bit field, the middle 6-bit field, the top 5-bit field (each widened
// to 8 bits by replicating its high bits into the low bits) and 0xff.
//   src - source; read with movdqu, so no alignment is required.
//   dst - destination; written with movdqa (must be 16-byte aligned).
//   pix - pixel count.  8 pixels (16 bytes in, 32 bytes out) are handled per
//         pass; the loop always runs once and repeats while the remaining
//         count is positive.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000328void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000329 asm volatile (
// xmm5 = 0x0108 in every word: pmulhuw by it widens a 5-bit field held in the
// top of a word to 8 bits.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000330 "mov $0x1080108,%%eax \n"
331 "movd %%eax,%%xmm5 \n"
332 "pshufd $0x0,%%xmm5,%%xmm5 \n"
// xmm6 = 0x2080 in every word: pmulhuw by it widens the 6-bit field held in
// bits 10..5 to 8 bits.
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000333 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000334 "movd %%eax,%%xmm6 \n"
335 "pshufd $0x0,%%xmm6,%%xmm6 \n"
// Per-word masks: xmm3 = 0xf800 (top 5 bits), xmm4 = 0x07e0 (middle 6 bits),
// xmm7 = 0xff00 (the constant fourth byte).
336 "pcmpeqb %%xmm3,%%xmm3 \n"
337 "psllw $0xb,%%xmm3 \n"
338 "pcmpeqb %%xmm4,%%xmm4 \n"
339 "psllw $0xa,%%xmm4 \n"
340 "psrlw $0x5,%%xmm4 \n"
341 "pcmpeqb %%xmm7,%%xmm7 \n"
342 "psllw $0x8,%%xmm7 \n"
// %1 becomes dst - 2*src, so (%1,%0,2) addresses the destination while only
// %0 is advanced (output grows twice as fast as input).
343 "sub %0,%1 \n"
344 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000345 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000346 "1: \n"
347 "movdqu (%0),%%xmm0 \n"
348 "movdqa %%xmm0,%%xmm1 \n"
349 "movdqa %%xmm0,%%xmm2 \n"
// xmm1: top field -> high byte; xmm2: low field -> low byte of each word.
350 "pand %%xmm3,%%xmm1 \n"
351 "psllw $0xb,%%xmm2 \n"
352 "pmulhuw %%xmm5,%%xmm1 \n"
353 "pmulhuw %%xmm5,%%xmm2 \n"
354 "psllw $0x8,%%xmm1 \n"
355 "por %%xmm2,%%xmm1 \n"
// xmm0: middle field -> low byte, 0xff -> high byte of each word.
356 "pand %%xmm4,%%xmm0 \n"
357 "pmulhuw %%xmm6,%%xmm0 \n"
358 "por %%xmm7,%%xmm0 \n"
// Interleave the bytes of xmm1 and xmm0 to form the 4-byte output pixels.
359 "movdqa %%xmm1,%%xmm2 \n"
360 "punpcklbw %%xmm0,%%xmm1 \n"
361 "punpckhbw %%xmm0,%%xmm2 \n"
362 "movdqa %%xmm1,(%1,%0,2) \n"
363 "movdqa %%xmm2,0x10(%1,%0,2) \n"
364 "lea 0x10(%0),%0 \n"
365 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000366 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000367 : "+r"(src), // %0
368 "+r"(dst), // %1
369 "+r"(pix) // %2
370 :
371 : "memory", "cc", "eax"
372#if defined(__SSE2__)
373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
374#endif
375 );
376}
377
// Expands a row of 16-bit pixels (one flag bit plus three 5-bit fields) into
// 4-byte pixels.  For each 16-bit pixel the output bytes are, in memory order:
// bits 4..0, bits 9..5 and bits 14..10 (each widened to 8 bits by replicating
// the high bits into the low bits), then 0xff if bit 15 is set, else 0x00.
//   src - source; read with movdqu, so no alignment is required.
//   dst - destination; written with movdqa (must be 16-byte aligned).
//   pix - pixel count.  8 pixels (16 bytes in, 32 bytes out) are handled per
//         pass; the loop always runs once and repeats while the remaining
//         count is positive.
378void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000379 asm volatile (
// xmm5 = 0x0108 in every word: pmulhuw by it widens a 5-bit field held in the
// top of a word to 8 bits.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000380 "mov $0x1080108,%%eax \n"
381 "movd %%eax,%%xmm5 \n"
382 "pshufd $0x0,%%xmm5,%%xmm5 \n"
// xmm6 = 0x4200 in every word: pmulhuw by it widens the 5-bit field held in
// bits 9..5 to 8 bits.
383 "mov $0x42004200,%%eax \n"
384 "movd %%eax,%%xmm6 \n"
385 "pshufd $0x0,%%xmm6,%%xmm6 \n"
// Per-word masks: xmm3 = 0xf800, xmm4 = 0x03e0 (bits 9..5), xmm7 = 0xff00.
386 "pcmpeqb %%xmm3,%%xmm3 \n"
387 "psllw $0xb,%%xmm3 \n"
388 "movdqa %%xmm3,%%xmm4 \n"
389 "psrlw $0x6,%%xmm4 \n"
390 "pcmpeqb %%xmm7,%%xmm7 \n"
391 "psllw $0x8,%%xmm7 \n"
// %1 becomes dst - 2*src, so (%1,%0,2) addresses the destination while only
// %0 is advanced.
392 "sub %0,%1 \n"
393 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000394 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000395 "1: \n"
396 "movdqu (%0),%%xmm0 \n"
397 "movdqa %%xmm0,%%xmm1 \n"
398 "movdqa %%xmm0,%%xmm2 \n"
// xmm1: bits 14..10 moved to the top of the word; xmm2: bits 4..0 moved to the
// top of the word.  After widening they become the high and low byte of xmm1.
399 "psllw $0x1,%%xmm1 \n"
400 "psllw $0xb,%%xmm2 \n"
401 "pand %%xmm3,%%xmm1 \n"
402 "pmulhuw %%xmm5,%%xmm2 \n"
403 "pmulhuw %%xmm5,%%xmm1 \n"
404 "psllw $0x8,%%xmm1 \n"
405 "por %%xmm2,%%xmm1 \n"
// xmm0: bits 9..5 widened into the low byte.  The arithmetic shift copies
// bit 15 across the high byte, which the 0xff00 mask then isolates.
406 "movdqa %%xmm0,%%xmm2 \n"
407 "pand %%xmm4,%%xmm0 \n"
408 "psraw $0x8,%%xmm2 \n"
409 "pmulhuw %%xmm6,%%xmm0 \n"
410 "pand %%xmm7,%%xmm2 \n"
411 "por %%xmm2,%%xmm0 \n"
// Interleave the bytes of xmm1 and xmm0 to form the 4-byte output pixels.
412 "movdqa %%xmm1,%%xmm2 \n"
413 "punpcklbw %%xmm0,%%xmm1 \n"
414 "punpckhbw %%xmm0,%%xmm2 \n"
415 "movdqa %%xmm1,(%1,%0,2) \n"
416 "movdqa %%xmm2,0x10(%1,%0,2) \n"
417 "lea 0x10(%0),%0 \n"
418 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000419 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000420 : "+r"(src), // %0
421 "+r"(dst), // %1
422 "+r"(pix) // %2
423 :
424 : "memory", "cc", "eax"
425#if defined(__SSE2__)
426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
427#endif
428 );
429}
430
// Expands a row of 16-bit pixels holding four 4-bit fields into 4-byte
// pixels.  Every nibble is widened to a byte by repeating it in both halves
// (0xN -> 0xNN); for each source byte the low nibble is emitted first, then
// the high nibble.
//   src - source; read with movdqu, so no alignment is required.
//   dst - destination; written with movdqa (must be 16-byte aligned).
//   pix - pixel count.  8 pixels (16 bytes in, 32 bytes out) are handled per
//         pass; the loop always runs once and repeats while the remaining
//         count is positive.
431void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000432 asm volatile (
// xmm4 = 0x0f in every byte (low-nibble mask); xmm5 = 0xf0 in every byte.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000433 "mov $0xf0f0f0f,%%eax \n"
434 "movd %%eax,%%xmm4 \n"
435 "pshufd $0x0,%%xmm4,%%xmm4 \n"
436 "movdqa %%xmm4,%%xmm5 \n"
437 "pslld $0x4,%%xmm5 \n"
// %1 becomes dst - 2*src, so (%1,%0,2) addresses the destination while only
// %0 is advanced.
438 "sub %0,%1 \n"
439 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000440 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000441 "1: \n"
442 "movdqu (%0),%%xmm0 \n"
443 "movdqa %%xmm0,%%xmm2 \n"
// xmm0 keeps the low nibbles, xmm2 the high nibbles; each is ORed with a copy
// shifted by 4 bits so the nibble fills the whole byte.
444 "pand %%xmm4,%%xmm0 \n"
445 "pand %%xmm5,%%xmm2 \n"
446 "movdqa %%xmm0,%%xmm1 \n"
447 "movdqa %%xmm2,%%xmm3 \n"
448 "psllw $0x4,%%xmm1 \n"
449 "psrlw $0x4,%%xmm3 \n"
450 "por %%xmm1,%%xmm0 \n"
451 "por %%xmm3,%%xmm2 \n"
// Interleave the widened low-nibble and high-nibble bytes.
452 "movdqa %%xmm0,%%xmm1 \n"
453 "punpcklbw %%xmm2,%%xmm0 \n"
454 "punpckhbw %%xmm2,%%xmm1 \n"
455 "movdqa %%xmm0,(%1,%0,2) \n"
456 "movdqa %%xmm1,0x10(%1,%0,2) \n"
457 "lea 0x10(%0),%0 \n"
458 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000459 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000460 : "+r"(src), // %0
461 "+r"(dst), // %1
462 "+r"(pix) // %2
463 :
464 : "memory", "cc", "eax"
465#if defined(__SSE2__)
466 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
467#endif
468 );
469}
470
// Packs a row of 4-byte pixels into 3-byte pixels by dropping the fourth byte
// of every pixel (per kShuffleMaskARGBToRGB24).
//   src - source; loaded with movdqa (must be 16-byte aligned).
//   dst - destination; stored with movdqa (must be 16-byte aligned).
//   pix - pixel count.  16 pixels (64 bytes in, 48 bytes out) are handled per
//         pass; the loop always runs once and repeats while the remaining
//         count is positive.
471void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000472 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000473 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000474 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000475 "1: \n"
476 "movdqa (%0),%%xmm0 \n"
477 "movdqa 0x10(%0),%%xmm1 \n"
478 "movdqa 0x20(%0),%%xmm2 \n"
479 "movdqa 0x30(%0),%%xmm3 \n"
480 "lea 0x40(%0),%0 \n"
// After the shuffles each register holds 12 packed bytes in its low end and
// 4 zero bytes on top.
481 "pshufb %%xmm6,%%xmm0 \n"
482 "pshufb %%xmm6,%%xmm1 \n"
483 "pshufb %%xmm6,%%xmm2 \n"
484 "pshufb %%xmm6,%%xmm3 \n"
// Output 0: 12 bytes of group 0 plus the first 4 bytes of group 1.
485 "movdqa %%xmm1,%%xmm4 \n"
486 "psrldq $0x4,%%xmm1 \n"
487 "pslldq $0xc,%%xmm4 \n"
488 "movdqa %%xmm2,%%xmm5 \n"
489 "por %%xmm4,%%xmm0 \n"
// Output 1: remaining 8 bytes of group 1 plus the first 8 bytes of group 2.
490 "pslldq $0x8,%%xmm5 \n"
491 "movdqa %%xmm0,(%1) \n"
492 "por %%xmm5,%%xmm1 \n"
// Output 2: remaining 4 bytes of group 2 plus all 12 bytes of group 3.
493 "psrldq $0x8,%%xmm2 \n"
494 "pslldq $0x4,%%xmm3 \n"
495 "por %%xmm3,%%xmm2 \n"
496 "movdqa %%xmm1,0x10(%1) \n"
497 "movdqa %%xmm2,0x20(%1) \n"
498 "lea 0x30(%1),%1 \n"
499 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000500 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000501 : "+r"(src), // %0
502 "+r"(dst), // %1
503 "+r"(pix) // %2
504 : "m"(kShuffleMaskARGBToRGB24) // %3
505 : "memory", "cc"
506#if defined(__SSE2__)
507 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
508#endif
509 );
510}
511
// Packs a row of 4-byte pixels into 3-byte pixels, dropping the fourth byte of
// every pixel and reversing the order of the three kept bytes (per
// kShuffleMaskARGBToRAW).  The instruction sequence is the same as
// ARGBToRGB24Row_SSSE3; only the shuffle table differs.
//   src - source; loaded with movdqa (must be 16-byte aligned).
//   dst - destination; stored with movdqa (must be 16-byte aligned).
//   pix - pixel count.  16 pixels (64 bytes in, 48 bytes out) are handled per
//         pass; the loop always runs once and repeats while the remaining
//         count is positive.
512void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000513 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000514 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000515 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000516 "1: \n"
517 "movdqa (%0),%%xmm0 \n"
518 "movdqa 0x10(%0),%%xmm1 \n"
519 "movdqa 0x20(%0),%%xmm2 \n"
520 "movdqa 0x30(%0),%%xmm3 \n"
521 "lea 0x40(%0),%0 \n"
// After the shuffles each register holds 12 packed bytes in its low end and
// 4 zero bytes on top.
522 "pshufb %%xmm6,%%xmm0 \n"
523 "pshufb %%xmm6,%%xmm1 \n"
524 "pshufb %%xmm6,%%xmm2 \n"
525 "pshufb %%xmm6,%%xmm3 \n"
// Output 0: 12 bytes of group 0 plus the first 4 bytes of group 1.
526 "movdqa %%xmm1,%%xmm4 \n"
527 "psrldq $0x4,%%xmm1 \n"
528 "pslldq $0xc,%%xmm4 \n"
529 "movdqa %%xmm2,%%xmm5 \n"
530 "por %%xmm4,%%xmm0 \n"
// Output 1: remaining 8 bytes of group 1 plus the first 8 bytes of group 2.
531 "pslldq $0x8,%%xmm5 \n"
532 "movdqa %%xmm0,(%1) \n"
533 "por %%xmm5,%%xmm1 \n"
// Output 2: remaining 4 bytes of group 2 plus all 12 bytes of group 3.
534 "psrldq $0x8,%%xmm2 \n"
535 "pslldq $0x4,%%xmm3 \n"
536 "por %%xmm3,%%xmm2 \n"
537 "movdqa %%xmm1,0x10(%1) \n"
538 "movdqa %%xmm2,0x20(%1) \n"
539 "lea 0x30(%1),%1 \n"
540 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000541 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000542 : "+r"(src), // %0
543 "+r"(dst), // %1
544 "+r"(pix) // %2
545 : "m"(kShuffleMaskARGBToRAW) // %3
546 : "memory", "cc"
547#if defined(__SSE2__)
548 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
549#endif
550 );
551}
552
// Packs a row of 4-byte pixels into 16-bit pixels.  From each source pixel the
// top 5 bits of byte 0 go to bits 4..0, the top 6 bits of byte 1 go to bits
// 10..5 and the top 5 bits of byte 2 go to bits 15..11; byte 3 is discarded.
//   src - source; loaded with movdqa (must be 16-byte aligned).
//   dst - destination; written 8 bytes at a time with movq (no alignment
//         needed).
//   pix - pixel count.  4 pixels are handled per pass; the loop always runs
//         once and repeats while the remaining count is positive.
553void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000554 asm volatile (
// Per-dword masks: xmm3 = 0x0000001f, xmm4 = 0x000007e0, xmm5 = 0xfffff800.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000555 "pcmpeqb %%xmm3,%%xmm3 \n"
556 "psrld $0x1b,%%xmm3 \n"
557 "pcmpeqb %%xmm4,%%xmm4 \n"
558 "psrld $0x1a,%%xmm4 \n"
559 "pslld $0x5,%%xmm4 \n"
560 "pcmpeqb %%xmm5,%%xmm5 \n"
561 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000562 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000563 "1: \n"
564 "movdqa (%0),%%xmm0 \n"
565 "movdqa %%xmm0,%%xmm1 \n"
566 "movdqa %%xmm0,%%xmm2 \n"
// xmm0: shift byte 3 out, then shift right arithmetically so byte 2 lands in
// bits 15..8 with sign extension above it.  The sign extension keeps the value
// unchanged through the signed saturation of packssdw below.
567 "pslld $0x8,%%xmm0 \n"
568 "psrld $0x3,%%xmm1 \n"
569 "psrld $0x5,%%xmm2 \n"
570 "psrad $0x10,%%xmm0 \n"
571 "pand %%xmm3,%%xmm1 \n"
572 "pand %%xmm4,%%xmm2 \n"
573 "pand %%xmm5,%%xmm0 \n"
574 "por %%xmm2,%%xmm1 \n"
575 "por %%xmm1,%%xmm0 \n"
576 "packssdw %%xmm0,%%xmm0 \n"
577 "lea 0x10(%0),%0 \n"
578 "movq %%xmm0,(%1) \n"
579 "lea 0x8(%1),%1 \n"
580 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000581 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000582 : "+r"(src), // %0
583 "+r"(dst), // %1
584 "+r"(pix) // %2
585 :
586 : "memory", "cc"
587#if defined(__SSE2__)
588 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
589#endif
590 );
591}
592
// Packs a row of 4-byte pixels into 16-bit pixels.  From each source pixel the
// top 5 bits of byte 0 go to bits 4..0, the top 5 bits of byte 1 to bits 9..5,
// the top 5 bits of byte 2 to bits 14..10 and the top bit of byte 3 to bit 15.
//   src - source; loaded with movdqa (must be 16-byte aligned).
//   dst - destination; written 8 bytes at a time with movq (no alignment
//         needed).
//   pix - pixel count.  4 pixels are handled per pass; the loop always runs
//         once and repeats while the remaining count is positive.
593void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000594 asm volatile (
// Per-dword masks: xmm4 = 0x0000001f, xmm5 = 0x000003e0, xmm6 = 0x00007c00,
// xmm7 = 0xffff8000.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000595 "pcmpeqb %%xmm4,%%xmm4 \n"
596 "psrld $0x1b,%%xmm4 \n"
597 "movdqa %%xmm4,%%xmm5 \n"
598 "pslld $0x5,%%xmm5 \n"
599 "movdqa %%xmm4,%%xmm6 \n"
600 "pslld $0xa,%%xmm6 \n"
601 "pcmpeqb %%xmm7,%%xmm7 \n"
602 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000603 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000604 "1: \n"
605 "movdqa (%0),%%xmm0 \n"
606 "movdqa %%xmm0,%%xmm1 \n"
607 "movdqa %%xmm0,%%xmm2 \n"
608 "movdqa %%xmm0,%%xmm3 \n"
// xmm0: the arithmetic shift moves bit 31 to bit 15 and sign-extends above it,
// which keeps the value unchanged through the signed saturation of packssdw.
609 "psrad $0x10,%%xmm0 \n"
610 "psrld $0x3,%%xmm1 \n"
611 "psrld $0x6,%%xmm2 \n"
612 "psrld $0x9,%%xmm3 \n"
613 "pand %%xmm7,%%xmm0 \n"
614 "pand %%xmm4,%%xmm1 \n"
615 "pand %%xmm5,%%xmm2 \n"
616 "pand %%xmm6,%%xmm3 \n"
617 "por %%xmm1,%%xmm0 \n"
618 "por %%xmm3,%%xmm2 \n"
619 "por %%xmm2,%%xmm0 \n"
620 "packssdw %%xmm0,%%xmm0 \n"
621 "lea 0x10(%0),%0 \n"
622 "movq %%xmm0,(%1) \n"
623 "lea 0x8(%1),%1 \n"
624 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000625 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000626 : "+r"(src), // %0
627 "+r"(dst), // %1
628 "+r"(pix) // %2
629 :
630 : "memory", "cc"
631#if defined(__SSE2__)
632 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
633#endif
634 );
635}
636
// Packs a row of 4-byte pixels into 16-bit pixels by keeping the high nibble
// of every source byte.  Each pair of source bytes becomes one output byte:
// the first byte's high nibble in the low 4 bits, the second byte's high
// nibble in the high 4 bits.
//   src - source; loaded with movdqa (must be 16-byte aligned).
//   dst - destination; written 8 bytes at a time with movq (no alignment
//         needed).
//   pix - pixel count.  4 pixels are handled per pass; the loop always runs
//         once and repeats while the remaining count is positive.
// NOTE(review): xmm2 is listed as clobbered but no instruction here uses it;
// the extra entry is harmless.
637void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000638 asm volatile (
// Per-word masks: xmm4 = 0xf000, xmm3 = 0x00f0.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000639 "pcmpeqb %%xmm4,%%xmm4 \n"
640 "psllw $0xc,%%xmm4 \n"
641 "movdqa %%xmm4,%%xmm3 \n"
642 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000643 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000644 "1: \n"
645 "movdqa (%0),%%xmm0 \n"
646 "movdqa %%xmm0,%%xmm1 \n"
647 "pand %%xmm3,%%xmm0 \n"
648 "pand %%xmm4,%%xmm1 \n"
// The masked nibbles stay inside their own word when shifted, so the 64-bit
// shifts behave like per-word shifts here.
649 "psrlq $0x4,%%xmm0 \n"
650 "psrlq $0x8,%%xmm1 \n"
651 "por %%xmm1,%%xmm0 \n"
// Every word is now <= 0xff, so the unsigned saturating pack just narrows it.
652 "packuswb %%xmm0,%%xmm0 \n"
653 "lea 0x10(%0),%0 \n"
654 "movq %%xmm0,(%1) \n"
655 "lea 0x8(%1),%1 \n"
656 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000657 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000658 : "+r"(src), // %0
659 "+r"(dst), // %1
660 "+r"(pix) // %2
661 :
662 : "memory", "cc"
663#if defined(__SSE2__)
664 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
665#endif
666 );
667}
668
// Computes one output byte per 4-byte source pixel as
//   ((13*b0 + 65*b1 + 33*b2) >> 7) + 16
// where b0..b2 are the first three bytes of the pixel in memory order (the
// weights come from kARGBToY, the bias from kAddY16).
//   src_argb - source pixels; loaded with movdqa (must be 16-byte aligned).
//   dst_y    - destination; stored with movdqa (must be 16-byte aligned).
//   pix      - pixel count.  16 pixels (64 bytes in, 16 bytes out) are handled
//              per pass; the loop always runs once and repeats while the
//              remaining count is positive.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000669void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000670 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000671 "movdqa %4,%%xmm5 \n"
672 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000673 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000674 "1: \n"
675 "movdqa (%0),%%xmm0 \n"
676 "movdqa 0x10(%0),%%xmm1 \n"
677 "movdqa 0x20(%0),%%xmm2 \n"
678 "movdqa 0x30(%0),%%xmm3 \n"
// pmaddubsw multiplies the unsigned pixel bytes by the signed weights and adds
// adjacent pairs; phaddw then adds the two partial sums of each pixel.
679 "pmaddubsw %%xmm4,%%xmm0 \n"
680 "pmaddubsw %%xmm4,%%xmm1 \n"
681 "pmaddubsw %%xmm4,%%xmm2 \n"
682 "pmaddubsw %%xmm4,%%xmm3 \n"
683 "lea 0x40(%0),%0 \n"
684 "phaddw %%xmm1,%%xmm0 \n"
685 "phaddw %%xmm3,%%xmm2 \n"
686 "psrlw $0x7,%%xmm0 \n"
687 "psrlw $0x7,%%xmm2 \n"
688 "packuswb %%xmm2,%%xmm0 \n"
689 "paddb %%xmm5,%%xmm0 \n"
// The jg below tests the flags set by this sub; movdqa and lea leave them
// untouched.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000690 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000691 "movdqa %%xmm0,(%1) \n"
692 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000693 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000694 : "+r"(src_argb), // %0
695 "+r"(dst_y), // %1
696 "+r"(pix) // %2
697 : "m"(kARGBToY), // %3
698 "m"(kAddY16) // %4
699 : "memory", "cc"
700#if defined(__SSE2__)
701 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
702#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000703 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000704}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000705
// Same computation as ARGBToYRow_SSSE3 -- one output byte per 4-byte pixel,
//   ((13*b0 + 65*b1 + 33*b2) >> 7) + 16
// -- but the pixel loads and the result store use movdqu, so neither src_argb
// nor dst_y needs to be 16-byte aligned.
//   src_argb - source pixels.
//   dst_y    - destination bytes.
//   pix      - pixel count.  16 pixels (64 bytes in, 16 bytes out) are handled
//              per pass; the loop always runs once and repeats while the
//              remaining count is positive.
706void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000707 asm volatile (
// The constant tables are still loaded with movdqa.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000708 "movdqa %4,%%xmm5 \n"
709 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000710 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000711 "1: \n"
712 "movdqu (%0),%%xmm0 \n"
713 "movdqu 0x10(%0),%%xmm1 \n"
714 "movdqu 0x20(%0),%%xmm2 \n"
715 "movdqu 0x30(%0),%%xmm3 \n"
// pmaddubsw multiplies the unsigned pixel bytes by the signed weights and adds
// adjacent pairs; phaddw then adds the two partial sums of each pixel.
716 "pmaddubsw %%xmm4,%%xmm0 \n"
717 "pmaddubsw %%xmm4,%%xmm1 \n"
718 "pmaddubsw %%xmm4,%%xmm2 \n"
719 "pmaddubsw %%xmm4,%%xmm3 \n"
720 "lea 0x40(%0),%0 \n"
721 "phaddw %%xmm1,%%xmm0 \n"
722 "phaddw %%xmm3,%%xmm2 \n"
723 "psrlw $0x7,%%xmm0 \n"
724 "psrlw $0x7,%%xmm2 \n"
725 "packuswb %%xmm2,%%xmm0 \n"
726 "paddb %%xmm5,%%xmm0 \n"
// The jg below tests the flags set by this sub; movdqu and lea leave them
// untouched.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000727 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000728 "movdqu %%xmm0,(%1) \n"
729 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000730 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000731 : "+r"(src_argb), // %0
732 "+r"(dst_y), // %1
733 "+r"(pix) // %2
734 : "m"(kARGBToY), // %3
735 "m"(kAddY16) // %4
736 : "memory", "cc"
737#if defined(__SSE2__)
738 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
739#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000740 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000741}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000742
fbarchard@google.com714050a2012-02-17 22:59:56 +0000743// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000744// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
745// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
746// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000747// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000748void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
749 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000750 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000751 "movdqa %0,%%xmm4 \n"
752 "movdqa %1,%%xmm3 \n"
753 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000754 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000755 : "m"(kARGBToU), // %0
756 "m"(kARGBToV), // %1
757 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000758 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000759 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000760 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000761 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000762 "1: \n"
763 "movdqa (%0),%%xmm0 \n"
764 "movdqa 0x10(%0),%%xmm1 \n"
765 "movdqa 0x20(%0),%%xmm2 \n"
766 "movdqa 0x30(%0),%%xmm6 \n"
767 "pavgb (%0,%4,1),%%xmm0 \n"
768 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
769 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
770 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
771 "lea 0x40(%0),%0 \n"
772 "movdqa %%xmm0,%%xmm7 \n"
773 "shufps $0x88,%%xmm1,%%xmm0 \n"
774 "shufps $0xdd,%%xmm1,%%xmm7 \n"
775 "pavgb %%xmm7,%%xmm0 \n"
776 "movdqa %%xmm2,%%xmm7 \n"
777 "shufps $0x88,%%xmm6,%%xmm2 \n"
778 "shufps $0xdd,%%xmm6,%%xmm7 \n"
779 "pavgb %%xmm7,%%xmm2 \n"
780 "movdqa %%xmm0,%%xmm1 \n"
781 "movdqa %%xmm2,%%xmm6 \n"
782 "pmaddubsw %%xmm4,%%xmm0 \n"
783 "pmaddubsw %%xmm4,%%xmm2 \n"
784 "pmaddubsw %%xmm3,%%xmm1 \n"
785 "pmaddubsw %%xmm3,%%xmm6 \n"
786 "phaddw %%xmm2,%%xmm0 \n"
787 "phaddw %%xmm6,%%xmm1 \n"
788 "psraw $0x8,%%xmm0 \n"
789 "psraw $0x8,%%xmm1 \n"
790 "packsswb %%xmm1,%%xmm0 \n"
791 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000792 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000793 "movlps %%xmm0,(%1) \n"
794 "movhps %%xmm0,(%1,%2,1) \n"
795 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000796 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000797 : "+r"(src_argb0), // %0
798 "+r"(dst_u), // %1
799 "+r"(dst_v), // %2
800 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000801 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000802 : "memory", "cc"
803#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000804 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000805#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000806 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000807}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000808
809void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
810 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000811 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000812 "movdqa %0,%%xmm4 \n"
813 "movdqa %1,%%xmm3 \n"
814 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000815 :
816 : "m"(kARGBToU), // %0
817 "m"(kARGBToV), // %1
818 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000819 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000820 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000821 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000822 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000823 "1: \n"
824 "movdqu (%0),%%xmm0 \n"
825 "movdqu 0x10(%0),%%xmm1 \n"
826 "movdqu 0x20(%0),%%xmm2 \n"
827 "movdqu 0x30(%0),%%xmm6 \n"
828 "movdqu (%0,%4,1),%%xmm7 \n"
829 "pavgb %%xmm7,%%xmm0 \n"
830 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
831 "pavgb %%xmm7,%%xmm1 \n"
832 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
833 "pavgb %%xmm7,%%xmm2 \n"
834 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
835 "pavgb %%xmm7,%%xmm6 \n"
836 "lea 0x40(%0),%0 \n"
837 "movdqa %%xmm0,%%xmm7 \n"
838 "shufps $0x88,%%xmm1,%%xmm0 \n"
839 "shufps $0xdd,%%xmm1,%%xmm7 \n"
840 "pavgb %%xmm7,%%xmm0 \n"
841 "movdqa %%xmm2,%%xmm7 \n"
842 "shufps $0x88,%%xmm6,%%xmm2 \n"
843 "shufps $0xdd,%%xmm6,%%xmm7 \n"
844 "pavgb %%xmm7,%%xmm2 \n"
845 "movdqa %%xmm0,%%xmm1 \n"
846 "movdqa %%xmm2,%%xmm6 \n"
847 "pmaddubsw %%xmm4,%%xmm0 \n"
848 "pmaddubsw %%xmm4,%%xmm2 \n"
849 "pmaddubsw %%xmm3,%%xmm1 \n"
850 "pmaddubsw %%xmm3,%%xmm6 \n"
851 "phaddw %%xmm2,%%xmm0 \n"
852 "phaddw %%xmm6,%%xmm1 \n"
853 "psraw $0x8,%%xmm0 \n"
854 "psraw $0x8,%%xmm1 \n"
855 "packsswb %%xmm1,%%xmm0 \n"
856 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000857 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000858 "movlps %%xmm0,(%1) \n"
859 "movhps %%xmm0,(%1,%2,1) \n"
860 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000861 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000862 : "+r"(src_argb0), // %0
863 "+r"(dst_u), // %1
864 "+r"(dst_v), // %2
865 "+rm"(width) // %3
866 : "r"(static_cast<intptr_t>(src_stride_argb))
867 : "memory", "cc"
868#if defined(__SSE2__)
869 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
870#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000871 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000872}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000873
fbarchard@google.com714050a2012-02-17 22:59:56 +0000874void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000875 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000876 "movdqa %4,%%xmm5 \n"
877 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000878 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000879 "1: \n"
880 "movdqa (%0),%%xmm0 \n"
881 "movdqa 0x10(%0),%%xmm1 \n"
882 "movdqa 0x20(%0),%%xmm2 \n"
883 "movdqa 0x30(%0),%%xmm3 \n"
884 "pmaddubsw %%xmm4,%%xmm0 \n"
885 "pmaddubsw %%xmm4,%%xmm1 \n"
886 "pmaddubsw %%xmm4,%%xmm2 \n"
887 "pmaddubsw %%xmm4,%%xmm3 \n"
888 "lea 0x40(%0),%0 \n"
889 "phaddw %%xmm1,%%xmm0 \n"
890 "phaddw %%xmm3,%%xmm2 \n"
891 "psrlw $0x7,%%xmm0 \n"
892 "psrlw $0x7,%%xmm2 \n"
893 "packuswb %%xmm2,%%xmm0 \n"
894 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000895 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000896 "movdqa %%xmm0,(%1) \n"
897 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000898 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000899 : "+r"(src_bgra), // %0
900 "+r"(dst_y), // %1
901 "+r"(pix) // %2
902 : "m"(kBGRAToY), // %3
903 "m"(kAddY16) // %4
904 : "memory", "cc"
905#if defined(__SSE2__)
906 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000907#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000908 );
909}
910
911void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000912 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000913 "movdqa %4,%%xmm5 \n"
914 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000915 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000916 "1: \n"
917 "movdqu (%0),%%xmm0 \n"
918 "movdqu 0x10(%0),%%xmm1 \n"
919 "movdqu 0x20(%0),%%xmm2 \n"
920 "movdqu 0x30(%0),%%xmm3 \n"
921 "pmaddubsw %%xmm4,%%xmm0 \n"
922 "pmaddubsw %%xmm4,%%xmm1 \n"
923 "pmaddubsw %%xmm4,%%xmm2 \n"
924 "pmaddubsw %%xmm4,%%xmm3 \n"
925 "lea 0x40(%0),%0 \n"
926 "phaddw %%xmm1,%%xmm0 \n"
927 "phaddw %%xmm3,%%xmm2 \n"
928 "psrlw $0x7,%%xmm0 \n"
929 "psrlw $0x7,%%xmm2 \n"
930 "packuswb %%xmm2,%%xmm0 \n"
931 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000932 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000933 "movdqu %%xmm0,(%1) \n"
934 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000935 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000936 : "+r"(src_bgra), // %0
937 "+r"(dst_y), // %1
938 "+r"(pix) // %2
939 : "m"(kBGRAToY), // %3
940 "m"(kAddY16) // %4
941 : "memory", "cc"
942#if defined(__SSE2__)
943 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
944#endif
945 );
946}
947
948void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
949 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000950 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000951 "movdqa %0,%%xmm4 \n"
952 "movdqa %1,%%xmm3 \n"
953 "movdqa %2,%%xmm5 \n"
954 :
955 : "m"(kBGRAToU), // %0
956 "m"(kBGRAToV), // %1
957 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000958 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000959 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000960 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000961 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000962 "1: \n"
963 "movdqa (%0),%%xmm0 \n"
964 "movdqa 0x10(%0),%%xmm1 \n"
965 "movdqa 0x20(%0),%%xmm2 \n"
966 "movdqa 0x30(%0),%%xmm6 \n"
967 "pavgb (%0,%4,1),%%xmm0 \n"
968 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
969 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
970 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
971 "lea 0x40(%0),%0 \n"
972 "movdqa %%xmm0,%%xmm7 \n"
973 "shufps $0x88,%%xmm1,%%xmm0 \n"
974 "shufps $0xdd,%%xmm1,%%xmm7 \n"
975 "pavgb %%xmm7,%%xmm0 \n"
976 "movdqa %%xmm2,%%xmm7 \n"
977 "shufps $0x88,%%xmm6,%%xmm2 \n"
978 "shufps $0xdd,%%xmm6,%%xmm7 \n"
979 "pavgb %%xmm7,%%xmm2 \n"
980 "movdqa %%xmm0,%%xmm1 \n"
981 "movdqa %%xmm2,%%xmm6 \n"
982 "pmaddubsw %%xmm4,%%xmm0 \n"
983 "pmaddubsw %%xmm4,%%xmm2 \n"
984 "pmaddubsw %%xmm3,%%xmm1 \n"
985 "pmaddubsw %%xmm3,%%xmm6 \n"
986 "phaddw %%xmm2,%%xmm0 \n"
987 "phaddw %%xmm6,%%xmm1 \n"
988 "psraw $0x8,%%xmm0 \n"
989 "psraw $0x8,%%xmm1 \n"
990 "packsswb %%xmm1,%%xmm0 \n"
991 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000992 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000993 "movlps %%xmm0,(%1) \n"
994 "movhps %%xmm0,(%1,%2,1) \n"
995 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000996 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000997 : "+r"(src_bgra0), // %0
998 "+r"(dst_u), // %1
999 "+r"(dst_v), // %2
1000 "+rm"(width) // %3
1001 : "r"(static_cast<intptr_t>(src_stride_bgra))
1002 : "memory", "cc"
1003#if defined(__SSE2__)
1004 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1005#endif
1006 );
1007}
1008
1009void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1010 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001011 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001012 "movdqa %0,%%xmm4 \n"
1013 "movdqa %1,%%xmm3 \n"
1014 "movdqa %2,%%xmm5 \n"
1015 :
1016 : "m"(kBGRAToU), // %0
1017 "m"(kBGRAToV), // %1
1018 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001019 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001020 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001021 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001022 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001023 "1: \n"
1024 "movdqu (%0),%%xmm0 \n"
1025 "movdqu 0x10(%0),%%xmm1 \n"
1026 "movdqu 0x20(%0),%%xmm2 \n"
1027 "movdqu 0x30(%0),%%xmm6 \n"
1028 "movdqu (%0,%4,1),%%xmm7 \n"
1029 "pavgb %%xmm7,%%xmm0 \n"
1030 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1031 "pavgb %%xmm7,%%xmm1 \n"
1032 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1033 "pavgb %%xmm7,%%xmm2 \n"
1034 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1035 "pavgb %%xmm7,%%xmm6 \n"
1036 "lea 0x40(%0),%0 \n"
1037 "movdqa %%xmm0,%%xmm7 \n"
1038 "shufps $0x88,%%xmm1,%%xmm0 \n"
1039 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1040 "pavgb %%xmm7,%%xmm0 \n"
1041 "movdqa %%xmm2,%%xmm7 \n"
1042 "shufps $0x88,%%xmm6,%%xmm2 \n"
1043 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1044 "pavgb %%xmm7,%%xmm2 \n"
1045 "movdqa %%xmm0,%%xmm1 \n"
1046 "movdqa %%xmm2,%%xmm6 \n"
1047 "pmaddubsw %%xmm4,%%xmm0 \n"
1048 "pmaddubsw %%xmm4,%%xmm2 \n"
1049 "pmaddubsw %%xmm3,%%xmm1 \n"
1050 "pmaddubsw %%xmm3,%%xmm6 \n"
1051 "phaddw %%xmm2,%%xmm0 \n"
1052 "phaddw %%xmm6,%%xmm1 \n"
1053 "psraw $0x8,%%xmm0 \n"
1054 "psraw $0x8,%%xmm1 \n"
1055 "packsswb %%xmm1,%%xmm0 \n"
1056 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001057 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001058 "movlps %%xmm0,(%1) \n"
1059 "movhps %%xmm0,(%1,%2,1) \n"
1060 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001061 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001062 : "+r"(src_bgra0), // %0
1063 "+r"(dst_u), // %1
1064 "+r"(dst_v), // %2
1065 "+rm"(width) // %3
1066 : "r"(static_cast<intptr_t>(src_stride_bgra))
1067 : "memory", "cc"
1068#if defined(__SSE2__)
1069 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1070#endif
1071 );
1072}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001073
1074void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001075 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001076 "movdqa %4,%%xmm5 \n"
1077 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001078 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001079 "1: \n"
1080 "movdqa (%0),%%xmm0 \n"
1081 "movdqa 0x10(%0),%%xmm1 \n"
1082 "movdqa 0x20(%0),%%xmm2 \n"
1083 "movdqa 0x30(%0),%%xmm3 \n"
1084 "pmaddubsw %%xmm4,%%xmm0 \n"
1085 "pmaddubsw %%xmm4,%%xmm1 \n"
1086 "pmaddubsw %%xmm4,%%xmm2 \n"
1087 "pmaddubsw %%xmm4,%%xmm3 \n"
1088 "lea 0x40(%0),%0 \n"
1089 "phaddw %%xmm1,%%xmm0 \n"
1090 "phaddw %%xmm3,%%xmm2 \n"
1091 "psrlw $0x7,%%xmm0 \n"
1092 "psrlw $0x7,%%xmm2 \n"
1093 "packuswb %%xmm2,%%xmm0 \n"
1094 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001095 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001096 "movdqa %%xmm0,(%1) \n"
1097 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001098 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001099 : "+r"(src_abgr), // %0
1100 "+r"(dst_y), // %1
1101 "+r"(pix) // %2
1102 : "m"(kABGRToY), // %3
1103 "m"(kAddY16) // %4
1104 : "memory", "cc"
1105#if defined(__SSE2__)
1106 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1107#endif
1108 );
1109}
1110
1111void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001112 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001113 "movdqa %4,%%xmm5 \n"
1114 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001115 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001116 "1: \n"
1117 "movdqu (%0),%%xmm0 \n"
1118 "movdqu 0x10(%0),%%xmm1 \n"
1119 "movdqu 0x20(%0),%%xmm2 \n"
1120 "movdqu 0x30(%0),%%xmm3 \n"
1121 "pmaddubsw %%xmm4,%%xmm0 \n"
1122 "pmaddubsw %%xmm4,%%xmm1 \n"
1123 "pmaddubsw %%xmm4,%%xmm2 \n"
1124 "pmaddubsw %%xmm4,%%xmm3 \n"
1125 "lea 0x40(%0),%0 \n"
1126 "phaddw %%xmm1,%%xmm0 \n"
1127 "phaddw %%xmm3,%%xmm2 \n"
1128 "psrlw $0x7,%%xmm0 \n"
1129 "psrlw $0x7,%%xmm2 \n"
1130 "packuswb %%xmm2,%%xmm0 \n"
1131 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001132 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001133 "movdqu %%xmm0,(%1) \n"
1134 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001135 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001136 : "+r"(src_abgr), // %0
1137 "+r"(dst_y), // %1
1138 "+r"(pix) // %2
1139 : "m"(kABGRToY), // %3
1140 "m"(kAddY16) // %4
1141 : "memory", "cc"
1142#if defined(__SSE2__)
1143 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1144#endif
1145 );
1146}
1147
1148void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1149 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001150 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001151 "movdqa %0,%%xmm4 \n"
1152 "movdqa %1,%%xmm3 \n"
1153 "movdqa %2,%%xmm5 \n"
1154 :
1155 : "m"(kABGRToU), // %0
1156 "m"(kABGRToV), // %1
1157 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001158 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001159 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001160 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001161 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001162 "1: \n"
1163 "movdqa (%0),%%xmm0 \n"
1164 "movdqa 0x10(%0),%%xmm1 \n"
1165 "movdqa 0x20(%0),%%xmm2 \n"
1166 "movdqa 0x30(%0),%%xmm6 \n"
1167 "pavgb (%0,%4,1),%%xmm0 \n"
1168 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1169 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1170 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1171 "lea 0x40(%0),%0 \n"
1172 "movdqa %%xmm0,%%xmm7 \n"
1173 "shufps $0x88,%%xmm1,%%xmm0 \n"
1174 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1175 "pavgb %%xmm7,%%xmm0 \n"
1176 "movdqa %%xmm2,%%xmm7 \n"
1177 "shufps $0x88,%%xmm6,%%xmm2 \n"
1178 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1179 "pavgb %%xmm7,%%xmm2 \n"
1180 "movdqa %%xmm0,%%xmm1 \n"
1181 "movdqa %%xmm2,%%xmm6 \n"
1182 "pmaddubsw %%xmm4,%%xmm0 \n"
1183 "pmaddubsw %%xmm4,%%xmm2 \n"
1184 "pmaddubsw %%xmm3,%%xmm1 \n"
1185 "pmaddubsw %%xmm3,%%xmm6 \n"
1186 "phaddw %%xmm2,%%xmm0 \n"
1187 "phaddw %%xmm6,%%xmm1 \n"
1188 "psraw $0x8,%%xmm0 \n"
1189 "psraw $0x8,%%xmm1 \n"
1190 "packsswb %%xmm1,%%xmm0 \n"
1191 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001192 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001193 "movlps %%xmm0,(%1) \n"
1194 "movhps %%xmm0,(%1,%2,1) \n"
1195 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001196 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001197 : "+r"(src_abgr0), // %0
1198 "+r"(dst_u), // %1
1199 "+r"(dst_v), // %2
1200 "+rm"(width) // %3
1201 : "r"(static_cast<intptr_t>(src_stride_abgr))
1202 : "memory", "cc"
1203#if defined(__SSE2__)
1204 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1205#endif
1206 );
1207}
1208
1209void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1210 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001211 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001212 "movdqa %0,%%xmm4 \n"
1213 "movdqa %1,%%xmm3 \n"
1214 "movdqa %2,%%xmm5 \n"
1215 :
1216 : "m"(kABGRToU), // %0
1217 "m"(kABGRToV), // %1
1218 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001219 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001220 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001221 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001222 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001223 "1: \n"
1224 "movdqu (%0),%%xmm0 \n"
1225 "movdqu 0x10(%0),%%xmm1 \n"
1226 "movdqu 0x20(%0),%%xmm2 \n"
1227 "movdqu 0x30(%0),%%xmm6 \n"
1228 "movdqu (%0,%4,1),%%xmm7 \n"
1229 "pavgb %%xmm7,%%xmm0 \n"
1230 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1231 "pavgb %%xmm7,%%xmm1 \n"
1232 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1233 "pavgb %%xmm7,%%xmm2 \n"
1234 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1235 "pavgb %%xmm7,%%xmm6 \n"
1236 "lea 0x40(%0),%0 \n"
1237 "movdqa %%xmm0,%%xmm7 \n"
1238 "shufps $0x88,%%xmm1,%%xmm0 \n"
1239 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1240 "pavgb %%xmm7,%%xmm0 \n"
1241 "movdqa %%xmm2,%%xmm7 \n"
1242 "shufps $0x88,%%xmm6,%%xmm2 \n"
1243 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1244 "pavgb %%xmm7,%%xmm2 \n"
1245 "movdqa %%xmm0,%%xmm1 \n"
1246 "movdqa %%xmm2,%%xmm6 \n"
1247 "pmaddubsw %%xmm4,%%xmm0 \n"
1248 "pmaddubsw %%xmm4,%%xmm2 \n"
1249 "pmaddubsw %%xmm3,%%xmm1 \n"
1250 "pmaddubsw %%xmm3,%%xmm6 \n"
1251 "phaddw %%xmm2,%%xmm0 \n"
1252 "phaddw %%xmm6,%%xmm1 \n"
1253 "psraw $0x8,%%xmm0 \n"
1254 "psraw $0x8,%%xmm1 \n"
1255 "packsswb %%xmm1,%%xmm0 \n"
1256 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001257 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001258 "movlps %%xmm0,(%1) \n"
1259 "movhps %%xmm0,(%1,%2,1) \n"
1260 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001261 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001262 : "+r"(src_abgr0), // %0
1263 "+r"(dst_u), // %1
1264 "+r"(dst_v), // %2
1265 "+rm"(width) // %3
1266 : "r"(static_cast<intptr_t>(src_stride_abgr))
1267 : "memory", "cc"
1268#if defined(__SSE2__)
1269 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1270#endif
1271 );
1272}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001273#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001274
fbarchard@google.come214fe32012-06-04 23:47:11 +00001275#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients in fixed point, scaled by 64.
// Negative values and compound expressions are parenthesized so the macros
// expand safely inside any surrounding expression.
#define UB 127 /* 2.018 * 64 = 129 exceeds int8, so saturate to 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the weighted U/V sum evaluated at U = V = 128.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001290
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001291struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001292 vec8 kUVToB; // 0
1293 vec8 kUVToG; // 16
1294 vec8 kUVToR; // 32
1295 vec16 kUVBiasB; // 48
1296 vec16 kUVBiasG; // 64
1297 vec16 kUVBiasR; // 80
1298 vec16 kYSub16; // 96
1299 vec16 kYToRgb; // 112
1300 vec8 kVUToB; // 128
1301 vec8 kVUToG; // 144
1302 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001303} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001304 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1305 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1306 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1307 { BB, BB, BB, BB, BB, BB, BB, BB },
1308 { BG, BG, BG, BG, BG, BG, BG, BG },
1309 { BR, BR, BR, BR, BR, BR, BR, BR },
1310 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001311 { YG, YG, YG, YG, YG, YG, YG, YG },
1312 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1313 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1314 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001315};
1316
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001317
// Read 8 UV from 444: loads 8 U bytes and 8 V bytes (V is addressed as
// u_buf + v_buf, so callers must first turn v_buf into an offset from u_buf),
// advances u_buf by 8 and interleaves them into 8 U,V byte pairs in xmm0.
// Clobbers xmm1.
#define READYUV444 \
    "movq (%[u_buf]),%%xmm0 \n" \
    "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x8(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV: loads 4 U bytes and 4 V bytes (V is
// addressed as u_buf + v_buf, so callers must first turn v_buf into an offset
// from u_buf), advances u_buf by 4, interleaves them into U,V pairs and
// duplicates each pair so xmm0 holds 8 pairs. Clobbers xmm1.
#define READYUV422 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x4(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Read 2 UV from 411, upsample to 8 UV: loads 2 U bytes and 2 V bytes (V is
// addressed as u_buf + v_buf, so callers must first turn v_buf into an offset
// from u_buf), advances u_buf by 2, interleaves them into U,V pairs and
// replicates each pair four times so xmm0 holds 8 pairs. Clobbers xmm1.
//
// pinsrw loads exactly 16 bits, so nothing past the two bytes consumed per
// pass is read from either plane. The upper lanes of xmm0/xmm1 keep their
// previous contents after pinsrw, but the unpack sequence only propagates the
// low two bytes of each register, so those stale lanes never reach the result.
#define READYUV411 \
    "pinsrw $0x0,(%[u_buf]),%%xmm0 \n" \
    "pinsrw $0x0,(%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x2(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n"

// Read 4 UV from NV12, upsample to 8 UV: loads 8 bytes of already interleaved
// chroma pairs from uv_buf, advances uv_buf by 8 and duplicates each 16-bit
// pair so xmm0 holds 8 pairs.
#define READNV12 \
    "movq (%[uv_buf]),%%xmm0 \n" \
    "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Convert 8 pixels: 8 UV and 8 Y.
// Expects 8 U,V byte pairs in xmm0 and xmm4 == 0 on entry; reads 8 Y bytes
// from y_buf and advances it by 8. On exit the low 8 bytes of xmm0, xmm1 and
// xmm2 hold the results computed with the kUVToB, kUVToG and kUVToR weights
// respectively. Clobbers xmm3. The numeric displacements are byte offsets
// into kYuvConstants.
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    /* Weighted U/V sums per channel (kUVToB / kUVToG / kUVToR). */ \
    "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
    /* Subtract the per-channel bias (kUVBiasB / kUVBiasG / kUVBiasR). */ \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    /* Widen 8 Y bytes to words, subtract kYSub16, scale by kYToRgb. */ \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    /* Add luma, drop the 6 fractional bits, saturate to unsigned bytes. */ \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

// Convert 8 pixels: 8 VU and 8 Y.
// Expects 8 V,U byte pairs in xmm0 and xmm4 == 0 on entry; reads 8 Y bytes
// from y_buf and advances it by 8. On exit the low 8 bytes of xmm0, xmm1 and
// xmm2 hold the results computed with the kVUToB, kVUToG and kVUToR weights
// respectively. Clobbers xmm3. The numeric displacements are byte offsets
// into kYuvConstants.
#define YVUTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    /* Weighted V/U sums per channel (kVUToB / kVUToG / kVUToR). */ \
    "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
    /* Subtract the per-channel bias (kUVBiasB / kUVBiasG / kUVBiasR). */ \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    /* Widen 8 Y bytes to words, subtract kYSub16, scale by kYToRgb. */ \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    /* Add luma, drop the 6 fractional bits, saturate to unsigned bytes. */ \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

1398void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001399 const uint8* u_buf,
1400 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001401 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001402 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001403 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001404 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001405 "pcmpeqb %%xmm5,%%xmm5 \n"
1406 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001407 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001408 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001409 READYUV444
1410 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001411 "punpcklbw %%xmm1,%%xmm0 \n"
1412 "punpcklbw %%xmm5,%%xmm2 \n"
1413 "movdqa %%xmm0,%%xmm1 \n"
1414 "punpcklwd %%xmm2,%%xmm0 \n"
1415 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001416 "movdqa %%xmm0,(%[argb_buf]) \n"
1417 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1418 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1419 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001420 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001421 : [y_buf]"+r"(y_buf), // %[y_buf]
1422 [u_buf]"+r"(u_buf), // %[u_buf]
1423 [v_buf]"+r"(v_buf), // %[v_buf]
1424 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1425 [width]"+rm"(width) // %[width]
1426 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001427 : "memory", "cc"
1428#if defined(__SSE2__)
1429 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1430#endif
1431 );
1432}
1433
fbarchard@google.come214fe32012-06-04 23:47:11 +00001434void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001435 const uint8* u_buf,
1436 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001437 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001438 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001439 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001440 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001441 "pcmpeqb %%xmm5,%%xmm5 \n"
1442 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001443 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001444 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001445 READYUV422
1446 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001447 "punpcklbw %%xmm1,%%xmm0 \n"
1448 "punpcklbw %%xmm5,%%xmm2 \n"
1449 "movdqa %%xmm0,%%xmm1 \n"
1450 "punpcklwd %%xmm2,%%xmm0 \n"
1451 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001452 "movdqa %%xmm0,(%[argb_buf]) \n"
1453 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1454 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1455 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001456 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001457 : [y_buf]"+r"(y_buf), // %[y_buf]
1458 [u_buf]"+r"(u_buf), // %[u_buf]
1459 [v_buf]"+r"(v_buf), // %[v_buf]
1460 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1461 [width]"+rm"(width) // %[width]
1462 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001463 : "memory", "cc"
1464#if defined(__SSE2__)
1465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1466#endif
1467 );
1468}
1469
1470void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1471 const uint8* u_buf,
1472 const uint8* v_buf,
1473 uint8* argb_buf,
1474 int width) {
1475 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001476 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001477 "pcmpeqb %%xmm5,%%xmm5 \n"
1478 "pxor %%xmm4,%%xmm4 \n"
1479 ".p2align 4 \n"
1480 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001481 READYUV411
1482 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001483 "punpcklbw %%xmm1,%%xmm0 \n"
1484 "punpcklbw %%xmm5,%%xmm2 \n"
1485 "movdqa %%xmm0,%%xmm1 \n"
1486 "punpcklwd %%xmm2,%%xmm0 \n"
1487 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001488 "movdqa %%xmm0,(%[argb_buf]) \n"
1489 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1490 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1491 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001492 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001493 : [y_buf]"+r"(y_buf), // %[y_buf]
1494 [u_buf]"+r"(u_buf), // %[u_buf]
1495 [v_buf]"+r"(v_buf), // %[v_buf]
1496 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1497 [width]"+rm"(width) // %[width]
1498 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1499 : "memory", "cc"
1500#if defined(__SSE2__)
1501 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1502#endif
1503 );
1504}
1505
1506void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1507 const uint8* uv_buf,
1508 uint8* argb_buf,
1509 int width) {
1510 asm volatile (
1511 "pcmpeqb %%xmm5,%%xmm5 \n"
1512 "pxor %%xmm4,%%xmm4 \n"
1513 ".p2align 4 \n"
1514 "1: \n"
1515 READNV12
1516 YUVTORGB
1517 "punpcklbw %%xmm1,%%xmm0 \n"
1518 "punpcklbw %%xmm5,%%xmm2 \n"
1519 "movdqa %%xmm0,%%xmm1 \n"
1520 "punpcklwd %%xmm2,%%xmm0 \n"
1521 "punpckhwd %%xmm2,%%xmm1 \n"
1522 "movdqa %%xmm0,(%[argb_buf]) \n"
1523 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1524 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1525 "sub $0x8,%[width] \n"
1526 "jg 1b \n"
1527 : [y_buf]"+r"(y_buf), // %[y_buf]
1528 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1529 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1530 [width]"+rm"(width) // %[width]
1531 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1532 : "memory", "cc"
1533#if defined(__SSE2__)
1534 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1535#endif
1536 );
1537}
1538
1539void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1540 const uint8* vu_buf,
1541 uint8* argb_buf,
1542 int width) {
1543 asm volatile (
1544 "pcmpeqb %%xmm5,%%xmm5 \n"
1545 "pxor %%xmm4,%%xmm4 \n"
1546 ".p2align 4 \n"
1547 "1: \n"
1548 READNV12
1549 YVUTORGB
1550 "punpcklbw %%xmm1,%%xmm0 \n"
1551 "punpcklbw %%xmm5,%%xmm2 \n"
1552 "movdqa %%xmm0,%%xmm1 \n"
1553 "punpcklwd %%xmm2,%%xmm0 \n"
1554 "punpckhwd %%xmm2,%%xmm1 \n"
1555 "movdqa %%xmm0,(%[argb_buf]) \n"
1556 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1557 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1558 "sub $0x8,%[width] \n"
1559 "jg 1b \n"
1560 : [y_buf]"+r"(y_buf), // %[y_buf]
1561 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1562 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1563 [width]"+rm"(width) // %[width]
1564 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001565 : "memory", "cc"
1566#if defined(__SSE2__)
1567 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1568#endif
1569 );
1570}
1571
1572void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1573 const uint8* u_buf,
1574 const uint8* v_buf,
1575 uint8* argb_buf,
1576 int width) {
1577 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001578 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001579 "pcmpeqb %%xmm5,%%xmm5 \n"
1580 "pxor %%xmm4,%%xmm4 \n"
1581 ".p2align 4 \n"
1582 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001583 READYUV444
1584 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001585 "punpcklbw %%xmm1,%%xmm0 \n"
1586 "punpcklbw %%xmm5,%%xmm2 \n"
1587 "movdqa %%xmm0,%%xmm1 \n"
1588 "punpcklwd %%xmm2,%%xmm0 \n"
1589 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001590 "movdqu %%xmm0,(%[argb_buf]) \n"
1591 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1592 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1593 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001594 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001595 : [y_buf]"+r"(y_buf), // %[y_buf]
1596 [u_buf]"+r"(u_buf), // %[u_buf]
1597 [v_buf]"+r"(v_buf), // %[v_buf]
1598 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1599 [width]"+rm"(width) // %[width]
1600 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001601 : "memory", "cc"
1602#if defined(__SSE2__)
1603 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1604#endif
1605 );
1606}
1607
1608void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1609 const uint8* u_buf,
1610 const uint8* v_buf,
1611 uint8* argb_buf,
1612 int width) {
1613 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001614 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001615 "pcmpeqb %%xmm5,%%xmm5 \n"
1616 "pxor %%xmm4,%%xmm4 \n"
1617 ".p2align 4 \n"
1618 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001619 READYUV422
1620 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001621 "punpcklbw %%xmm1,%%xmm0 \n"
1622 "punpcklbw %%xmm5,%%xmm2 \n"
1623 "movdqa %%xmm0,%%xmm1 \n"
1624 "punpcklwd %%xmm2,%%xmm0 \n"
1625 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001626 "movdqu %%xmm0,(%[argb_buf]) \n"
1627 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1628 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1629 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001630 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001631 : [y_buf]"+r"(y_buf), // %[y_buf]
1632 [u_buf]"+r"(u_buf), // %[u_buf]
1633 [v_buf]"+r"(v_buf), // %[v_buf]
1634 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1635 [width]"+rm"(width) // %[width]
1636 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001637 : "memory", "cc"
1638#if defined(__SSE2__)
1639 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1640#endif
1641 );
1642}
1643
1644void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1645 const uint8* u_buf,
1646 const uint8* v_buf,
1647 uint8* argb_buf,
1648 int width) {
1649 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001650 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001651 "pcmpeqb %%xmm5,%%xmm5 \n"
1652 "pxor %%xmm4,%%xmm4 \n"
1653 ".p2align 4 \n"
1654 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001655 READYUV411
1656 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001657 "punpcklbw %%xmm1,%%xmm0 \n"
1658 "punpcklbw %%xmm5,%%xmm2 \n"
1659 "movdqa %%xmm0,%%xmm1 \n"
1660 "punpcklwd %%xmm2,%%xmm0 \n"
1661 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001662 "movdqu %%xmm0,(%[argb_buf]) \n"
1663 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1664 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1665 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001666 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001667 : [y_buf]"+r"(y_buf), // %[y_buf]
1668 [u_buf]"+r"(u_buf), // %[u_buf]
1669 [v_buf]"+r"(v_buf), // %[v_buf]
1670 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1671 [width]"+rm"(width) // %[width]
1672 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1673 : "memory", "cc"
1674#if defined(__SSE2__)
1675 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1676#endif
1677 );
1678}
1679
1680void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1681 const uint8* uv_buf,
1682 uint8* argb_buf,
1683 int width) {
1684 asm volatile (
1685 "pcmpeqb %%xmm5,%%xmm5 \n"
1686 "pxor %%xmm4,%%xmm4 \n"
1687 ".p2align 4 \n"
1688 "1: \n"
1689 READNV12
1690 YUVTORGB
1691 "punpcklbw %%xmm1,%%xmm0 \n"
1692 "punpcklbw %%xmm5,%%xmm2 \n"
1693 "movdqa %%xmm0,%%xmm1 \n"
1694 "punpcklwd %%xmm2,%%xmm0 \n"
1695 "punpckhwd %%xmm2,%%xmm1 \n"
1696 "movdqu %%xmm0,(%[argb_buf]) \n"
1697 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1698 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1699 "sub $0x8,%[width] \n"
1700 "jg 1b \n"
1701 : [y_buf]"+r"(y_buf), // %[y_buf]
1702 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1703 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1704 [width]"+rm"(width) // %[width]
1705 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1706 : "memory", "cc"
1707#if defined(__SSE2__)
1708 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1709#endif
1710 );
1711}
1712
1713void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1714 const uint8* vu_buf,
1715 uint8* argb_buf,
1716 int width) {
1717 asm volatile (
1718 "pcmpeqb %%xmm5,%%xmm5 \n"
1719 "pxor %%xmm4,%%xmm4 \n"
1720 ".p2align 4 \n"
1721 "1: \n"
1722 READNV12
1723 YVUTORGB
1724 "punpcklbw %%xmm1,%%xmm0 \n"
1725 "punpcklbw %%xmm5,%%xmm2 \n"
1726 "movdqa %%xmm0,%%xmm1 \n"
1727 "punpcklwd %%xmm2,%%xmm0 \n"
1728 "punpckhwd %%xmm2,%%xmm1 \n"
1729 "movdqu %%xmm0,(%[argb_buf]) \n"
1730 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1731 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1732 "sub $0x8,%[width] \n"
1733 "jg 1b \n"
1734 : [y_buf]"+r"(y_buf), // %[y_buf]
1735 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1736 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1737 [width]"+rm"(width) // %[width]
1738 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001739 : "memory", "cc"
1740#if defined(__SSE2__)
1741 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1742#endif
1743 );
1744}
1745
1746void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1747 const uint8* u_buf,
1748 const uint8* v_buf,
1749 uint8* bgra_buf,
1750 int width) {
1751 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001752 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001753 "pcmpeqb %%xmm5,%%xmm5 \n"
1754 "pxor %%xmm4,%%xmm4 \n"
1755 ".p2align 4 \n"
1756 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001757 READYUV422
1758 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001759 "pcmpeqb %%xmm5,%%xmm5 \n"
1760 "punpcklbw %%xmm0,%%xmm1 \n"
1761 "punpcklbw %%xmm2,%%xmm5 \n"
1762 "movdqa %%xmm5,%%xmm0 \n"
1763 "punpcklwd %%xmm1,%%xmm5 \n"
1764 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001765 "movdqa %%xmm5,(%[argb_buf]) \n"
1766 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1767 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1768 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001769 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001770 : [y_buf]"+r"(y_buf), // %[y_buf]
1771 [u_buf]"+r"(u_buf), // %[u_buf]
1772 [v_buf]"+r"(v_buf), // %[v_buf]
1773 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1774 [width]"+rm"(width) // %[width]
1775 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001776 : "memory", "cc"
1777#if defined(__SSE2__)
1778 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1779#endif
1780 );
1781}
1782
fbarchard@google.come214fe32012-06-04 23:47:11 +00001783void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001784 const uint8* u_buf,
1785 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001786 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001787 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001788 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001789 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001790 "pcmpeqb %%xmm5,%%xmm5 \n"
1791 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001792 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001793 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001794 READYUV422
1795 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001796 "punpcklbw %%xmm1,%%xmm2 \n"
1797 "punpcklbw %%xmm5,%%xmm0 \n"
1798 "movdqa %%xmm2,%%xmm1 \n"
1799 "punpcklwd %%xmm0,%%xmm2 \n"
1800 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001801 "movdqa %%xmm2,(%[argb_buf]) \n"
1802 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1803 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1804 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001805 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001806 : [y_buf]"+r"(y_buf), // %[y_buf]
1807 [u_buf]"+r"(u_buf), // %[u_buf]
1808 [v_buf]"+r"(v_buf), // %[v_buf]
1809 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1810 [width]"+rm"(width) // %[width]
1811 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001812 : "memory", "cc"
1813#if defined(__SSE2__)
1814 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1815#endif
1816 );
1817}
1818
fbarchard@google.come214fe32012-06-04 23:47:11 +00001819void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001820 const uint8* u_buf,
1821 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001822 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001823 int width) {
1824 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001825 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001826 "pcmpeqb %%xmm5,%%xmm5 \n"
1827 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001828 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001829 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001830 READYUV422
1831 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001832 "pcmpeqb %%xmm5,%%xmm5 \n"
1833 "punpcklbw %%xmm0,%%xmm1 \n"
1834 "punpcklbw %%xmm2,%%xmm5 \n"
1835 "movdqa %%xmm5,%%xmm0 \n"
1836 "punpcklwd %%xmm1,%%xmm5 \n"
1837 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001838 "movdqu %%xmm5,(%[argb_buf]) \n"
1839 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
1840 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1841 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001842 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001843 : [y_buf]"+r"(y_buf), // %[y_buf]
1844 [u_buf]"+r"(u_buf), // %[u_buf]
1845 [v_buf]"+r"(v_buf), // %[v_buf]
1846 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1847 [width]"+rm"(width) // %[width]
1848 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001849 : "memory", "cc"
1850#if defined(__SSE2__)
1851 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1852#endif
1853 );
1854}
1855
fbarchard@google.come214fe32012-06-04 23:47:11 +00001856void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001857 const uint8* u_buf,
1858 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001859 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001860 int width) {
1861 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001862 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001863 "pcmpeqb %%xmm5,%%xmm5 \n"
1864 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001865 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001866 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001867 READYUV422
1868 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001869 "punpcklbw %%xmm1,%%xmm2 \n"
1870 "punpcklbw %%xmm5,%%xmm0 \n"
1871 "movdqa %%xmm2,%%xmm1 \n"
1872 "punpcklwd %%xmm0,%%xmm2 \n"
1873 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001874 "movdqu %%xmm2,(%[argb_buf]) \n"
1875 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1876 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1877 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001878 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001879 : [y_buf]"+r"(y_buf), // %[y_buf]
1880 [u_buf]"+r"(u_buf), // %[u_buf]
1881 [v_buf]"+r"(v_buf), // %[v_buf]
1882 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1883 [width]"+rm"(width) // %[width]
1884 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001885 : "memory", "cc"
1886#if defined(__SSE2__)
1887 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1888#endif
1889 );
1890}
fbarchard@google.come214fe32012-06-04 23:47:11 +00001891#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001892
#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of 8-bit luma into grey 32-bit pixels, 8 pixels per loop
// iteration: the scaled luma value is replicated into the three low bytes of
// each pixel and the top byte is forced to 0xff.
//   y_buf:   source luma; 8 bytes are read per iteration.
//   rgb_buf: destination; written with movdqa, so it must be 16-byte aligned.
//            32 bytes are written per iteration.
//   width:   pixel count. The loop subtracts 8 and repeats while the result is
//            > 0, so at least one iteration always runs and a width that is
//            not a multiple of 8 is effectively rounded up.
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  asm volatile (
    // xmm4 = 0xff000000 in every dword: mask that sets the top byte.
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    // xmm3 = 0x1000 in every word: the bias 16, in the 8.8 form produced by
    // the punpcklbw below.
    "mov $0x10001000,%%eax \n"
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    // xmm2 = 0x012a (298) in every word: the 1.164 gain as a fixed-point
    // multiplier for pmulhuw.
    "mov $0x012a012a,%%eax \n"
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    ".p2align 4 \n"
  "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    // Duplicate each byte into a word (y * 0x101), subtract the bias with
    // unsigned saturation, then keep the high half of the 16x16 product.
    "punpcklbw %%xmm0,%%xmm0 \n"
    "psubusw %%xmm3,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    // Replicate each result byte four times, then OR in the 0xff top byte.
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "por %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,16(%1) \n"
    "lea 32(%1),%1 \n"

    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001940
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Writes the bytes of src to dst in reverse order, 16 bytes per loop
// iteration, reading from the end of src towards its start.
//   src:   source row; read with movdqa, so every 16-byte chunk addressed
//          (src + remaining - 16) must be 16-byte aligned.
//   dst:   destination row; written with movdqa, must be 16-byte aligned.
//   width: byte count. The loop subtracts 16 and repeats while the result is
//          > 0, so at least one iteration always runs.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size so it can be used as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    // Bias src by -16 so (src + remaining) addresses the last unread chunk.
    "lea -0x10(%0),%0 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0,%2),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    // The flags set here drive the jg below; movdqa and lea leave them alone.
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif  // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001971
#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of src to dst in reverse order, 16 bytes per loop
// iteration, using SSE2 only (no pshufb). Loads and stores use movdqu, so
// neither pointer has to be 16-byte aligned.
//   src:   source row, read from its end towards its start.
//   dst:   destination row, written front to back.
//   width: byte count. The loop subtracts 16 and repeats while the result is
//          > 0, so at least one iteration always runs.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size so it can be used as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    // Bias src by -16 so (src + remaining) addresses the last unread chunk.
    "lea -0x10(%0),%0 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0,%2),%%xmm0 \n"
    // Swap the two bytes inside every 16-bit word.
    "movdqa %%xmm0,%%xmm1 \n"
    "psllw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    // Reverse the four words in each 64-bit half, then swap the two halves.
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufd $0x4e,%%xmm0,%%xmm0 \n"
    // The flags set here drive the jg below; movdqu and lea leave them alone.
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002002
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// Low 8 output bytes = even source bytes in reverse order, high 8 output
// bytes = odd source bytes in reverse order.
CONST uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
// Mirrors a row of byte pairs and splits it into two planes in one pass:
// the even source bytes go to dst_u and the odd source bytes to dst_v, both in
// reversed order. 8 pairs (16 source bytes) are handled per loop iteration.
//   src:   interleaved source row of 2 * width bytes; read with movdqa, so
//          each chunk addressed must be 16-byte aligned.
//   dst_u: receives 8 bytes per iteration (movlpd).
//   dst_v: receives 8 bytes per iteration (movhpd).
//   width: number of pairs. The loop subtracts 8 and repeats while the result
//          is > 0, so at least one iteration always runs.
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  // Widened to pointer size so it can be used as an index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %4,%%xmm1 \n"
    // src = src + 2 * width - 16: start at the last 16-byte chunk.
    "lea -16(%0,%3,2),%0 \n"
    // dst_v becomes an offset from dst_u so one pointer is advanced per loop.
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "lea -16(%0),%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    // The flags set here drive the jg below; the moves and lea leave them
    // alone.
    "sub $8,%3 \n"
    "movlpd %%xmm0,(%1) \n"
    "movhpd %%xmm0,(%1,%2) \n"
    "lea 8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV) // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002037
#ifdef HAS_ARGBMIRRORROW_SSSE3
// Shuffle table for reversing the order of four 4-byte pixels while keeping
// the byte order inside each pixel.
CONST uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

// Mirrors a row of 4-byte pixels: pixel order is reversed, the bytes within
// each pixel are left in place. 4 pixels (16 bytes) per loop iteration.
//   src:   source row of width * 4 bytes; read with movdqa, so each chunk
//          addressed must be 16-byte aligned.
//   dst:   destination row; written with movdqa, must be 16-byte aligned.
//   width: pixel count. The loop subtracts 4 and repeats while the result is
//          > 0, so at least one iteration always runs.
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size so it can be used as a scaled index register.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    // Bias src by -16 so (src + remaining * 4) addresses the last unread chunk.
    "lea -0x10(%0),%0 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0,%2,4),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    // The flags set here drive the jg below; movdqa and lea leave them alone.
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMIRRORROW_SSSE3
2068
#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of byte pairs: even source bytes are written to dst_u
// and odd source bytes to dst_v. 32 source bytes (16 pairs) are handled per
// loop iteration.
//   src_uv: interleaved source; read with movdqa, must be 16-byte aligned.
//   dst_u, dst_v: destinations; written with movdqa, both must be 16-byte
//           aligned. 16 bytes are written to each per iteration.
//   pix:    number of pairs. The loop subtracts 16 and repeats while the
//           result is > 0, so at least one iteration always runs.
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // dst_v becomes an offset from dst_u so one pointer is advanced per loop.
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    // Even bytes -> xmm0.
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // Odd bytes -> xmm2.
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm2,(%1,%2) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002105
#ifdef HAS_COPYROW_SSE2
// Copies a row of bytes from src to dst, 32 bytes per loop iteration.
//   src:   source; read with movdqa, must be 16-byte aligned.
//   dst:   destination; written with movdqa, must be 16-byte aligned.
//   count: byte count. The loop subtracts 32 and repeats while the result is
//          > 0, so at least one iteration always runs and a count that is not
//          a multiple of 32 is effectively rounded up.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    // dst becomes an offset from src so only src is advanced in the loop.
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa %%xmm0,(%0,%1) \n"
    "movdqa %%xmm1,0x10(%0,%1) \n"
    "lea 0x20(%0),%0 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_COPYROW_SSE2
2130
#ifdef HAS_COPYROW_X86
// Copies a row of bytes from src to dst with "rep movsl".
//   src, dst: bound to the SI/DI registers that movs uses implicitly.
//   width:    byte count. It is divided by 4 (shift right by 2) to obtain the
//             dword count for rep, so any trailing width % 4 bytes are not
//             copied.
// NOTE(review): relies on the direction flag being clear on entry (the usual
// ABI guarantee) - confirm for the targets this is built for.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    "shr $0x2,%2 \n"
    "rep movsl \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
#endif  // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002145
#ifdef HAS_SETROW_X86
// Fills a row with the 32-bit pattern v32 using "rep stosl".
//   dst:   destination, bound to the DI register that stos uses implicitly.
//   v32:   value stored into every dword (passed in the A register).
//   width: byte count. It is divided by 4 (shift right by 2) to obtain the
//          dword count for rep, so any trailing width % 4 bytes are not
//          written.
void SetRow8_X86(uint8* dst, uint32 v32, int width) {
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    "shr $0x2,%1 \n"
    "rep stosl \n"
    : "+D"(dst),       // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
}

// Fills a rectangle with the 32-bit value v32, one "rep stosl" per row.
//   dst:        first row of the destination.
//   v32:        value stored into every dword.
//   width:      number of dwords written per row (used as the rep count
//               directly, with no shift).
//   dst_stride: distance between consecutive rows in bytes (added to the
//               uint8 pointer dst).
//   height:     number of rows.
void SetRows32_X86(uint8* dst, uint32 v32, int width,
                   int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    // rep stosl modifies both the pointer and the count, so fresh copies are
    // made for every row.
    size_t width_tmp = static_cast<size_t>(width);
    uint32* d = reinterpret_cast<uint32*>(dst);
    asm volatile (
      "rep stosl \n"
      : "+D"(d),         // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
    dst += dst_stride;
  }
}
#endif  // HAS_SETROW_X86
2173
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002174#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002175void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002176 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002177 "pcmpeqb %%xmm5,%%xmm5 \n"
2178 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002179 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002180 "1: \n"
2181 "movdqa (%0),%%xmm0 \n"
2182 "movdqa 0x10(%0),%%xmm1 \n"
2183 "lea 0x20(%0),%0 \n"
2184 "pand %%xmm5,%%xmm0 \n"
2185 "pand %%xmm5,%%xmm1 \n"
2186 "packuswb %%xmm1,%%xmm0 \n"
2187 "movdqa %%xmm0,(%1) \n"
2188 "lea 0x10(%1),%1 \n"
2189 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002190 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002191 : "+r"(src_yuy2), // %0
2192 "+r"(dst_y), // %1
2193 "+r"(pix) // %2
2194 :
2195 : "memory", "cc"
2196#if defined(__SSE2__)
2197 , "xmm0", "xmm1", "xmm5"
2198#endif
2199 );
2200}
2201
2202void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002203 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002204 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002205 "pcmpeqb %%xmm5,%%xmm5 \n"
2206 "psrlw $0x8,%%xmm5 \n"
2207 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002208 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002209 "1: \n"
2210 "movdqa (%0),%%xmm0 \n"
2211 "movdqa 0x10(%0),%%xmm1 \n"
2212 "movdqa (%0,%4,1),%%xmm2 \n"
2213 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2214 "lea 0x20(%0),%0 \n"
2215 "pavgb %%xmm2,%%xmm0 \n"
2216 "pavgb %%xmm3,%%xmm1 \n"
2217 "psrlw $0x8,%%xmm0 \n"
2218 "psrlw $0x8,%%xmm1 \n"
2219 "packuswb %%xmm1,%%xmm0 \n"
2220 "movdqa %%xmm0,%%xmm1 \n"
2221 "pand %%xmm5,%%xmm0 \n"
2222 "packuswb %%xmm0,%%xmm0 \n"
2223 "psrlw $0x8,%%xmm1 \n"
2224 "packuswb %%xmm1,%%xmm1 \n"
2225 "movq %%xmm0,(%1) \n"
2226 "movq %%xmm1,(%1,%2) \n"
2227 "lea 0x8(%1),%1 \n"
2228 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002229 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002230 : "+r"(src_yuy2), // %0
2231 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002232 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002233 "+r"(pix) // %3
2234 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2235 : "memory", "cc"
2236#if defined(__SSE2__)
2237 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2238#endif
2239 );
2240}
2241
fbarchard@google.comc704f782012-08-30 19:53:48 +00002242void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2243 uint8* dst_u, uint8* dst_v, int pix) {
2244 asm volatile (
2245 "pcmpeqb %%xmm5,%%xmm5 \n"
2246 "psrlw $0x8,%%xmm5 \n"
2247 "sub %1,%2 \n"
2248 ".p2align 4 \n"
2249 "1: \n"
2250 "movdqa (%0),%%xmm0 \n"
2251 "movdqa 0x10(%0),%%xmm1 \n"
2252 "lea 0x20(%0),%0 \n"
2253 "psrlw $0x8,%%xmm0 \n"
2254 "psrlw $0x8,%%xmm1 \n"
2255 "packuswb %%xmm1,%%xmm0 \n"
2256 "movdqa %%xmm0,%%xmm1 \n"
2257 "pand %%xmm5,%%xmm0 \n"
2258 "packuswb %%xmm0,%%xmm0 \n"
2259 "psrlw $0x8,%%xmm1 \n"
2260 "packuswb %%xmm1,%%xmm1 \n"
2261 "movq %%xmm0,(%1) \n"
2262 "movq %%xmm1,(%1,%2) \n"
2263 "lea 0x8(%1),%1 \n"
2264 "sub $0x10,%3 \n"
2265 "jg 1b \n"
2266 : "+r"(src_yuy2), // %0
2267 "+r"(dst_u), // %1
2268 "+r"(dst_v), // %2
2269 "+r"(pix) // %3
2270 :
2271 : "memory", "cc"
2272#if defined(__SSE2__)
2273 , "xmm0", "xmm1", "xmm5"
2274#endif
2275 );
2276}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002277
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002278void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2279 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002280 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002281 "pcmpeqb %%xmm5,%%xmm5 \n"
2282 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002283 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002284 "1: \n"
2285 "movdqu (%0),%%xmm0 \n"
2286 "movdqu 0x10(%0),%%xmm1 \n"
2287 "lea 0x20(%0),%0 \n"
2288 "pand %%xmm5,%%xmm0 \n"
2289 "pand %%xmm5,%%xmm1 \n"
2290 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002291 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002292 "movdqu %%xmm0,(%1) \n"
2293 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002294 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002295 : "+r"(src_yuy2), // %0
2296 "+r"(dst_y), // %1
2297 "+r"(pix) // %2
2298 :
2299 : "memory", "cc"
2300#if defined(__SSE2__)
2301 , "xmm0", "xmm1", "xmm5"
2302#endif
2303 );
2304}
2305
2306void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2307 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002308 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002309 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002310 "pcmpeqb %%xmm5,%%xmm5 \n"
2311 "psrlw $0x8,%%xmm5 \n"
2312 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002313 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002314 "1: \n"
2315 "movdqu (%0),%%xmm0 \n"
2316 "movdqu 0x10(%0),%%xmm1 \n"
2317 "movdqu (%0,%4,1),%%xmm2 \n"
2318 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2319 "lea 0x20(%0),%0 \n"
2320 "pavgb %%xmm2,%%xmm0 \n"
2321 "pavgb %%xmm3,%%xmm1 \n"
2322 "psrlw $0x8,%%xmm0 \n"
2323 "psrlw $0x8,%%xmm1 \n"
2324 "packuswb %%xmm1,%%xmm0 \n"
2325 "movdqa %%xmm0,%%xmm1 \n"
2326 "pand %%xmm5,%%xmm0 \n"
2327 "packuswb %%xmm0,%%xmm0 \n"
2328 "psrlw $0x8,%%xmm1 \n"
2329 "packuswb %%xmm1,%%xmm1 \n"
2330 "movq %%xmm0,(%1) \n"
2331 "movq %%xmm1,(%1,%2) \n"
2332 "lea 0x8(%1),%1 \n"
2333 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002334 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002335 : "+r"(src_yuy2), // %0
2336 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002337 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002338 "+r"(pix) // %3
2339 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2340 : "memory", "cc"
2341#if defined(__SSE2__)
2342 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2343#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002344 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002345}
2346
// De-interleaves the odd bytes of one packed YUY2 row into two planes, with
// no second row averaged in:
//   dst_u[k] = src_yuy2[4 * k + 1],  dst_v[k] = src_yuy2[4 * k + 3].
// Each pass reads 32 source bytes with unaligned loads (movdqu), writes 8
// bytes to each destination and subtracts 16 from pix. The loop test follows
// the body, so one pass always runs and pix is effectively rounded up to a
// multiple of 16.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002347void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2348 uint8* dst_u, uint8* dst_v, int pix) {
2349 asm volatile (
 // xmm5 = 0x00ff in every 16-bit word (keeps the even byte of each pair).
2350 "pcmpeqb %%xmm5,%%xmm5 \n"
2351 "psrlw $0x8,%%xmm5 \n"
 // Turn dst_v into an offset from dst_u so a single pointer advances both.
2352 "sub %1,%2 \n"
2353 ".p2align 4 \n"
2354 "1: \n"
2355 "movdqu (%0),%%xmm0 \n"
2356 "movdqu 0x10(%0),%%xmm1 \n"
2357 "lea 0x20(%0),%0 \n"
 // Keep the odd source bytes and pack them into 16 contiguous bytes.
2358 "psrlw $0x8,%%xmm0 \n"
2359 "psrlw $0x8,%%xmm1 \n"
2360 "packuswb %%xmm1,%%xmm0 \n"
 // Even bytes of that result go to dst_u, odd bytes go to dst_v.
2361 "movdqa %%xmm0,%%xmm1 \n"
2362 "pand %%xmm5,%%xmm0 \n"
2363 "packuswb %%xmm0,%%xmm0 \n"
2364 "psrlw $0x8,%%xmm1 \n"
2365 "packuswb %%xmm1,%%xmm1 \n"
2366 "movq %%xmm0,(%1) \n"
2367 "movq %%xmm1,(%1,%2) \n"
2368 "lea 0x8(%1),%1 \n"
2369 "sub $0x10,%3 \n"
2370 "jg 1b \n"
2371 : "+r"(src_yuy2), // %0
2372 "+r"(dst_u), // %1
2373 "+r"(dst_v), // %2
2374 "+r"(pix) // %3
2375 :
2376 : "memory", "cc"
2377#if defined(__SSE2__)
2378 , "xmm0", "xmm1", "xmm5"
2379#endif
2380 );
2381}
2382
// Copies the odd bytes of one packed UYVY row to dst_y:
//   dst_y[j] = src_uyvy[2 * j + 1].
// Uses aligned loads and stores (movdqa), so src_uyvy and dst_y must both be
// 16-byte aligned. Each pass reads 32 bytes, writes 16 and subtracts 16 from
// pix; one pass always runs because the loop test follows the body.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002383void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002384 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002385 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002386 "1: \n"
2387 "movdqa (%0),%%xmm0 \n"
2388 "movdqa 0x10(%0),%%xmm1 \n"
2389 "lea 0x20(%0),%0 \n"
 // Shift the odd byte of every 16-bit pair down, then pack words to bytes.
2390 "psrlw $0x8,%%xmm0 \n"
2391 "psrlw $0x8,%%xmm1 \n"
2392 "packuswb %%xmm1,%%xmm0 \n"
 // The store and lea below do not modify flags, so jg still tests this sub.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002393 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002394 "movdqa %%xmm0,(%1) \n"
2395 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002396 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002397 : "+r"(src_uyvy), // %0
2398 "+r"(dst_y), // %1
2399 "+r"(pix) // %2
2400 :
2401 : "memory", "cc"
2402#if defined(__SSE2__)
2403 , "xmm0", "xmm1"
2404#endif
2405 );
2406}
2407
// Averages two packed UYVY rows (src_uyvy and src_uyvy + stride_uyvy) byte by
// byte with pavgb, then de-interleaves the even bytes of the result:
//   dst_u[k] = avg[4 * k],  dst_v[k] = avg[4 * k + 2].
// Source loads are aligned (movdqa), so both rows must be 16-byte aligned.
// Each pass reads 32 bytes per row, writes 8 bytes to each destination and
// subtracts 16 from pix; one pass always runs.
2408void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002409 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002410 asm volatile (
 // xmm5 = 0x00ff in every 16-bit word (keeps the even byte of each pair).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002411 "pcmpeqb %%xmm5,%%xmm5 \n"
2412 "psrlw $0x8,%%xmm5 \n"
 // Turn dst_v into an offset from dst_u so a single pointer advances both.
2413 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002414 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002415 "1: \n"
2416 "movdqa (%0),%%xmm0 \n"
2417 "movdqa 0x10(%0),%%xmm1 \n"
 // Same 32 bytes from the row stride_uyvy bytes further on (%4 = stride).
2418 "movdqa (%0,%4,1),%%xmm2 \n"
2419 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2420 "lea 0x20(%0),%0 \n"
2421 "pavgb %%xmm2,%%xmm0 \n"
2422 "pavgb %%xmm3,%%xmm1 \n"
 // Keep the even bytes and pack them into 16 contiguous bytes.
2423 "pand %%xmm5,%%xmm0 \n"
2424 "pand %%xmm5,%%xmm1 \n"
2425 "packuswb %%xmm1,%%xmm0 \n"
 // Even bytes of that result go to dst_u, odd bytes go to dst_v.
2426 "movdqa %%xmm0,%%xmm1 \n"
2427 "pand %%xmm5,%%xmm0 \n"
2428 "packuswb %%xmm0,%%xmm0 \n"
2429 "psrlw $0x8,%%xmm1 \n"
2430 "packuswb %%xmm1,%%xmm1 \n"
2431 "movq %%xmm0,(%1) \n"
2432 "movq %%xmm1,(%1,%2) \n"
2433 "lea 0x8(%1),%1 \n"
2434 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002435 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002436 : "+r"(src_uyvy), // %0
2437 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002438 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002439 "+r"(pix) // %3
2440 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2441 : "memory", "cc"
2442#if defined(__SSE2__)
2443 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2444#endif
2445 );
2446}
2447
// De-interleaves the even bytes of one packed UYVY row into two planes, with
// no second row averaged in:
//   dst_u[k] = src_uyvy[4 * k],  dst_v[k] = src_uyvy[4 * k + 2].
// Source loads are aligned (movdqa), so src_uyvy must be 16-byte aligned.
// Each pass reads 32 bytes, writes 8 bytes to each destination and subtracts
// 16 from pix; one pass always runs.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002448void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2449 uint8* dst_u, uint8* dst_v, int pix) {
2450 asm volatile (
 // xmm5 = 0x00ff in every 16-bit word (keeps the even byte of each pair).
2451 "pcmpeqb %%xmm5,%%xmm5 \n"
2452 "psrlw $0x8,%%xmm5 \n"
 // Turn dst_v into an offset from dst_u so a single pointer advances both.
2453 "sub %1,%2 \n"
2454 ".p2align 4 \n"
2455 "1: \n"
2456 "movdqa (%0),%%xmm0 \n"
2457 "movdqa 0x10(%0),%%xmm1 \n"
2458 "lea 0x20(%0),%0 \n"
 // Keep the even bytes and pack them into 16 contiguous bytes.
2459 "pand %%xmm5,%%xmm0 \n"
2460 "pand %%xmm5,%%xmm1 \n"
2461 "packuswb %%xmm1,%%xmm0 \n"
 // Even bytes of that result go to dst_u, odd bytes go to dst_v.
2462 "movdqa %%xmm0,%%xmm1 \n"
2463 "pand %%xmm5,%%xmm0 \n"
2464 "packuswb %%xmm0,%%xmm0 \n"
2465 "psrlw $0x8,%%xmm1 \n"
2466 "packuswb %%xmm1,%%xmm1 \n"
2467 "movq %%xmm0,(%1) \n"
2468 "movq %%xmm1,(%1,%2) \n"
2469 "lea 0x8(%1),%1 \n"
2470 "sub $0x10,%3 \n"
2471 "jg 1b \n"
2472 : "+r"(src_uyvy), // %0
2473 "+r"(dst_u), // %1
2474 "+r"(dst_v), // %2
2475 "+r"(pix) // %3
2476 :
2477 : "memory", "cc"
2478#if defined(__SSE2__)
2479 , "xmm0", "xmm1", "xmm5"
2480#endif
2481 );
2482}
2483
// Copies the odd bytes of one packed UYVY row to dst_y:
//   dst_y[j] = src_uyvy[2 * j + 1].
// Same computation as UYVYToYRow_SSE2 but with movdqu for every load and
// store, so neither pointer needs 16-byte alignment. Each pass reads 32
// bytes, writes 16 and subtracts 16 from pix; one pass always runs.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002484void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2485 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002486 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002487 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002488 "1: \n"
2489 "movdqu (%0),%%xmm0 \n"
2490 "movdqu 0x10(%0),%%xmm1 \n"
2491 "lea 0x20(%0),%0 \n"
 // Shift the odd byte of every 16-bit pair down, then pack words to bytes.
2492 "psrlw $0x8,%%xmm0 \n"
2493 "psrlw $0x8,%%xmm1 \n"
2494 "packuswb %%xmm1,%%xmm0 \n"
 // The store and lea below do not modify flags, so jg still tests this sub.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002495 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002496 "movdqu %%xmm0,(%1) \n"
2497 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002498 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002499 : "+r"(src_uyvy), // %0
2500 "+r"(dst_y), // %1
2501 "+r"(pix) // %2
2502 :
2503 : "memory", "cc"
2504#if defined(__SSE2__)
2505 , "xmm0", "xmm1"
2506#endif
2507 );
2508}
2509
// Averages two packed UYVY rows (src_uyvy and src_uyvy + stride_uyvy) byte by
// byte with pavgb, then de-interleaves the even bytes of the result:
//   dst_u[k] = avg[4 * k],  dst_v[k] = avg[4 * k + 2].
// Same computation as UYVYToUVRow_SSE2 but with movdqu source loads, so the
// rows need not be 16-byte aligned. Each pass reads 32 bytes per row, writes
// 8 bytes to each destination and subtracts 16 from pix; one pass always runs.
2510void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002511 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002512 asm volatile (
 // xmm5 = 0x00ff in every 16-bit word (keeps the even byte of each pair).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002513 "pcmpeqb %%xmm5,%%xmm5 \n"
2514 "psrlw $0x8,%%xmm5 \n"
 // Turn dst_v into an offset from dst_u so a single pointer advances both.
2515 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002516 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002517 "1: \n"
2518 "movdqu (%0),%%xmm0 \n"
2519 "movdqu 0x10(%0),%%xmm1 \n"
 // Same 32 bytes from the row stride_uyvy bytes further on (%4 = stride).
2520 "movdqu (%0,%4,1),%%xmm2 \n"
2521 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2522 "lea 0x20(%0),%0 \n"
2523 "pavgb %%xmm2,%%xmm0 \n"
2524 "pavgb %%xmm3,%%xmm1 \n"
 // Keep the even bytes and pack them into 16 contiguous bytes.
2525 "pand %%xmm5,%%xmm0 \n"
2526 "pand %%xmm5,%%xmm1 \n"
2527 "packuswb %%xmm1,%%xmm0 \n"
 // Even bytes of that result go to dst_u, odd bytes go to dst_v.
2528 "movdqa %%xmm0,%%xmm1 \n"
2529 "pand %%xmm5,%%xmm0 \n"
2530 "packuswb %%xmm0,%%xmm0 \n"
2531 "psrlw $0x8,%%xmm1 \n"
2532 "packuswb %%xmm1,%%xmm1 \n"
2533 "movq %%xmm0,(%1) \n"
2534 "movq %%xmm1,(%1,%2) \n"
2535 "lea 0x8(%1),%1 \n"
2536 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002537 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002538 : "+r"(src_uyvy), // %0
2539 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002540 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002541 "+r"(pix) // %3
2542 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2543 : "memory", "cc"
2544#if defined(__SSE2__)
2545 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2546#endif
2547 );
2548}
fbarchard@google.comc704f782012-08-30 19:53:48 +00002549
// De-interleaves the even bytes of one packed UYVY row into two planes, with
// no second row averaged in:
//   dst_u[k] = src_uyvy[4 * k],  dst_v[k] = src_uyvy[4 * k + 2].
// Same computation as UYVYToUV422Row_SSE2 but with movdqu source loads, so
// src_uyvy need not be 16-byte aligned. Each pass reads 32 bytes, writes 8
// bytes to each destination and subtracts 16 from pix; one pass always runs.
2550void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
2551 uint8* dst_u, uint8* dst_v, int pix) {
2552 asm volatile (
 // xmm5 = 0x00ff in every 16-bit word (keeps the even byte of each pair).
2553 "pcmpeqb %%xmm5,%%xmm5 \n"
2554 "psrlw $0x8,%%xmm5 \n"
 // Turn dst_v into an offset from dst_u so a single pointer advances both.
2555 "sub %1,%2 \n"
2556 ".p2align 4 \n"
2557 "1: \n"
2558 "movdqu (%0),%%xmm0 \n"
2559 "movdqu 0x10(%0),%%xmm1 \n"
2560 "lea 0x20(%0),%0 \n"
 // Keep the even bytes and pack them into 16 contiguous bytes.
2561 "pand %%xmm5,%%xmm0 \n"
2562 "pand %%xmm5,%%xmm1 \n"
2563 "packuswb %%xmm1,%%xmm0 \n"
 // Even bytes of that result go to dst_u, odd bytes go to dst_v.
2564 "movdqa %%xmm0,%%xmm1 \n"
2565 "pand %%xmm5,%%xmm0 \n"
2566 "packuswb %%xmm0,%%xmm0 \n"
2567 "psrlw $0x8,%%xmm1 \n"
2568 "packuswb %%xmm1,%%xmm1 \n"
2569 "movq %%xmm0,(%1) \n"
2570 "movq %%xmm1,(%1,%2) \n"
2571 "lea 0x8(%1),%1 \n"
2572 "sub $0x10,%3 \n"
2573 "jg 1b \n"
2574 : "+r"(src_uyvy), // %0
2575 "+r"(dst_u), // %1
2576 "+r"(dst_v), // %2
2577 "+r"(pix) // %3
2578 :
2579 : "memory", "cc"
2580#if defined(__SSE2__)
2581 , "xmm0", "xmm1", "xmm5"
2582#endif
2583 );
2584}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002585#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002586
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002587#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002588// Blend 4 pixels at a time, with 1 pixel loops for the head and the tail.
// For every 4-byte pixel, with a = byte 3 of the src_argb0 pixel:
//   bytes 0..2: dst = saturate(src_argb0 + ((src_argb1 * (256 - a)) >> 8))
//   byte 3:     dst = 255
// Flow: a 1 pixel loop runs until dst_argb is 16-byte aligned, then a 4 pixel
// loop (unaligned loads, aligned store), then a 1 pixel loop for the rest.
// width <= 0 writes nothing.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002589void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
2590 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002591 asm volatile (
 // xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
 // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword.
2592 "pcmpeqb %%xmm7,%%xmm7 \n"
2593 "psrlw $0xf,%%xmm7 \n"
2594 "pcmpeqb %%xmm6,%%xmm6 \n"
2595 "psrlw $0x8,%%xmm6 \n"
2596 "pcmpeqb %%xmm5,%%xmm5 \n"
2597 "psllw $0x8,%%xmm5 \n"
2598 "pcmpeqb %%xmm4,%%xmm4 \n"
2599 "pslld $0x18,%%xmm4 \n"
 // From here the counter holds (pixels remaining - 1): exactly one pixel
 // goes straight to the tail loop, zero or fewer exits.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002600 "sub $0x1,%3 \n"
2601 "je 91f \n"
2602 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002603
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002604 // 1 pixel loop until destination pointer is aligned.
2605 "10: \n"
2606 "test $0xf,%2 \n"
2607 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002608 "movd (%0),%%xmm3 \n"
2609 "lea 0x4(%0),%0 \n"
2610 "movdqa %%xmm3,%%xmm0 \n"
 // Invert byte 3 (255 - a), then copy it into both words of the pixel.
2611 "pxor %%xmm4,%%xmm3 \n"
2612 "movd (%1),%%xmm2 \n"
2613 "psrlw $0x8,%%xmm3 \n"
2614 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2615 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
 // xmm2 = bytes 0 and 2 of src_argb1 as words; xmm3 = 256 - a.
2616 "pand %%xmm6,%%xmm2 \n"
2617 "paddw %%xmm7,%%xmm3 \n"
2618 "pmullw %%xmm3,%%xmm2 \n"
 // xmm1 = bytes 1 and 3 of src_argb1 as words, scaled the same way.
2619 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002620 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002621 "psrlw $0x8,%%xmm1 \n"
 // Force byte 3 of the src_argb0 copy to 0xff before the saturating adds.
2622 "por %%xmm4,%%xmm0 \n"
2623 "pmullw %%xmm3,%%xmm1 \n"
2624 "psrlw $0x8,%%xmm2 \n"
2625 "paddusb %%xmm2,%%xmm0 \n"
2626 "pand %%xmm5,%%xmm1 \n"
2627 "paddusb %%xmm1,%%xmm0 \n"
2628 "sub $0x1,%3 \n"
2629 "movd %%xmm0,(%2) \n"
2630 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002631 "jge 10b \n"
2632
2633 "19: \n"
 // Re-bias the counter from (remaining - 1) to (remaining - 4).
fbarchard@google.comee220882012-06-14 00:07:56 +00002634 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002635 "jl 49f \n"
2636
fbarchard@google.com794fe122012-06-15 01:05:01 +00002637 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002638 ".p2align 2 \n"
2639 "41: \n"
2640 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002641 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002642 "movdqa %%xmm3,%%xmm0 \n"
2643 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002644 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002645 "psrlw $0x8,%%xmm3 \n"
2646 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2647 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002648 "pand %%xmm6,%%xmm2 \n"
2649 "paddw %%xmm7,%%xmm3 \n"
2650 "pmullw %%xmm3,%%xmm2 \n"
2651 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002652 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002653 "psrlw $0x8,%%xmm1 \n"
2654 "por %%xmm4,%%xmm0 \n"
2655 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002656 "psrlw $0x8,%%xmm2 \n"
2657 "paddusb %%xmm2,%%xmm0 \n"
2658 "pand %%xmm5,%%xmm1 \n"
2659 "paddusb %%xmm1,%%xmm0 \n"
2660 "sub $0x4,%3 \n"
 // Aligned store: the head loop above left dst_argb 16-byte aligned.
2661 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002662 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002663 "jge 41b \n"
2664
2665 "49: \n"
 // Re-bias the counter back to (remaining - 1) for the tail loop.
2666 "add $0x3,%3 \n"
2667 "jl 99f \n"
2668
fbarchard@google.com794fe122012-06-15 01:05:01 +00002669 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002670 "91: \n"
2671 "movd (%0),%%xmm3 \n"
2672 "lea 0x4(%0),%0 \n"
2673 "movdqa %%xmm3,%%xmm0 \n"
2674 "pxor %%xmm4,%%xmm3 \n"
2675 "movd (%1),%%xmm2 \n"
2676 "psrlw $0x8,%%xmm3 \n"
2677 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2678 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2679 "pand %%xmm6,%%xmm2 \n"
2680 "paddw %%xmm7,%%xmm3 \n"
2681 "pmullw %%xmm3,%%xmm2 \n"
2682 "movd (%1),%%xmm1 \n"
2683 "lea 0x4(%1),%1 \n"
2684 "psrlw $0x8,%%xmm1 \n"
2685 "por %%xmm4,%%xmm0 \n"
2686 "pmullw %%xmm3,%%xmm1 \n"
2687 "psrlw $0x8,%%xmm2 \n"
2688 "paddusb %%xmm2,%%xmm0 \n"
2689 "pand %%xmm5,%%xmm1 \n"
2690 "paddusb %%xmm1,%%xmm0 \n"
2691 "sub $0x1,%3 \n"
2692 "movd %%xmm0,(%2) \n"
2693 "lea 0x4(%2),%2 \n"
2694 "jge 91b \n"
2695 "99: \n"
2696 : "+r"(src_argb0), // %0
2697 "+r"(src_argb1), // %1
2698 "+r"(dst_argb), // %2
2699 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00002700 :
2701 : "memory", "cc"
2702#if defined(__SSE2__)
2703 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2704#endif
2705 );
2706}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002707#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002708
fbarchard@google.com96af8702012-04-06 18:22:27 +00002709#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002710// Shuffle table for isolating alpha.
// As a pshufb mask it copies byte 3 of each 4-byte pixel into the low byte of
// both 16-bit words of that pixel; the 0x80 entries zero the high bytes.
fbarchard@google.com96af8702012-04-06 18:22:27 +00002711CONST uvec8 kShuffleAlpha = {
2712 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2713 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2714};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002715
2716// Blend 4 pixels at a time, with 1 pixel loops for the head and the tail.
2717// Uses the kShuffleAlpha table to spread each pixel's alpha across its words.
2718
2719// Same as SSE2, but replaces
2720// psrlw xmm3, 8 // alpha
2721// pshufhw xmm3, xmm3,0F5h // 8 alpha words
2722// pshuflw xmm3, xmm3,0F5h
2723// with..
2724// pshufb xmm3, kShuffleAlpha // alpha
2725
// For every 4-byte pixel, with a = byte 3 of the src_argb0 pixel:
//   bytes 0..2: dst = saturate(src_argb0 + ((src_argb1 * (256 - a)) >> 8))
//   byte 3:     dst = 255
// Flow: a 1 pixel loop runs until dst_argb is 16-byte aligned; then a 4 pixel
// loop, using aligned loads when both sources are also 16-byte aligned and
// unaligned loads otherwise; then a 1 pixel loop for the rest.
// width <= 0 writes nothing.
2726void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2727 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002728 asm volatile (
 // xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
 // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword.
2729 "pcmpeqb %%xmm7,%%xmm7 \n"
2730 "psrlw $0xf,%%xmm7 \n"
2731 "pcmpeqb %%xmm6,%%xmm6 \n"
2732 "psrlw $0x8,%%xmm6 \n"
2733 "pcmpeqb %%xmm5,%%xmm5 \n"
2734 "psllw $0x8,%%xmm5 \n"
2735 "pcmpeqb %%xmm4,%%xmm4 \n"
2736 "pslld $0x18,%%xmm4 \n"
 // From here the counter holds (pixels remaining - 1): exactly one pixel
 // goes straight to the tail loop, zero or fewer exits.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002737 "sub $0x1,%3 \n"
2738 "je 91f \n"
2739 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002740
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002741 // 1 pixel loop until destination pointer is aligned.
2742 "10: \n"
2743 "test $0xf,%2 \n"
2744 "je 19f \n"
2745 "movd (%0),%%xmm3 \n"
2746 "lea 0x4(%0),%0 \n"
2747 "movdqa %%xmm3,%%xmm0 \n"
 // Invert byte 3 (255 - a), then copy it into both words of the pixel.
2748 "pxor %%xmm4,%%xmm3 \n"
2749 "movd (%1),%%xmm2 \n"
2750 "pshufb %4,%%xmm3 \n"
 // xmm2 = bytes 0 and 2 of src_argb1 as words; xmm3 = 256 - a.
2751 "pand %%xmm6,%%xmm2 \n"
2752 "paddw %%xmm7,%%xmm3 \n"
2753 "pmullw %%xmm3,%%xmm2 \n"
 // xmm1 = bytes 1 and 3 of src_argb1 as words, scaled the same way.
2754 "movd (%1),%%xmm1 \n"
2755 "lea 0x4(%1),%1 \n"
2756 "psrlw $0x8,%%xmm1 \n"
 // Force byte 3 of the src_argb0 copy to 0xff before the saturating adds.
2757 "por %%xmm4,%%xmm0 \n"
2758 "pmullw %%xmm3,%%xmm1 \n"
2759 "psrlw $0x8,%%xmm2 \n"
2760 "paddusb %%xmm2,%%xmm0 \n"
2761 "pand %%xmm5,%%xmm1 \n"
2762 "paddusb %%xmm1,%%xmm0 \n"
2763 "sub $0x1,%3 \n"
2764 "movd %%xmm0,(%2) \n"
2765 "lea 0x4(%2),%2 \n"
2766 "jge 10b \n"
2767
2768 "19: \n"
 // Re-bias the counter from (remaining - 1) to (remaining - 4).
fbarchard@google.comee220882012-06-14 00:07:56 +00002769 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002770 "jl 49f \n"
 // Take the unaligned loop unless both source pointers are 16-byte aligned.
fbarchard@google.comf877e712012-08-15 00:51:24 +00002771 "test $0xf,%0 \n"
2772 "jne 41f \n"
2773 "test $0xf,%1 \n"
2774 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002775
fbarchard@google.com794fe122012-06-15 01:05:01 +00002776 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002777 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002778 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00002779 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002780 "lea 0x10(%0),%0 \n"
2781 "movdqa %%xmm3,%%xmm0 \n"
2782 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00002783 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002784 "pshufb %4,%%xmm3 \n"
2785 "pand %%xmm6,%%xmm2 \n"
2786 "paddw %%xmm7,%%xmm3 \n"
2787 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00002788 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002789 "lea 0x10(%1),%1 \n"
2790 "psrlw $0x8,%%xmm1 \n"
2791 "por %%xmm4,%%xmm0 \n"
2792 "pmullw %%xmm3,%%xmm1 \n"
2793 "psrlw $0x8,%%xmm2 \n"
2794 "paddusb %%xmm2,%%xmm0 \n"
2795 "pand %%xmm5,%%xmm1 \n"
2796 "paddusb %%xmm1,%%xmm0 \n"
2797 "sub $0x4,%3 \n"
2798 "movdqa %%xmm0,(%2) \n"
2799 "lea 0x10(%2),%2 \n"
2800 "jge 40b \n"
2801 "jmp 49f \n"
2802
2803 // 4 pixel unaligned loop.
2804 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002805 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002806 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002807 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002808 "movdqa %%xmm3,%%xmm0 \n"
2809 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002810 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002811 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002812 "pand %%xmm6,%%xmm2 \n"
2813 "paddw %%xmm7,%%xmm3 \n"
2814 "pmullw %%xmm3,%%xmm2 \n"
2815 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002816 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002817 "psrlw $0x8,%%xmm1 \n"
2818 "por %%xmm4,%%xmm0 \n"
2819 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002820 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002821 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002822 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002823 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002824 "sub $0x4,%3 \n"
 // Aligned store: the head loop above left dst_argb 16-byte aligned.
2825 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002826 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002827 "jge 41b \n"
2828
2829 "49: \n"
 // Re-bias the counter back to (remaining - 1) for the tail loop.
2830 "add $0x3,%3 \n"
2831 "jl 99f \n"
2832
fbarchard@google.com794fe122012-06-15 01:05:01 +00002833 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002834 "91: \n"
2835 "movd (%0),%%xmm3 \n"
2836 "lea 0x4(%0),%0 \n"
2837 "movdqa %%xmm3,%%xmm0 \n"
2838 "pxor %%xmm4,%%xmm3 \n"
2839 "movd (%1),%%xmm2 \n"
2840 "pshufb %4,%%xmm3 \n"
2841 "pand %%xmm6,%%xmm2 \n"
2842 "paddw %%xmm7,%%xmm3 \n"
2843 "pmullw %%xmm3,%%xmm2 \n"
2844 "movd (%1),%%xmm1 \n"
2845 "lea 0x4(%1),%1 \n"
2846 "psrlw $0x8,%%xmm1 \n"
2847 "por %%xmm4,%%xmm0 \n"
2848 "pmullw %%xmm3,%%xmm1 \n"
2849 "psrlw $0x8,%%xmm2 \n"
2850 "paddusb %%xmm2,%%xmm0 \n"
2851 "pand %%xmm5,%%xmm1 \n"
2852 "paddusb %%xmm1,%%xmm0 \n"
2853 "sub $0x1,%3 \n"
2854 "movd %%xmm0,(%2) \n"
2855 "lea 0x4(%2),%2 \n"
2856 "jge 91b \n"
2857 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002858 : "+r"(src_argb0), // %0
2859 "+r"(src_argb1), // %1
2860 "+r"(dst_argb), // %2
2861 "+r"(width) // %3
2862 : "m"(kShuffleAlpha) // %4
2863 : "memory", "cc"
2864#if defined(__SSE2__)
2865 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2866#endif
2867 );
2868}
2869#endif // HAS_ARGBBLENDROW_SSSE3
2870
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002871#ifdef HAS_ARGBATTENUATE_SSE2
2872// Attenuate 4 pixels at a time.
2873// aligned to 16 bytes
// Scales bytes 0..2 of every 4-byte pixel by that pixel's byte 3 (alpha) and
// copies byte 3 through unchanged. Per channel c with alpha a the result is
//   (((c * 257) * (a * 257)) >> 16) >> 8.
// All loads and stores are movdqa, so src_argb and dst_argb must both be
// 16-byte aligned. Each pass handles 4 pixels and subtracts 4 from width; one
// pass always runs because the loop test follows the body.
2874void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2875 asm volatile (
 // Turn dst_argb into an offset from src_argb so one pointer advances both.
2876 "sub %0,%1 \n"
 // xmm4 = 0xff000000 per dword (alpha byte), xmm5 = 0x00ffffff (color bytes).
2877 "pcmpeqb %%xmm4,%%xmm4 \n"
2878 "pslld $0x18,%%xmm4 \n"
2879 "pcmpeqb %%xmm5,%%xmm5 \n"
2880 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002881
fbarchard@google.com81b804e2012-06-20 02:15:01 +00002882 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002883 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002884 "1: \n"
 // Pixels 0-1: duplicate each byte into a word (c * 257), broadcast the
 // alpha word across the pixel's four words, keep the high half of the product.
2885 "movdqa (%0),%%xmm0 \n"
2886 "punpcklbw %%xmm0,%%xmm0 \n"
2887 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2888 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2889 "pmulhuw %%xmm2,%%xmm0 \n"
 // Pixels 2-3, same steps on the upper 8 bytes.
2890 "movdqa (%0),%%xmm1 \n"
2891 "punpckhbw %%xmm1,%%xmm1 \n"
2892 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2893 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2894 "pmulhuw %%xmm2,%%xmm1 \n"
 // Reload the source once more to recover the original alpha bytes.
fbarchard@google.com810cd912012-04-20 20:15:27 +00002895 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002896 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002897 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002898 "psrlw $0x8,%%xmm1 \n"
2899 "packuswb %%xmm1,%%xmm0 \n"
 // Drop the scaled alpha byte and put the original alpha in its place.
2900 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002901 "por %%xmm2,%%xmm0 \n"
 // The store and lea below do not modify flags, so jg still tests this sub.
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002902 "sub $0x4,%2 \n"
2903 "movdqa %%xmm0,(%0,%1,1) \n"
2904 "lea 0x10(%0),%0 \n"
2905 "jg 1b \n"
2906 : "+r"(src_argb), // %0
2907 "+r"(dst_argb), // %1
2908 "+r"(width) // %2
2909 :
2910 : "memory", "cc"
2911#if defined(__SSE2__)
2912 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2913#endif
2914 );
2915}
2916#endif // HAS_ARGBATTENUATE_SSE2
2917
fbarchard@google.comeeac2902012-07-18 18:54:32 +00002918#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00002919// Shuffle table duplicating alpha
// As pshufb masks these expand two pixels to 16-bit words: the pixel's alpha
// byte fills both bytes of the first three words, and the 128 entries zero the
// fourth word. kShuffleAlpha0 covers pixels 0-1 (alpha at bytes 3 and 7),
// kShuffleAlpha1 covers pixels 2-3 (alpha at bytes 11 and 15).
2920CONST uvec8 kShuffleAlpha0 = {
2921 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2922};
2923CONST uvec8 kShuffleAlpha1 = {
2924 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2925 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2926};
2927// Attenuate 4 pixels at a time.
2928// aligned to 16 bytes
// Scales bytes 0..2 of every 4-byte pixel by that pixel's byte 3 (alpha) and
// copies byte 3 through unchanged, using pshufb with kShuffleAlpha0/1 to
// build the alpha multipliers. Per channel c with alpha a the result is
//   (((c * 257) * (a * 257)) >> 16) >> 8.
// All loads and stores are movdqa, so src_argb and dst_argb must both be
// 16-byte aligned. Each pass handles 4 pixels and subtracts 4 from width; one
// pass always runs because the loop test follows the body.
2929void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2930 asm volatile (
 // Turn dst_argb into an offset from src_argb so one pointer advances both.
2931 "sub %0,%1 \n"
 // xmm3 = 0xff000000 per dword (alpha byte mask).
2932 "pcmpeqb %%xmm3,%%xmm3 \n"
2933 "pslld $0x18,%%xmm3 \n"
 // xmm4 = kShuffleAlpha0, xmm5 = kShuffleAlpha1.
2934 "movdqa %3,%%xmm4 \n"
2935 "movdqa %4,%%xmm5 \n"
2936
fbarchard@google.com81b804e2012-06-20 02:15:01 +00002937 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002938 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002939 "1: \n"
 // Pixels 0-1: xmm0 = alpha words (zero in the alpha lane), xmm1 = each
 // byte duplicated into a word; keep the high half of the product.
2940 "movdqa (%0),%%xmm0 \n"
2941 "pshufb %%xmm4,%%xmm0 \n"
2942 "movdqa (%0),%%xmm1 \n"
2943 "punpcklbw %%xmm1,%%xmm1 \n"
2944 "pmulhuw %%xmm1,%%xmm0 \n"
 // Pixels 2-3, same steps with kShuffleAlpha1 and the upper 8 bytes.
2945 "movdqa (%0),%%xmm1 \n"
2946 "pshufb %%xmm5,%%xmm1 \n"
2947 "movdqa (%0),%%xmm2 \n"
2948 "punpckhbw %%xmm2,%%xmm2 \n"
2949 "pmulhuw %%xmm2,%%xmm1 \n"
 // Original alpha bytes; the multiplied alpha lane is zero, so OR restores it.
2950 "movdqa (%0),%%xmm2 \n"
2951 "pand %%xmm3,%%xmm2 \n"
2952 "psrlw $0x8,%%xmm0 \n"
2953 "psrlw $0x8,%%xmm1 \n"
2954 "packuswb %%xmm1,%%xmm0 \n"
2955 "por %%xmm2,%%xmm0 \n"
 // The store and lea below do not modify flags, so jg still tests this sub.
2956 "sub $0x4,%2 \n"
2957 "movdqa %%xmm0,(%0,%1,1) \n"
2958 "lea 0x10(%0),%0 \n"
2959 "jg 1b \n"
2960 : "+r"(src_argb), // %0
2961 "+r"(dst_argb), // %1
2962 "+r"(width) // %2
2963 : "m"(kShuffleAlpha0), // %3
2964 "m"(kShuffleAlpha1) // %4
2965 : "memory", "cc"
2966#if defined(__SSE2__)
2967 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2968#endif
2969 );
2970}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00002971#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00002972
fbarchard@google.comeeac2902012-07-18 18:54:32 +00002973#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00002974// Unattenuate 4 pixels at a time.
2975// aligned to 16 bytes
// For every 4-byte pixel, looks up the 32-bit entry fixed_invtbl8[alpha]
// (alpha = byte 3), multiplies bytes 0..2 (each widened to c * 257) by the low
// 16 bits of that entry, keeps the high 16 bits of each product, saturates to
// a byte, and copies byte 3 through unchanged.
// NOTE(review): fixed_invtbl8 is defined outside this section; presumably it
// holds fixed-point reciprocals of alpha - confirm against its definition.
// All 16-byte loads and stores are movdqa, so src_argb and dst_argb must both
// be 16-byte aligned. Each pass handles 4 pixels and subtracts 4 from width;
// one pass always runs because the loop test follows the body.
2976void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2977 int width) {
 // Scratch register for the asm block; receives each pixel's alpha byte.
2978 uintptr_t alpha = 0;
2979 asm volatile (
 // Turn dst_argb into an offset from src_argb so one pointer advances both.
2980 "sub %0,%1 \n"
 // xmm4 = 0xff000000 per dword (alpha byte mask).
2981 "pcmpeqb %%xmm4,%%xmm4 \n"
2982 "pslld $0x18,%%xmm4 \n"
2983
fbarchard@google.com81b804e2012-06-20 02:15:01 +00002984 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002985 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002986 "1: \n"
 // Pixels 0-1: fetch table entries for the alphas at bytes 3 and 7.
2987 "movdqa (%0),%%xmm0 \n"
2988 "movzb 0x3(%0),%3 \n"
2989 "punpcklbw %%xmm0,%%xmm0 \n"
2990 "movd 0x0(%4,%3,4),%%xmm2 \n"
2991 "movzb 0x7(%0),%3 \n"
2992 "movd 0x0(%4,%3,4),%%xmm3 \n"
 // pshuflw $0xc0 copies word 0 into words 0..2 and leaves word 3 zero, so
 // the alpha lane is multiplied by zero.
2993 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2994 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2995 "movlhps %%xmm3,%%xmm2 \n"
2996 "pmulhuw %%xmm2,%%xmm0 \n"
 // Pixels 2-3: same steps for the alphas at bytes 11 and 15.
2997 "movdqa (%0),%%xmm1 \n"
2998 "movzb 0xb(%0),%3 \n"
2999 "punpckhbw %%xmm1,%%xmm1 \n"
3000 "movd 0x0(%4,%3,4),%%xmm2 \n"
3001 "movzb 0xf(%0),%3 \n"
3002 "movd 0x0(%4,%3,4),%%xmm3 \n"
3003 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3004 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3005 "movlhps %%xmm3,%%xmm2 \n"
3006 "pmulhuw %%xmm2,%%xmm1 \n"
 // Original alpha bytes are OR-ed back over the zeroed alpha lane.
3007 "movdqa (%0),%%xmm2 \n"
3008 "pand %%xmm4,%%xmm2 \n"
3009 "packuswb %%xmm1,%%xmm0 \n"
3010 "por %%xmm2,%%xmm0 \n"
 // The store and lea below do not modify flags, so jg still tests this sub.
3011 "sub $0x4,%2 \n"
3012 "movdqa %%xmm0,(%0,%1,1) \n"
3013 "lea 0x10(%0),%0 \n"
3014 "jg 1b \n"
3015 : "+r"(src_argb), // %0
3016 "+r"(dst_argb), // %1
3017 "+r"(width), // %2
3018 "+r"(alpha) // %3
3019 : "r"(fixed_invtbl8) // %4
3020 : "memory", "cc"
3021#if defined(__SSE2__)
3022 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3023#endif
3024 );
3025}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003026#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003027
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003028#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003029// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// Weights are scaled by 128 (14 + 76 + 38 = 128) and repeated for 4 pixels;
// the fourth byte of each pixel (alpha) has weight 0.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003030CONST vec8 kARGBToGray = {
3031 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3032};
3033
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003034// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// For every 4-byte pixel computes
//   gray = (byte0 * 14 + byte1 * 76 + byte2 * 38) >> 7
// and writes (gray, gray, gray, original byte 3) to dst_argb.
// All loads and stores are movdqa, so src_argb and dst_argb must both be
// 16-byte aligned. Each pass handles 8 pixels and subtracts 8 from width; one
// pass always runs because the loop test follows the body.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003035void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003036 asm volatile (
 // xmm4 = kARGBToGray weights.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003037 "movdqa %3,%%xmm4 \n"
 // Turn dst_argb into an offset from src_argb so one pointer advances both.
3038 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003039
3040 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003041 ".p2align 4 \n"
3042 "1: \n"
 // Weighted sum per pixel: pmaddubsw forms two partial sums per pixel and
 // phaddw adds them, leaving 8 words that are shifted and packed to 8 bytes.
3043 "movdqa (%0),%%xmm0 \n"
3044 "movdqa 0x10(%0),%%xmm1 \n"
3045 "pmaddubsw %%xmm4,%%xmm0 \n"
3046 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003047 "phaddw %%xmm1,%%xmm0 \n"
3048 "psrlw $0x7,%%xmm0 \n"
3049 "packuswb %%xmm0,%%xmm0 \n"
 // Extract the 8 original alpha bytes (top byte of each dword).
fbarchard@google.com221e6022012-05-21 22:24:41 +00003050 "movdqa (%0),%%xmm2 \n"
3051 "movdqa 0x10(%0),%%xmm3 \n"
3052 "psrld $0x18,%%xmm2 \n"
3053 "psrld $0x18,%%xmm3 \n"
3054 "packuswb %%xmm3,%%xmm2 \n"
3055 "packuswb %%xmm2,%%xmm2 \n"
 // Interleave to (gray, gray) and (gray, alpha) words, then to whole pixels.
3056 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003057 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003058 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003059 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003060 "punpcklwd %%xmm3,%%xmm0 \n"
3061 "punpckhwd %%xmm3,%%xmm1 \n"
 // The stores and lea below do not modify flags, so jg still tests this sub.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003062 "sub $0x8,%2 \n"
3063 "movdqa %%xmm0,(%0,%1,1) \n"
3064 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003065 "lea 0x20(%0),%0 \n"
3066 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003067 : "+r"(src_argb), // %0
3068 "+r"(dst_argb), // %1
3069 "+r"(width) // %2
3070 : "m"(kARGBToGray) // %3
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003071 : "memory", "cc"
3072#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00003073 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003074#endif
3075 );
3076}
3077#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003078
3079#ifdef HAS_ARGBSEPIAROW_SSSE3
3080// b = (r * 35 + g * 68 + b * 17) >> 7
3081// g = (r * 45 + g * 88 + b * 22) >> 7
3082// r = (r * 50 + g * 98 + b * 24) >> 7
3083// Constant for ARGB color to sepia tone
// Each table lists its weights in memory byte order (b, g, r, then 0 for the
// alpha byte), repeated for 4 pixels.
3084CONST vec8 kARGBToSepiaB = {
3085 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3086};
3087
3088CONST vec8 kARGBToSepiaG = {
3089 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3090};
3091
3092CONST vec8 kARGBToSepiaR = {
3093 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3094};
3095
fbarchard@google.come442dc42012-06-18 17:37:09 +00003096// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place on dst_argb. For every 4-byte pixel, bytes 0, 1 and 2 are
// replaced by weighted sums of the original bytes 0..2 using kARGBToSepiaB,
// kARGBToSepiaG and kARGBToSepiaR (each sum shifted right by 7 and saturated
// to a byte); byte 3 is copied through unchanged.
// All loads and stores are movdqa, so dst_argb must be 16-byte aligned. Each
// pass handles 8 pixels and subtracts 8 from width; one pass always runs
// because the loop test follows the body.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003097void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3098 asm volatile (
 // xmm2 / xmm3 / xmm4 = weights for output bytes 0 / 1 / 2.
3099 "movdqa %2,%%xmm2 \n"
3100 "movdqa %3,%%xmm3 \n"
3101 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003102
3103 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003104 ".p2align 4 \n"
3105 "1: \n"
 // Output byte 0 for 8 pixels -> low 8 bytes of xmm0.
3106 "movdqa (%0),%%xmm0 \n"
3107 "movdqa 0x10(%0),%%xmm6 \n"
3108 "pmaddubsw %%xmm2,%%xmm0 \n"
3109 "pmaddubsw %%xmm2,%%xmm6 \n"
3110 "phaddw %%xmm6,%%xmm0 \n"
3111 "psrlw $0x7,%%xmm0 \n"
3112 "packuswb %%xmm0,%%xmm0 \n"
 // Output byte 1 for 8 pixels, then interleave into (byte0, byte1) pairs.
3113 "movdqa (%0),%%xmm5 \n"
3114 "movdqa 0x10(%0),%%xmm1 \n"
3115 "pmaddubsw %%xmm3,%%xmm5 \n"
3116 "pmaddubsw %%xmm3,%%xmm1 \n"
3117 "phaddw %%xmm1,%%xmm5 \n"
3118 "psrlw $0x7,%%xmm5 \n"
3119 "packuswb %%xmm5,%%xmm5 \n"
3120 "punpcklbw %%xmm5,%%xmm0 \n"
 // Output byte 2 for 8 pixels.
3121 "movdqa (%0),%%xmm5 \n"
3122 "movdqa 0x10(%0),%%xmm1 \n"
3123 "pmaddubsw %%xmm4,%%xmm5 \n"
3124 "pmaddubsw %%xmm4,%%xmm1 \n"
3125 "phaddw %%xmm1,%%xmm5 \n"
3126 "psrlw $0x7,%%xmm5 \n"
3127 "packuswb %%xmm5,%%xmm5 \n"
 // Original byte 3 (alpha) for 8 pixels, interleaved into (byte2, alpha) pairs.
3128 "movdqa (%0),%%xmm6 \n"
3129 "movdqa 0x10(%0),%%xmm1 \n"
3130 "psrld $0x18,%%xmm6 \n"
3131 "psrld $0x18,%%xmm1 \n"
3132 "packuswb %%xmm1,%%xmm6 \n"
3133 "packuswb %%xmm6,%%xmm6 \n"
3134 "punpcklbw %%xmm6,%%xmm5 \n"
 // Merge the pairs into whole pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
3135 "movdqa %%xmm0,%%xmm1 \n"
3136 "punpcklwd %%xmm5,%%xmm0 \n"
3137 "punpckhwd %%xmm5,%%xmm1 \n"
 // The stores and lea below do not modify flags, so jg still tests this sub.
3138 "sub $0x8,%1 \n"
3139 "movdqa %%xmm0,(%0) \n"
3140 "movdqa %%xmm1,0x10(%0) \n"
3141 "lea 0x20(%0),%0 \n"
3142 "jg 1b \n"
3143 : "+r"(dst_argb), // %0
3144 "+r"(width) // %1
3145 : "m"(kARGBToSepiaB), // %2
3146 "m"(kARGBToSepiaG), // %3
3147 "m"(kARGBToSepiaR) // %4
3148 : "memory", "cc"
3149#if defined(__SSE2__)
3150 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3151#endif
3152 );
3153}
3154#endif // HAS_ARGBSEPIAROW_SSSE3
3155
fbarchard@google.come442dc42012-06-18 17:37:09 +00003156#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3157// Transform 8 ARGB pixels (32 bytes) with color matrix.
3158// Same as Sepia except matrix is provided.
// Works in place on dst_argb. matrix_argb supplies three rows of four signed
// byte weights at byte offsets 0, 4 and 8 (12 bytes are read); row i produces
// output byte i of every pixel as the weighted sum of the pixel's four input
// bytes, shifted right arithmetically by 7 and saturated to 0..255. Byte 3 of
// every pixel is copied through unchanged.
// Unlike the sepia routine this uses the saturating phaddsw and the
// arithmetic psraw, which keep the result correct when a sum is negative.
// All pixel loads and stores are movdqa, so dst_argb must be 16-byte aligned.
// Each pass handles 8 pixels and subtracts 8 from width; one pass always runs
// because the loop test follows the body.
3159void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3160 int width) {
3161 asm volatile (
 // Load each 4-byte matrix row and broadcast it to all four dwords.
3162 "movd (%2),%%xmm2 \n"
3163 "movd 0x4(%2),%%xmm3 \n"
3164 "movd 0x8(%2),%%xmm4 \n"
3165 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3166 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3167 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003168
3169 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003170 ".p2align 4 \n"
3171 "1: \n"
 // Output bytes 0 (xmm0) and 1 (xmm5) for 8 pixels, computed side by side.
3172 "movdqa (%0),%%xmm0 \n"
3173 "movdqa 0x10(%0),%%xmm6 \n"
3174 "pmaddubsw %%xmm2,%%xmm0 \n"
3175 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003176 "movdqa (%0),%%xmm5 \n"
3177 "movdqa 0x10(%0),%%xmm1 \n"
3178 "pmaddubsw %%xmm3,%%xmm5 \n"
3179 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003180 "phaddsw %%xmm6,%%xmm0 \n"
3181 "phaddsw %%xmm1,%%xmm5 \n"
3182 "psraw $0x7,%%xmm0 \n"
3183 "psraw $0x7,%%xmm5 \n"
3184 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003185 "packuswb %%xmm5,%%xmm5 \n"
 // Interleave into (byte0, byte1) pairs.
3186 "punpcklbw %%xmm5,%%xmm0 \n"
 // Output byte 2 for 8 pixels.
3187 "movdqa (%0),%%xmm5 \n"
3188 "movdqa 0x10(%0),%%xmm1 \n"
3189 "pmaddubsw %%xmm4,%%xmm5 \n"
3190 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003191 "phaddsw %%xmm1,%%xmm5 \n"
3192 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003193 "packuswb %%xmm5,%%xmm5 \n"
 // Original byte 3 (alpha) for 8 pixels.
3194 "movdqa (%0),%%xmm6 \n"
3195 "movdqa 0x10(%0),%%xmm1 \n"
3196 "psrld $0x18,%%xmm6 \n"
3197 "psrld $0x18,%%xmm1 \n"
3198 "packuswb %%xmm1,%%xmm6 \n"
3199 "packuswb %%xmm6,%%xmm6 \n"
 // Merge the pairs into whole pixels: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003200 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003201 "punpcklbw %%xmm6,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003202 "punpcklwd %%xmm5,%%xmm0 \n"
3203 "punpckhwd %%xmm5,%%xmm1 \n"
 // The stores and lea below do not modify flags, so jg still tests this sub.
3204 "sub $0x8,%1 \n"
3205 "movdqa %%xmm0,(%0) \n"
3206 "movdqa %%xmm1,0x10(%0) \n"
3207 "lea 0x20(%0),%0 \n"
3208 "jg 1b \n"
3209 : "+r"(dst_argb), // %0
3210 "+r"(width) // %1
3211 : "r"(matrix_argb) // %2
3212 : "memory", "cc"
3213#if defined(__SSE2__)
3214 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3215#endif
3216 );
3217}
3218#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3219
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003220#ifdef HAS_ARGBQUANTIZEROW_SSE2
3221// Quantize 4 ARGB pixels (16 bytes).
3222// aligned to 16 bytes
// Works in place on dst_argb. For bytes 0..2 of every 4-byte pixel:
//   v = saturate_to_byte(((v * scale) >> 16) * interval_size + interval_offset)
// using the low 16 bits of each parameter; byte 3 keeps its original value
// OR-ed with whatever the alpha lane computes. The pshuflw $0x40 setup gives
// the alpha lane the upper 16 bits of each parameter, which are zero for
// parameter values below 65536, so in that case byte 3 is simply preserved.
// All pixel loads and stores are movdqa, so dst_argb must be 16-byte aligned.
// Each pass handles 4 pixels and subtracts 4 from width; one pass always runs
// because the loop test follows the body.
3223void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3224 int interval_offset, int width) {
3225 asm volatile (
 // xmm2 = scale, xmm3 = interval_size, xmm4 = interval_offset, each laid out
 // as words (lo, lo, lo, hi) and repeated for both pixels of a register.
3226 "movd %2,%%xmm2 \n"
3227 "movd %3,%%xmm3 \n"
3228 "movd %4,%%xmm4 \n"
3229 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3230 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3231 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3232 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3233 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3234 "pshufd $0x44,%%xmm4,%%xmm4 \n"
 // xmm5 = 0 (for zero-extending bytes), xmm6 = 0xff000000 per dword.
3235 "pxor %%xmm5,%%xmm5 \n"
3236 "pcmpeqb %%xmm6,%%xmm6 \n"
3237 "pslld $0x18,%%xmm6 \n"
3238
3239 // 4 pixel loop.
3240 ".p2align 2 \n"
3241 "1: \n"
 // Pixels 0-1: zero-extend bytes to words and take (v * scale) >> 16.
3242 "movdqa (%0),%%xmm0 \n"
3243 "punpcklbw %%xmm5,%%xmm0 \n"
3244 "pmulhuw %%xmm2,%%xmm0 \n"
 // Pixels 2-3, same steps on the upper 8 bytes.
3245 "movdqa (%0),%%xmm1 \n"
3246 "punpckhbw %%xmm5,%%xmm1 \n"
3247 "pmulhuw %%xmm2,%%xmm1 \n"
3248 "pmullw %%xmm3,%%xmm0 \n"
 // xmm7 = original alpha bytes.
3249 "movdqa (%0),%%xmm7 \n"
3250 "pmullw %%xmm3,%%xmm1 \n"
3251 "pand %%xmm6,%%xmm7 \n"
3252 "paddw %%xmm4,%%xmm0 \n"
3253 "paddw %%xmm4,%%xmm1 \n"
3254 "packuswb %%xmm1,%%xmm0 \n"
3255 "por %%xmm7,%%xmm0 \n"
 // The store and lea below do not modify flags, so jg still tests this sub.
3256 "sub $0x4,%1 \n"
3257 "movdqa %%xmm0,(%0) \n"
3258 "lea 0x10(%0),%0 \n"
3259 "jg 1b \n"
3260 : "+r"(dst_argb), // %0
3261 "+r"(width) // %1
3262 : "r"(scale), // %2
3263 "r"(interval_size), // %3
3264 "r"(interval_offset) // %4
3265 : "memory", "cc"
3266#if defined(__SSE2__)
3267 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3268#endif
3269 );
3270}
3271#endif // HAS_ARGBQUANTIZEROW_SSE2
3272
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//
// For every pixel the four bytes read from row are widened to four int32
// values and added to a running per channel sum. That running sum plus the
// matching four int32 values of previous_cumsum is stored to cumsum.
//
// row              source bytes, 4 per pixel (read with unaligned loads).
// cumsum           output, 4 int32 per pixel.
// previous_cumsum  input, 4 int32 per pixel, added to the running sum.
// width            number of pixels; nothing is written when width <= 0.
//
// The 4 pixel loop uses aligned (movdqa) accesses for both cumsum and
// previous_cumsum, so it is entered only when both are 16 byte aligned.
// Otherwise every pixel goes through the 1 pixel loop, which uses unaligned
// (movdqu) accesses.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    // %2 becomes (previous_cumsum - cumsum) so that (%1,%2,1) addresses
    // previous_cumsum while only %1 has to be advanced.
    "sub       %1,%2                           \n"
    // xmm0 = running sum, xmm1 = 0 for unpacking.
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"
    // cumsum is aligned here, so previous_cumsum is aligned exactly when the
    // pointer difference is a multiple of 16. Without this check a misaligned
    // previous_cumsum would fault on the movdqa loads below.
    "test      $0xf,%2                         \n"
    "jne       49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqu    (%0),%%xmm2                     \n"
    "lea       0x10(%0),%0                     \n"
    // Widen 16 bytes into four vectors of 4 int32 (xmm2..xmm5), one vector
    // per pixel.
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // Per pixel: accumulate into xmm0, then add the previous_cumsum entry.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqa    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm2,(%1)                     \n"
    "movdqa    %%xmm3,0x10(%1)                 \n"
    "movdqa    %%xmm4,0x20(%1)                 \n"
    "movdqa    %%xmm5,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // %3 is (remaining - 4) here; adding 3 gives (remaining - 1), which is
    // negative when no pixels are left.
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movd      (%0),%%xmm2                     \n"
    "lea       0x4(%0),%0                      \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
3353
#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts entries of a cumulative sum table into averaged pixels.
//
// For each output pixel i (4 int32 channels per pixel) the code computes
//   topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// multiplies it by an approximate reciprocal of area (rcpss), converts back
// to integer and saturates each channel to 0..255.
//
// topleft, botleft  int32 tables, 4 values per pixel. They are accessed with
//                   aligned SSE memory operands, both directly and at an
//                   offset of width int32 values.
// width             offset, in int32 elements, between the two columns that
//                   are differenced.
// area              divisor; only its approximate reciprocal is used.
// dst               output bytes, 4 per pixel (written with unaligned stores).
// count             number of pixels to produce; nothing is written when
//                   count <= 0.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count) {
  asm volatile (
    // xmm4 = approximately 1.0f / area in all four lanes.
    "movd      %5,%%xmm4                       \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "rcpss     %%xmm4,%%xmm4                   \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    // (%0,%4,4) is topleft + width int32 elements.
    "psubd     (%0,%4,4),%%xmm0                \n"
    "psubd     0x10(%0,%4,4),%%xmm1            \n"
    "psubd     0x20(%0,%4,4),%%xmm2            \n"
    "psubd     0x30(%0,%4,4),%%xmm3            \n"
    "lea       0x40(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "psubd     0x10(%1),%%xmm1                 \n"
    "psubd     0x20(%1),%%xmm2                 \n"
    "psubd     0x30(%1),%%xmm3                 \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "paddd     0x10(%1,%4,4),%%xmm1            \n"
    "paddd     0x20(%1,%4,4),%%xmm2            \n"
    "paddd     0x30(%1,%4,4),%%xmm3            \n"
    "lea       0x40(%1),%1                     \n"
    // Scale the sums by 1 / area in floating point.
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    // Narrow 16 int32 values to 16 bytes with saturation.
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // %3 is (remaining - 4) here; adding 3 gives (remaining - 1), which is
    // negative when no pixels are left.
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "lea       0x10(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "lea       0x10(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)      // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2

#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
//
// Byte k of value scales byte k of every pixel. Both operands are widened to
// 16 bits by duplicating the byte (b * 0x101), multiplied, and the product is
// shifted right by 24 in total, i.e. ((p * 0x101) * (v * 0x101)) >> 24.
//
// src_argb  source pixels, read with movdqa (16 byte aligned).
// dst_argb  destination pixels, written with movdqa (16 byte aligned).
// width     pixel count. The loop always runs at least once and consumes
//           4 pixels per pass until the count is <= 0.
// value     four per channel multipliers packed into 32 bits.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd      %3,%%xmm2                       \n"
    // %1 becomes (dst_argb - src_argb) so that (%0,%1,1) addresses the
    // destination while only %0 has to be advanced.
    "sub       %0,%1                           \n"
    // Duplicate each byte of value into a word, then repeat for all pixels.
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // The flags from this sub are consumed by the jg below; the movdqa and
    // lea in between do not modify flags.
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADE_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003480
#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd  %%xmm0,%1
//
// uv_dudv holds four floats: the starting (u, v) source coordinate followed
// by the per pixel step (du, dv). For every destination pixel the current
// (u, v) is truncated to integers (x, y), the 4 byte pixel at byte offset
// x * 4 + y * src_argb_stride from src_argb is copied to dst_argb, and
// (u, v) is advanced by (du, dv).
//
// The offset is computed with pmaddwd on 16 bit lanes, so x, y and
// src_argb_stride are each handled as signed 16 bit values. On x86_64 the
// first offset of each pair is additionally masked to 28 bits.
//
// src_argb         base of the source image.
// src_argb_stride  bytes between source rows.
// dst_argb         destination row (written with unaligned stores).
// uv_dudv          four floats as described above.
// width            number of destination pixels; nothing is written when
//                  width <= 0.

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    // xmm2 = (u, v), xmm7 = (du, dv).
    "movq      (%3),%%xmm2                     \n"
    "movq      0x8(%3),%%xmm7                  \n"
    // Pack the word pair (4, stride) used by pmaddwd to form x * 4 + y * stride.
    "shl       $0x10,%1                        \n"
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    // Set up coordinates for pixels 0..3:
    //   xmm2 = (u, v, u + du, v + dv)
    //   xmm3 = xmm2 + 2 * (du, dv, du, dv)
    //   xmm4 = 4 * (du, dv, du, dv), the step per loop iteration.
    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm4                   \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "40:                                         \n"
    // xmm0 = four 32 bit byte offsets, one per pixel.
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "cvttps2dq %%xmm3,%%xmm1                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
#if defined(__x86_64__)
    // Extract two offsets at once through a 64 bit register.
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
    "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
#endif
    // Fetch pixels 0 and 1 and store them as 8 bytes.
    "movd      (%0,%1,1),%%xmm1                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm1                   \n"
    "addps     %%xmm4,%%xmm2                   \n"
    "movq      %%xmm1,(%2)                     \n"
#if defined(__x86_64__)
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
#endif
    // Fetch pixels 2 and 3 and store them as 8 bytes.
    "movd      (%0,%1,1),%%xmm0                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm0                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "sub       $0x4,%4                         \n"
    "movq      %%xmm0,0x08(%2)                 \n"
    "lea       0x10(%2),%2                     \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // %4 is (remaining - 4) here; adding 3 gives (remaining - 1), which is
    // negative when no pixels are left.
    "add       $0x3,%4                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  4                               \n"
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"
    "movd      %%xmm0,%1                       \n"
#if defined(__x86_64__)
    "and       $0x0fffffff,%1                  \n"
#endif
    "movd      (%0,%1,1),%%xmm0                \n"
    "sub       $0x1,%4                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),              // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),              // %2
    "+r"(uv_dudv),               // %3
    "+rm"(width),                // %4
    "+r"(temp)                   // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
3591
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00003592// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
3593void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
3594 ptrdiff_t src_stride, int dst_width,
3595 int source_y_fraction) {
3596 asm volatile (
3597 "sub %1,%0 \n"
3598 "shr %3 \n"
3599 "cmp $0x0,%3 \n"
3600 "je 2f \n"
3601 "cmp $0x40,%3 \n"
3602 "je 3f \n"
3603 "movd %3,%%xmm0 \n"
3604 "neg %3 \n"
3605 "add $0x80,%3 \n"
3606 "movd %3,%%xmm5 \n"
3607 "punpcklbw %%xmm0,%%xmm5 \n"
3608 "punpcklwd %%xmm5,%%xmm5 \n"
3609 "pshufd $0x0,%%xmm5,%%xmm5 \n"
3610 ".p2align 4 \n"
3611 "1: \n"
3612 "movdqa (%1),%%xmm0 \n"
3613 "movdqa (%1,%4,1),%%xmm2 \n"
3614 "movdqa %%xmm0,%%xmm1 \n"
3615 "punpcklbw %%xmm2,%%xmm0 \n"
3616 "punpckhbw %%xmm2,%%xmm1 \n"
3617 "pmaddubsw %%xmm5,%%xmm0 \n"
3618 "pmaddubsw %%xmm5,%%xmm1 \n"
3619 "psrlw $0x7,%%xmm0 \n"
3620 "psrlw $0x7,%%xmm1 \n"
3621 "packuswb %%xmm1,%%xmm0 \n"
3622 "sub $0x4,%2 \n"
3623 "movdqa %%xmm0,(%1,%0,1) \n"
3624 "lea 0x10(%1),%1 \n"
3625 "jg 1b \n"
3626 "jmp 4f \n"
3627 ".p2align 4 \n"
3628 "2: \n"
3629 "movdqa (%1),%%xmm0 \n"
3630 "sub $0x4,%2 \n"
3631 "movdqa %%xmm0,(%1,%0,1) \n"
3632 "lea 0x10(%1),%1 \n"
3633 "jg 2b \n"
3634 "jmp 4f \n"
3635 ".p2align 4 \n"
3636 "3: \n"
3637 "movdqa (%1),%%xmm0 \n"
3638 "pavgb (%1,%4,1),%%xmm0 \n"
3639 "sub $0x4,%2 \n"
3640 "movdqa %%xmm0,(%1,%0,1) \n"
3641 "lea 0x10(%1),%1 \n"
3642 "jg 3b \n"
3643 "4: \n"
3644 ".p2align 4 \n"
3645 : "+r"(dst_ptr), // %0
3646 "+r"(src_ptr), // %1
3647 "+r"(dst_width), // %2
3648 "+r"(source_y_fraction) // %3
3649 : "r"(static_cast<intptr_t>(src_stride)) // %4
3650 : "memory", "cc"
3651#if defined(__SSE2__)
3652 , "xmm0", "xmm1", "xmm2", "xmm5"
3653#endif
3654 );
3655}
3656
fbarchard@google.com2d11d432012-02-16 02:50:39 +00003657#endif // defined(__x86_64__) || defined(__i386__)
3658
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003659#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003660} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003661} // namespace libyuv
3662#endif