blob: 74783d370e48bb35b95621d4de76f9af9892dbcf [file] [log] [blame]
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
// This module is for GCC x86 and x64.
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// Storage-class prefix used for the constant tables in this file.
// GCC 4.2 on OSX has link error when passing static or const to inline
// assembly, so on Apple targets the tables are plain (non-static, non-const)
// globals; everywhere else they are `static const`.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000101// Shuffle table for converting RGBA to ARGB.
102CONST uvec8 kShuffleMaskRGBAToARGB = {
103 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
104};
105
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000106// Shuffle table for converting ARGB to RGBA.
107CONST uvec8 kShuffleMaskARGBToRGBA = {
108 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
109};
110
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000111// Shuffle table for converting ARGB to RGB24.
112CONST uvec8 kShuffleMaskARGBToRGB24 = {
113 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
114};
115
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000116// Shuffle table for converting ARGB to RAW.
117CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000118 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000119};
120
fbarchard@google.comb6149762011-11-07 21:58:52 +0000121void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000122 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000123 "pcmpeqb %%xmm5,%%xmm5 \n"
124 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000125 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000126 "1: \n"
127 "movq (%0),%%xmm0 \n"
128 "lea 0x8(%0),%0 \n"
129 "punpcklbw %%xmm0,%%xmm0 \n"
130 "movdqa %%xmm0,%%xmm1 \n"
131 "punpcklwd %%xmm0,%%xmm0 \n"
132 "punpckhwd %%xmm1,%%xmm1 \n"
133 "por %%xmm5,%%xmm0 \n"
134 "por %%xmm5,%%xmm1 \n"
135 "movdqa %%xmm0,(%1) \n"
136 "movdqa %%xmm1,0x10(%1) \n"
137 "lea 0x20(%1),%1 \n"
138 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000139 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000140 : "+r"(src_y), // %0
141 "+r"(dst_argb), // %1
142 "+r"(pix) // %2
143 :
144 : "memory", "cc"
145#if defined(__SSE2__)
146 , "xmm0", "xmm1", "xmm5"
147#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000148 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000149}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000150
151void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000152 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000153 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000154 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000155 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000156 "1: \n"
157 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000158 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000159 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000160 "movdqa %%xmm0,(%0,%1,1) \n"
161 "lea 0x10(%0),%0 \n"
162 "jg 1b \n"
163
fbarchard@google.comb6149762011-11-07 21:58:52 +0000164 : "+r"(src_abgr), // %0
165 "+r"(dst_argb), // %1
166 "+r"(pix) // %2
167 : "m"(kShuffleMaskABGRToARGB) // %3
168 : "memory", "cc"
169#if defined(__SSE2__)
170 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000171#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000172 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173}
174
175void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000176 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000177 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000178 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000179 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000180 "1: \n"
181 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000182 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000183 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000184 "movdqa %%xmm0,(%0,%1,1) \n"
185 "lea 0x10(%0),%0 \n"
186 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000187 : "+r"(src_bgra), // %0
188 "+r"(dst_argb), // %1
189 "+r"(pix) // %2
190 : "m"(kShuffleMaskBGRAToARGB) // %3
191 : "memory", "cc"
192#if defined(__SSE2__)
193 , "xmm0", "xmm5"
194#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000195 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000196}
197
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000198void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
199 asm volatile (
200 "movdqa %3,%%xmm5 \n"
201 "sub %0,%1 \n"
202 ".p2align 4 \n"
203 "1: \n"
204 "movdqa (%0),%%xmm0 \n"
205 "pshufb %%xmm5,%%xmm0 \n"
206 "sub $0x4,%2 \n"
207 "movdqa %%xmm0,(%0,%1,1) \n"
208 "lea 0x10(%0),%0 \n"
209 "jg 1b \n"
210
211 : "+r"(src_rgba), // %0
212 "+r"(dst_argb), // %1
213 "+r"(pix) // %2
214 : "m"(kShuffleMaskRGBAToARGB) // %3
215 : "memory", "cc"
216#if defined(__SSE2__)
217 , "xmm0", "xmm5"
218#endif
219 );
220}
221
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000222void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
223 asm volatile (
224 "movdqa %3,%%xmm5 \n"
225 "sub %0,%1 \n"
226 ".p2align 4 \n"
227 "1: \n"
228 "movdqa (%0),%%xmm0 \n"
229 "pshufb %%xmm5,%%xmm0 \n"
230 "sub $0x4,%2 \n"
231 "movdqa %%xmm0,(%0,%1,1) \n"
232 "lea 0x10(%0),%0 \n"
233 "jg 1b \n"
234
235 : "+r"(src_argb), // %0
236 "+r"(dst_rgba), // %1
237 "+r"(pix) // %2
238 : "m"(kShuffleMaskARGBToRGBA) // %3
239 : "memory", "cc"
240#if defined(__SSE2__)
241 , "xmm0", "xmm5"
242#endif
243 );
244}
245
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000246void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000247 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000248 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
249 "pslld $0x18,%%xmm5 \n"
250 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000251 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000252 "1: \n"
253 "movdqu (%0),%%xmm0 \n"
254 "movdqu 0x10(%0),%%xmm1 \n"
255 "movdqu 0x20(%0),%%xmm3 \n"
256 "lea 0x30(%0),%0 \n"
257 "movdqa %%xmm3,%%xmm2 \n"
258 "palignr $0x8,%%xmm1,%%xmm2 \n"
259 "pshufb %%xmm4,%%xmm2 \n"
260 "por %%xmm5,%%xmm2 \n"
261 "palignr $0xc,%%xmm0,%%xmm1 \n"
262 "pshufb %%xmm4,%%xmm0 \n"
263 "movdqa %%xmm2,0x20(%1) \n"
264 "por %%xmm5,%%xmm0 \n"
265 "pshufb %%xmm4,%%xmm1 \n"
266 "movdqa %%xmm0,(%1) \n"
267 "por %%xmm5,%%xmm1 \n"
268 "palignr $0x4,%%xmm3,%%xmm3 \n"
269 "pshufb %%xmm4,%%xmm3 \n"
270 "movdqa %%xmm1,0x10(%1) \n"
271 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000272 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000273 "movdqa %%xmm3,0x30(%1) \n"
274 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000275 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000276 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000277 "+r"(dst_argb), // %1
278 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000279 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000280 : "memory", "cc"
281#if defined(__SSE2__)
282 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
283#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000284 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000285}
286
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000287void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000288 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000289 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
290 "pslld $0x18,%%xmm5 \n"
291 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000292 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000293 "1: \n"
294 "movdqu (%0),%%xmm0 \n"
295 "movdqu 0x10(%0),%%xmm1 \n"
296 "movdqu 0x20(%0),%%xmm3 \n"
297 "lea 0x30(%0),%0 \n"
298 "movdqa %%xmm3,%%xmm2 \n"
299 "palignr $0x8,%%xmm1,%%xmm2 \n"
300 "pshufb %%xmm4,%%xmm2 \n"
301 "por %%xmm5,%%xmm2 \n"
302 "palignr $0xc,%%xmm0,%%xmm1 \n"
303 "pshufb %%xmm4,%%xmm0 \n"
304 "movdqa %%xmm2,0x20(%1) \n"
305 "por %%xmm5,%%xmm0 \n"
306 "pshufb %%xmm4,%%xmm1 \n"
307 "movdqa %%xmm0,(%1) \n"
308 "por %%xmm5,%%xmm1 \n"
309 "palignr $0x4,%%xmm3,%%xmm3 \n"
310 "pshufb %%xmm4,%%xmm3 \n"
311 "movdqa %%xmm1,0x10(%1) \n"
312 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000313 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000314 "movdqa %%xmm3,0x30(%1) \n"
315 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000316 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000317 : "+r"(src_raw), // %0
318 "+r"(dst_argb), // %1
319 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000320 : "m"(kShuffleMaskRAWToARGB) // %3
321 : "memory", "cc"
322#if defined(__SSE2__)
323 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
324#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000325 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000326}
327
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000328void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000329 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000330 "mov $0x1080108,%%eax \n"
331 "movd %%eax,%%xmm5 \n"
332 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000333 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000334 "movd %%eax,%%xmm6 \n"
335 "pshufd $0x0,%%xmm6,%%xmm6 \n"
336 "pcmpeqb %%xmm3,%%xmm3 \n"
337 "psllw $0xb,%%xmm3 \n"
338 "pcmpeqb %%xmm4,%%xmm4 \n"
339 "psllw $0xa,%%xmm4 \n"
340 "psrlw $0x5,%%xmm4 \n"
341 "pcmpeqb %%xmm7,%%xmm7 \n"
342 "psllw $0x8,%%xmm7 \n"
343 "sub %0,%1 \n"
344 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000345 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000346 "1: \n"
347 "movdqu (%0),%%xmm0 \n"
348 "movdqa %%xmm0,%%xmm1 \n"
349 "movdqa %%xmm0,%%xmm2 \n"
350 "pand %%xmm3,%%xmm1 \n"
351 "psllw $0xb,%%xmm2 \n"
352 "pmulhuw %%xmm5,%%xmm1 \n"
353 "pmulhuw %%xmm5,%%xmm2 \n"
354 "psllw $0x8,%%xmm1 \n"
355 "por %%xmm2,%%xmm1 \n"
356 "pand %%xmm4,%%xmm0 \n"
357 "pmulhuw %%xmm6,%%xmm0 \n"
358 "por %%xmm7,%%xmm0 \n"
359 "movdqa %%xmm1,%%xmm2 \n"
360 "punpcklbw %%xmm0,%%xmm1 \n"
361 "punpckhbw %%xmm0,%%xmm2 \n"
362 "movdqa %%xmm1,(%1,%0,2) \n"
363 "movdqa %%xmm2,0x10(%1,%0,2) \n"
364 "lea 0x10(%0),%0 \n"
365 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000366 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000367 : "+r"(src), // %0
368 "+r"(dst), // %1
369 "+r"(pix) // %2
370 :
371 : "memory", "cc", "eax"
372#if defined(__SSE2__)
373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
374#endif
375 );
376}
377
378void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000379 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000380 "mov $0x1080108,%%eax \n"
381 "movd %%eax,%%xmm5 \n"
382 "pshufd $0x0,%%xmm5,%%xmm5 \n"
383 "mov $0x42004200,%%eax \n"
384 "movd %%eax,%%xmm6 \n"
385 "pshufd $0x0,%%xmm6,%%xmm6 \n"
386 "pcmpeqb %%xmm3,%%xmm3 \n"
387 "psllw $0xb,%%xmm3 \n"
388 "movdqa %%xmm3,%%xmm4 \n"
389 "psrlw $0x6,%%xmm4 \n"
390 "pcmpeqb %%xmm7,%%xmm7 \n"
391 "psllw $0x8,%%xmm7 \n"
392 "sub %0,%1 \n"
393 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000394 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000395 "1: \n"
396 "movdqu (%0),%%xmm0 \n"
397 "movdqa %%xmm0,%%xmm1 \n"
398 "movdqa %%xmm0,%%xmm2 \n"
399 "psllw $0x1,%%xmm1 \n"
400 "psllw $0xb,%%xmm2 \n"
401 "pand %%xmm3,%%xmm1 \n"
402 "pmulhuw %%xmm5,%%xmm2 \n"
403 "pmulhuw %%xmm5,%%xmm1 \n"
404 "psllw $0x8,%%xmm1 \n"
405 "por %%xmm2,%%xmm1 \n"
406 "movdqa %%xmm0,%%xmm2 \n"
407 "pand %%xmm4,%%xmm0 \n"
408 "psraw $0x8,%%xmm2 \n"
409 "pmulhuw %%xmm6,%%xmm0 \n"
410 "pand %%xmm7,%%xmm2 \n"
411 "por %%xmm2,%%xmm0 \n"
412 "movdqa %%xmm1,%%xmm2 \n"
413 "punpcklbw %%xmm0,%%xmm1 \n"
414 "punpckhbw %%xmm0,%%xmm2 \n"
415 "movdqa %%xmm1,(%1,%0,2) \n"
416 "movdqa %%xmm2,0x10(%1,%0,2) \n"
417 "lea 0x10(%0),%0 \n"
418 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000419 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000420 : "+r"(src), // %0
421 "+r"(dst), // %1
422 "+r"(pix) // %2
423 :
424 : "memory", "cc", "eax"
425#if defined(__SSE2__)
426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
427#endif
428 );
429}
430
431void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000432 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000433 "mov $0xf0f0f0f,%%eax \n"
434 "movd %%eax,%%xmm4 \n"
435 "pshufd $0x0,%%xmm4,%%xmm4 \n"
436 "movdqa %%xmm4,%%xmm5 \n"
437 "pslld $0x4,%%xmm5 \n"
438 "sub %0,%1 \n"
439 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000440 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000441 "1: \n"
442 "movdqu (%0),%%xmm0 \n"
443 "movdqa %%xmm0,%%xmm2 \n"
444 "pand %%xmm4,%%xmm0 \n"
445 "pand %%xmm5,%%xmm2 \n"
446 "movdqa %%xmm0,%%xmm1 \n"
447 "movdqa %%xmm2,%%xmm3 \n"
448 "psllw $0x4,%%xmm1 \n"
449 "psrlw $0x4,%%xmm3 \n"
450 "por %%xmm1,%%xmm0 \n"
451 "por %%xmm3,%%xmm2 \n"
452 "movdqa %%xmm0,%%xmm1 \n"
453 "punpcklbw %%xmm2,%%xmm0 \n"
454 "punpckhbw %%xmm2,%%xmm1 \n"
455 "movdqa %%xmm0,(%1,%0,2) \n"
456 "movdqa %%xmm1,0x10(%1,%0,2) \n"
457 "lea 0x10(%0),%0 \n"
458 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000459 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000460 : "+r"(src), // %0
461 "+r"(dst), // %1
462 "+r"(pix) // %2
463 :
464 : "memory", "cc", "eax"
465#if defined(__SSE2__)
466 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
467#endif
468 );
469}
470
471void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000472 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000473 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000474 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000475 "1: \n"
476 "movdqa (%0),%%xmm0 \n"
477 "movdqa 0x10(%0),%%xmm1 \n"
478 "movdqa 0x20(%0),%%xmm2 \n"
479 "movdqa 0x30(%0),%%xmm3 \n"
480 "lea 0x40(%0),%0 \n"
481 "pshufb %%xmm6,%%xmm0 \n"
482 "pshufb %%xmm6,%%xmm1 \n"
483 "pshufb %%xmm6,%%xmm2 \n"
484 "pshufb %%xmm6,%%xmm3 \n"
485 "movdqa %%xmm1,%%xmm4 \n"
486 "psrldq $0x4,%%xmm1 \n"
487 "pslldq $0xc,%%xmm4 \n"
488 "movdqa %%xmm2,%%xmm5 \n"
489 "por %%xmm4,%%xmm0 \n"
490 "pslldq $0x8,%%xmm5 \n"
491 "movdqa %%xmm0,(%1) \n"
492 "por %%xmm5,%%xmm1 \n"
493 "psrldq $0x8,%%xmm2 \n"
494 "pslldq $0x4,%%xmm3 \n"
495 "por %%xmm3,%%xmm2 \n"
496 "movdqa %%xmm1,0x10(%1) \n"
497 "movdqa %%xmm2,0x20(%1) \n"
498 "lea 0x30(%1),%1 \n"
499 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000500 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000501 : "+r"(src), // %0
502 "+r"(dst), // %1
503 "+r"(pix) // %2
504 : "m"(kShuffleMaskARGBToRGB24) // %3
505 : "memory", "cc"
506#if defined(__SSE2__)
507 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
508#endif
509 );
510}
511
512void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000513 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000514 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000515 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000516 "1: \n"
517 "movdqa (%0),%%xmm0 \n"
518 "movdqa 0x10(%0),%%xmm1 \n"
519 "movdqa 0x20(%0),%%xmm2 \n"
520 "movdqa 0x30(%0),%%xmm3 \n"
521 "lea 0x40(%0),%0 \n"
522 "pshufb %%xmm6,%%xmm0 \n"
523 "pshufb %%xmm6,%%xmm1 \n"
524 "pshufb %%xmm6,%%xmm2 \n"
525 "pshufb %%xmm6,%%xmm3 \n"
526 "movdqa %%xmm1,%%xmm4 \n"
527 "psrldq $0x4,%%xmm1 \n"
528 "pslldq $0xc,%%xmm4 \n"
529 "movdqa %%xmm2,%%xmm5 \n"
530 "por %%xmm4,%%xmm0 \n"
531 "pslldq $0x8,%%xmm5 \n"
532 "movdqa %%xmm0,(%1) \n"
533 "por %%xmm5,%%xmm1 \n"
534 "psrldq $0x8,%%xmm2 \n"
535 "pslldq $0x4,%%xmm3 \n"
536 "por %%xmm3,%%xmm2 \n"
537 "movdqa %%xmm1,0x10(%1) \n"
538 "movdqa %%xmm2,0x20(%1) \n"
539 "lea 0x30(%1),%1 \n"
540 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000541 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000542 : "+r"(src), // %0
543 "+r"(dst), // %1
544 "+r"(pix) // %2
545 : "m"(kShuffleMaskARGBToRAW) // %3
546 : "memory", "cc"
547#if defined(__SSE2__)
548 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
549#endif
550 );
551}
552
553void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000554 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000555 "pcmpeqb %%xmm3,%%xmm3 \n"
556 "psrld $0x1b,%%xmm3 \n"
557 "pcmpeqb %%xmm4,%%xmm4 \n"
558 "psrld $0x1a,%%xmm4 \n"
559 "pslld $0x5,%%xmm4 \n"
560 "pcmpeqb %%xmm5,%%xmm5 \n"
561 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000562 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000563 "1: \n"
564 "movdqa (%0),%%xmm0 \n"
565 "movdqa %%xmm0,%%xmm1 \n"
566 "movdqa %%xmm0,%%xmm2 \n"
567 "pslld $0x8,%%xmm0 \n"
568 "psrld $0x3,%%xmm1 \n"
569 "psrld $0x5,%%xmm2 \n"
570 "psrad $0x10,%%xmm0 \n"
571 "pand %%xmm3,%%xmm1 \n"
572 "pand %%xmm4,%%xmm2 \n"
573 "pand %%xmm5,%%xmm0 \n"
574 "por %%xmm2,%%xmm1 \n"
575 "por %%xmm1,%%xmm0 \n"
576 "packssdw %%xmm0,%%xmm0 \n"
577 "lea 0x10(%0),%0 \n"
578 "movq %%xmm0,(%1) \n"
579 "lea 0x8(%1),%1 \n"
580 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000581 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000582 : "+r"(src), // %0
583 "+r"(dst), // %1
584 "+r"(pix) // %2
585 :
586 : "memory", "cc"
587#if defined(__SSE2__)
588 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
589#endif
590 );
591}
592
593void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000594 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000595 "pcmpeqb %%xmm4,%%xmm4 \n"
596 "psrld $0x1b,%%xmm4 \n"
597 "movdqa %%xmm4,%%xmm5 \n"
598 "pslld $0x5,%%xmm5 \n"
599 "movdqa %%xmm4,%%xmm6 \n"
600 "pslld $0xa,%%xmm6 \n"
601 "pcmpeqb %%xmm7,%%xmm7 \n"
602 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000603 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000604 "1: \n"
605 "movdqa (%0),%%xmm0 \n"
606 "movdqa %%xmm0,%%xmm1 \n"
607 "movdqa %%xmm0,%%xmm2 \n"
608 "movdqa %%xmm0,%%xmm3 \n"
609 "psrad $0x10,%%xmm0 \n"
610 "psrld $0x3,%%xmm1 \n"
611 "psrld $0x6,%%xmm2 \n"
612 "psrld $0x9,%%xmm3 \n"
613 "pand %%xmm7,%%xmm0 \n"
614 "pand %%xmm4,%%xmm1 \n"
615 "pand %%xmm5,%%xmm2 \n"
616 "pand %%xmm6,%%xmm3 \n"
617 "por %%xmm1,%%xmm0 \n"
618 "por %%xmm3,%%xmm2 \n"
619 "por %%xmm2,%%xmm0 \n"
620 "packssdw %%xmm0,%%xmm0 \n"
621 "lea 0x10(%0),%0 \n"
622 "movq %%xmm0,(%1) \n"
623 "lea 0x8(%1),%1 \n"
624 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000625 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000626 : "+r"(src), // %0
627 "+r"(dst), // %1
628 "+r"(pix) // %2
629 :
630 : "memory", "cc"
631#if defined(__SSE2__)
632 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
633#endif
634 );
635}
636
637void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000638 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000639 "pcmpeqb %%xmm4,%%xmm4 \n"
640 "psllw $0xc,%%xmm4 \n"
641 "movdqa %%xmm4,%%xmm3 \n"
642 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000643 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000644 "1: \n"
645 "movdqa (%0),%%xmm0 \n"
646 "movdqa %%xmm0,%%xmm1 \n"
647 "pand %%xmm3,%%xmm0 \n"
648 "pand %%xmm4,%%xmm1 \n"
649 "psrlq $0x4,%%xmm0 \n"
650 "psrlq $0x8,%%xmm1 \n"
651 "por %%xmm1,%%xmm0 \n"
652 "packuswb %%xmm0,%%xmm0 \n"
653 "lea 0x10(%0),%0 \n"
654 "movq %%xmm0,(%1) \n"
655 "lea 0x8(%1),%1 \n"
656 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000657 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000658 : "+r"(src), // %0
659 "+r"(dst), // %1
660 "+r"(pix) // %2
661 :
662 : "memory", "cc"
663#if defined(__SSE2__)
664 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
665#endif
666 );
667}
668
fbarchard@google.comb6149762011-11-07 21:58:52 +0000669void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000670 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000671 "movdqa %4,%%xmm5 \n"
672 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000673 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000674 "1: \n"
675 "movdqa (%0),%%xmm0 \n"
676 "movdqa 0x10(%0),%%xmm1 \n"
677 "movdqa 0x20(%0),%%xmm2 \n"
678 "movdqa 0x30(%0),%%xmm3 \n"
679 "pmaddubsw %%xmm4,%%xmm0 \n"
680 "pmaddubsw %%xmm4,%%xmm1 \n"
681 "pmaddubsw %%xmm4,%%xmm2 \n"
682 "pmaddubsw %%xmm4,%%xmm3 \n"
683 "lea 0x40(%0),%0 \n"
684 "phaddw %%xmm1,%%xmm0 \n"
685 "phaddw %%xmm3,%%xmm2 \n"
686 "psrlw $0x7,%%xmm0 \n"
687 "psrlw $0x7,%%xmm2 \n"
688 "packuswb %%xmm2,%%xmm0 \n"
689 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000690 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000691 "movdqa %%xmm0,(%1) \n"
692 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000693 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000694 : "+r"(src_argb), // %0
695 "+r"(dst_y), // %1
696 "+r"(pix) // %2
697 : "m"(kARGBToY), // %3
698 "m"(kAddY16) // %4
699 : "memory", "cc"
700#if defined(__SSE2__)
701 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
702#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000703 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000704}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000705
706void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000707 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000708 "movdqa %4,%%xmm5 \n"
709 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000710 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000711 "1: \n"
712 "movdqu (%0),%%xmm0 \n"
713 "movdqu 0x10(%0),%%xmm1 \n"
714 "movdqu 0x20(%0),%%xmm2 \n"
715 "movdqu 0x30(%0),%%xmm3 \n"
716 "pmaddubsw %%xmm4,%%xmm0 \n"
717 "pmaddubsw %%xmm4,%%xmm1 \n"
718 "pmaddubsw %%xmm4,%%xmm2 \n"
719 "pmaddubsw %%xmm4,%%xmm3 \n"
720 "lea 0x40(%0),%0 \n"
721 "phaddw %%xmm1,%%xmm0 \n"
722 "phaddw %%xmm3,%%xmm2 \n"
723 "psrlw $0x7,%%xmm0 \n"
724 "psrlw $0x7,%%xmm2 \n"
725 "packuswb %%xmm2,%%xmm0 \n"
726 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000727 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000728 "movdqu %%xmm0,(%1) \n"
729 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000730 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000731 : "+r"(src_argb), // %0
732 "+r"(dst_y), // %1
733 "+r"(pix) // %2
734 : "m"(kARGBToY), // %3
735 "m"(kAddY16) // %4
736 : "memory", "cc"
737#if defined(__SSE2__)
738 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
739#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000740 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000741}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000742
fbarchard@google.com714050a2012-02-17 22:59:56 +0000743// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000744// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
745// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
746// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000747// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000748void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
749 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000750 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000751 "movdqa %0,%%xmm4 \n"
752 "movdqa %1,%%xmm3 \n"
753 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000754 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000755 : "m"(kARGBToU), // %0
756 "m"(kARGBToV), // %1
757 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000758 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000759 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000760 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000761 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000762 "1: \n"
763 "movdqa (%0),%%xmm0 \n"
764 "movdqa 0x10(%0),%%xmm1 \n"
765 "movdqa 0x20(%0),%%xmm2 \n"
766 "movdqa 0x30(%0),%%xmm6 \n"
767 "pavgb (%0,%4,1),%%xmm0 \n"
768 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
769 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
770 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
771 "lea 0x40(%0),%0 \n"
772 "movdqa %%xmm0,%%xmm7 \n"
773 "shufps $0x88,%%xmm1,%%xmm0 \n"
774 "shufps $0xdd,%%xmm1,%%xmm7 \n"
775 "pavgb %%xmm7,%%xmm0 \n"
776 "movdqa %%xmm2,%%xmm7 \n"
777 "shufps $0x88,%%xmm6,%%xmm2 \n"
778 "shufps $0xdd,%%xmm6,%%xmm7 \n"
779 "pavgb %%xmm7,%%xmm2 \n"
780 "movdqa %%xmm0,%%xmm1 \n"
781 "movdqa %%xmm2,%%xmm6 \n"
782 "pmaddubsw %%xmm4,%%xmm0 \n"
783 "pmaddubsw %%xmm4,%%xmm2 \n"
784 "pmaddubsw %%xmm3,%%xmm1 \n"
785 "pmaddubsw %%xmm3,%%xmm6 \n"
786 "phaddw %%xmm2,%%xmm0 \n"
787 "phaddw %%xmm6,%%xmm1 \n"
788 "psraw $0x8,%%xmm0 \n"
789 "psraw $0x8,%%xmm1 \n"
790 "packsswb %%xmm1,%%xmm0 \n"
791 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000792 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000793 "movlps %%xmm0,(%1) \n"
794 "movhps %%xmm0,(%1,%2,1) \n"
795 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000796 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000797 : "+r"(src_argb0), // %0
798 "+r"(dst_u), // %1
799 "+r"(dst_v), // %2
800 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000801 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000802 : "memory", "cc"
803#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000804 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000805#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000806 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000807}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000808
809void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
810 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000811 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000812 "movdqa %0,%%xmm4 \n"
813 "movdqa %1,%%xmm3 \n"
814 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000815 :
816 : "m"(kARGBToU), // %0
817 "m"(kARGBToV), // %1
818 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000819 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000820 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000821 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000822 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000823 "1: \n"
824 "movdqu (%0),%%xmm0 \n"
825 "movdqu 0x10(%0),%%xmm1 \n"
826 "movdqu 0x20(%0),%%xmm2 \n"
827 "movdqu 0x30(%0),%%xmm6 \n"
828 "movdqu (%0,%4,1),%%xmm7 \n"
829 "pavgb %%xmm7,%%xmm0 \n"
830 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
831 "pavgb %%xmm7,%%xmm1 \n"
832 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
833 "pavgb %%xmm7,%%xmm2 \n"
834 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
835 "pavgb %%xmm7,%%xmm6 \n"
836 "lea 0x40(%0),%0 \n"
837 "movdqa %%xmm0,%%xmm7 \n"
838 "shufps $0x88,%%xmm1,%%xmm0 \n"
839 "shufps $0xdd,%%xmm1,%%xmm7 \n"
840 "pavgb %%xmm7,%%xmm0 \n"
841 "movdqa %%xmm2,%%xmm7 \n"
842 "shufps $0x88,%%xmm6,%%xmm2 \n"
843 "shufps $0xdd,%%xmm6,%%xmm7 \n"
844 "pavgb %%xmm7,%%xmm2 \n"
845 "movdqa %%xmm0,%%xmm1 \n"
846 "movdqa %%xmm2,%%xmm6 \n"
847 "pmaddubsw %%xmm4,%%xmm0 \n"
848 "pmaddubsw %%xmm4,%%xmm2 \n"
849 "pmaddubsw %%xmm3,%%xmm1 \n"
850 "pmaddubsw %%xmm3,%%xmm6 \n"
851 "phaddw %%xmm2,%%xmm0 \n"
852 "phaddw %%xmm6,%%xmm1 \n"
853 "psraw $0x8,%%xmm0 \n"
854 "psraw $0x8,%%xmm1 \n"
855 "packsswb %%xmm1,%%xmm0 \n"
856 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000857 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000858 "movlps %%xmm0,(%1) \n"
859 "movhps %%xmm0,(%1,%2,1) \n"
860 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000861 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000862 : "+r"(src_argb0), // %0
863 "+r"(dst_u), // %1
864 "+r"(dst_v), // %2
865 "+rm"(width) // %3
866 : "r"(static_cast<intptr_t>(src_stride_argb))
867 : "memory", "cc"
868#if defined(__SSE2__)
869 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
870#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000871 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000872}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000873
fbarchard@google.com714050a2012-02-17 22:59:56 +0000874void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000875 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000876 "movdqa %4,%%xmm5 \n"
877 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000878 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000879 "1: \n"
880 "movdqa (%0),%%xmm0 \n"
881 "movdqa 0x10(%0),%%xmm1 \n"
882 "movdqa 0x20(%0),%%xmm2 \n"
883 "movdqa 0x30(%0),%%xmm3 \n"
884 "pmaddubsw %%xmm4,%%xmm0 \n"
885 "pmaddubsw %%xmm4,%%xmm1 \n"
886 "pmaddubsw %%xmm4,%%xmm2 \n"
887 "pmaddubsw %%xmm4,%%xmm3 \n"
888 "lea 0x40(%0),%0 \n"
889 "phaddw %%xmm1,%%xmm0 \n"
890 "phaddw %%xmm3,%%xmm2 \n"
891 "psrlw $0x7,%%xmm0 \n"
892 "psrlw $0x7,%%xmm2 \n"
893 "packuswb %%xmm2,%%xmm0 \n"
894 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000895 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000896 "movdqa %%xmm0,(%1) \n"
897 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000898 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000899 : "+r"(src_bgra), // %0
900 "+r"(dst_y), // %1
901 "+r"(pix) // %2
902 : "m"(kBGRAToY), // %3
903 "m"(kAddY16) // %4
904 : "memory", "cc"
905#if defined(__SSE2__)
906 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000907#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000908 );
909}
910
911void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000912 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000913 "movdqa %4,%%xmm5 \n"
914 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000915 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000916 "1: \n"
917 "movdqu (%0),%%xmm0 \n"
918 "movdqu 0x10(%0),%%xmm1 \n"
919 "movdqu 0x20(%0),%%xmm2 \n"
920 "movdqu 0x30(%0),%%xmm3 \n"
921 "pmaddubsw %%xmm4,%%xmm0 \n"
922 "pmaddubsw %%xmm4,%%xmm1 \n"
923 "pmaddubsw %%xmm4,%%xmm2 \n"
924 "pmaddubsw %%xmm4,%%xmm3 \n"
925 "lea 0x40(%0),%0 \n"
926 "phaddw %%xmm1,%%xmm0 \n"
927 "phaddw %%xmm3,%%xmm2 \n"
928 "psrlw $0x7,%%xmm0 \n"
929 "psrlw $0x7,%%xmm2 \n"
930 "packuswb %%xmm2,%%xmm0 \n"
931 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000932 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000933 "movdqu %%xmm0,(%1) \n"
934 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000935 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000936 : "+r"(src_bgra), // %0
937 "+r"(dst_y), // %1
938 "+r"(pix) // %2
939 : "m"(kBGRAToY), // %3
940 "m"(kAddY16) // %4
941 : "memory", "cc"
942#if defined(__SSE2__)
943 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
944#endif
945 );
946}
947
948void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
949 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000950 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000951 "movdqa %0,%%xmm4 \n"
952 "movdqa %1,%%xmm3 \n"
953 "movdqa %2,%%xmm5 \n"
954 :
955 : "m"(kBGRAToU), // %0
956 "m"(kBGRAToV), // %1
957 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000958 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000959 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000960 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000961 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000962 "1: \n"
963 "movdqa (%0),%%xmm0 \n"
964 "movdqa 0x10(%0),%%xmm1 \n"
965 "movdqa 0x20(%0),%%xmm2 \n"
966 "movdqa 0x30(%0),%%xmm6 \n"
967 "pavgb (%0,%4,1),%%xmm0 \n"
968 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
969 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
970 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
971 "lea 0x40(%0),%0 \n"
972 "movdqa %%xmm0,%%xmm7 \n"
973 "shufps $0x88,%%xmm1,%%xmm0 \n"
974 "shufps $0xdd,%%xmm1,%%xmm7 \n"
975 "pavgb %%xmm7,%%xmm0 \n"
976 "movdqa %%xmm2,%%xmm7 \n"
977 "shufps $0x88,%%xmm6,%%xmm2 \n"
978 "shufps $0xdd,%%xmm6,%%xmm7 \n"
979 "pavgb %%xmm7,%%xmm2 \n"
980 "movdqa %%xmm0,%%xmm1 \n"
981 "movdqa %%xmm2,%%xmm6 \n"
982 "pmaddubsw %%xmm4,%%xmm0 \n"
983 "pmaddubsw %%xmm4,%%xmm2 \n"
984 "pmaddubsw %%xmm3,%%xmm1 \n"
985 "pmaddubsw %%xmm3,%%xmm6 \n"
986 "phaddw %%xmm2,%%xmm0 \n"
987 "phaddw %%xmm6,%%xmm1 \n"
988 "psraw $0x8,%%xmm0 \n"
989 "psraw $0x8,%%xmm1 \n"
990 "packsswb %%xmm1,%%xmm0 \n"
991 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000992 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000993 "movlps %%xmm0,(%1) \n"
994 "movhps %%xmm0,(%1,%2,1) \n"
995 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000996 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000997 : "+r"(src_bgra0), // %0
998 "+r"(dst_u), // %1
999 "+r"(dst_v), // %2
1000 "+rm"(width) // %3
1001 : "r"(static_cast<intptr_t>(src_stride_bgra))
1002 : "memory", "cc"
1003#if defined(__SSE2__)
1004 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1005#endif
1006 );
1007}
1008
1009void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1010 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001011 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001012 "movdqa %0,%%xmm4 \n"
1013 "movdqa %1,%%xmm3 \n"
1014 "movdqa %2,%%xmm5 \n"
1015 :
1016 : "m"(kBGRAToU), // %0
1017 "m"(kBGRAToV), // %1
1018 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001019 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001020 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001021 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001022 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001023 "1: \n"
1024 "movdqu (%0),%%xmm0 \n"
1025 "movdqu 0x10(%0),%%xmm1 \n"
1026 "movdqu 0x20(%0),%%xmm2 \n"
1027 "movdqu 0x30(%0),%%xmm6 \n"
1028 "movdqu (%0,%4,1),%%xmm7 \n"
1029 "pavgb %%xmm7,%%xmm0 \n"
1030 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1031 "pavgb %%xmm7,%%xmm1 \n"
1032 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1033 "pavgb %%xmm7,%%xmm2 \n"
1034 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1035 "pavgb %%xmm7,%%xmm6 \n"
1036 "lea 0x40(%0),%0 \n"
1037 "movdqa %%xmm0,%%xmm7 \n"
1038 "shufps $0x88,%%xmm1,%%xmm0 \n"
1039 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1040 "pavgb %%xmm7,%%xmm0 \n"
1041 "movdqa %%xmm2,%%xmm7 \n"
1042 "shufps $0x88,%%xmm6,%%xmm2 \n"
1043 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1044 "pavgb %%xmm7,%%xmm2 \n"
1045 "movdqa %%xmm0,%%xmm1 \n"
1046 "movdqa %%xmm2,%%xmm6 \n"
1047 "pmaddubsw %%xmm4,%%xmm0 \n"
1048 "pmaddubsw %%xmm4,%%xmm2 \n"
1049 "pmaddubsw %%xmm3,%%xmm1 \n"
1050 "pmaddubsw %%xmm3,%%xmm6 \n"
1051 "phaddw %%xmm2,%%xmm0 \n"
1052 "phaddw %%xmm6,%%xmm1 \n"
1053 "psraw $0x8,%%xmm0 \n"
1054 "psraw $0x8,%%xmm1 \n"
1055 "packsswb %%xmm1,%%xmm0 \n"
1056 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001057 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001058 "movlps %%xmm0,(%1) \n"
1059 "movhps %%xmm0,(%1,%2,1) \n"
1060 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001061 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001062 : "+r"(src_bgra0), // %0
1063 "+r"(dst_u), // %1
1064 "+r"(dst_v), // %2
1065 "+rm"(width) // %3
1066 : "r"(static_cast<intptr_t>(src_stride_bgra))
1067 : "memory", "cc"
1068#if defined(__SSE2__)
1069 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1070#endif
1071 );
1072}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001073
1074void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001075 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001076 "movdqa %4,%%xmm5 \n"
1077 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001078 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001079 "1: \n"
1080 "movdqa (%0),%%xmm0 \n"
1081 "movdqa 0x10(%0),%%xmm1 \n"
1082 "movdqa 0x20(%0),%%xmm2 \n"
1083 "movdqa 0x30(%0),%%xmm3 \n"
1084 "pmaddubsw %%xmm4,%%xmm0 \n"
1085 "pmaddubsw %%xmm4,%%xmm1 \n"
1086 "pmaddubsw %%xmm4,%%xmm2 \n"
1087 "pmaddubsw %%xmm4,%%xmm3 \n"
1088 "lea 0x40(%0),%0 \n"
1089 "phaddw %%xmm1,%%xmm0 \n"
1090 "phaddw %%xmm3,%%xmm2 \n"
1091 "psrlw $0x7,%%xmm0 \n"
1092 "psrlw $0x7,%%xmm2 \n"
1093 "packuswb %%xmm2,%%xmm0 \n"
1094 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001095 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001096 "movdqa %%xmm0,(%1) \n"
1097 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001098 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001099 : "+r"(src_abgr), // %0
1100 "+r"(dst_y), // %1
1101 "+r"(pix) // %2
1102 : "m"(kABGRToY), // %3
1103 "m"(kAddY16) // %4
1104 : "memory", "cc"
1105#if defined(__SSE2__)
1106 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1107#endif
1108 );
1109}
1110
1111void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001112 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001113 "movdqa %4,%%xmm5 \n"
1114 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001115 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001116 "1: \n"
1117 "movdqu (%0),%%xmm0 \n"
1118 "movdqu 0x10(%0),%%xmm1 \n"
1119 "movdqu 0x20(%0),%%xmm2 \n"
1120 "movdqu 0x30(%0),%%xmm3 \n"
1121 "pmaddubsw %%xmm4,%%xmm0 \n"
1122 "pmaddubsw %%xmm4,%%xmm1 \n"
1123 "pmaddubsw %%xmm4,%%xmm2 \n"
1124 "pmaddubsw %%xmm4,%%xmm3 \n"
1125 "lea 0x40(%0),%0 \n"
1126 "phaddw %%xmm1,%%xmm0 \n"
1127 "phaddw %%xmm3,%%xmm2 \n"
1128 "psrlw $0x7,%%xmm0 \n"
1129 "psrlw $0x7,%%xmm2 \n"
1130 "packuswb %%xmm2,%%xmm0 \n"
1131 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001132 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001133 "movdqu %%xmm0,(%1) \n"
1134 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001135 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001136 : "+r"(src_abgr), // %0
1137 "+r"(dst_y), // %1
1138 "+r"(pix) // %2
1139 : "m"(kABGRToY), // %3
1140 "m"(kAddY16) // %4
1141 : "memory", "cc"
1142#if defined(__SSE2__)
1143 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1144#endif
1145 );
1146}
1147
1148void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1149 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001150 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001151 "movdqa %0,%%xmm4 \n"
1152 "movdqa %1,%%xmm3 \n"
1153 "movdqa %2,%%xmm5 \n"
1154 :
1155 : "m"(kABGRToU), // %0
1156 "m"(kABGRToV), // %1
1157 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001158 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001159 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001160 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001161 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001162 "1: \n"
1163 "movdqa (%0),%%xmm0 \n"
1164 "movdqa 0x10(%0),%%xmm1 \n"
1165 "movdqa 0x20(%0),%%xmm2 \n"
1166 "movdqa 0x30(%0),%%xmm6 \n"
1167 "pavgb (%0,%4,1),%%xmm0 \n"
1168 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1169 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1170 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1171 "lea 0x40(%0),%0 \n"
1172 "movdqa %%xmm0,%%xmm7 \n"
1173 "shufps $0x88,%%xmm1,%%xmm0 \n"
1174 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1175 "pavgb %%xmm7,%%xmm0 \n"
1176 "movdqa %%xmm2,%%xmm7 \n"
1177 "shufps $0x88,%%xmm6,%%xmm2 \n"
1178 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1179 "pavgb %%xmm7,%%xmm2 \n"
1180 "movdqa %%xmm0,%%xmm1 \n"
1181 "movdqa %%xmm2,%%xmm6 \n"
1182 "pmaddubsw %%xmm4,%%xmm0 \n"
1183 "pmaddubsw %%xmm4,%%xmm2 \n"
1184 "pmaddubsw %%xmm3,%%xmm1 \n"
1185 "pmaddubsw %%xmm3,%%xmm6 \n"
1186 "phaddw %%xmm2,%%xmm0 \n"
1187 "phaddw %%xmm6,%%xmm1 \n"
1188 "psraw $0x8,%%xmm0 \n"
1189 "psraw $0x8,%%xmm1 \n"
1190 "packsswb %%xmm1,%%xmm0 \n"
1191 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001192 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001193 "movlps %%xmm0,(%1) \n"
1194 "movhps %%xmm0,(%1,%2,1) \n"
1195 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001196 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001197 : "+r"(src_abgr0), // %0
1198 "+r"(dst_u), // %1
1199 "+r"(dst_v), // %2
1200 "+rm"(width) // %3
1201 : "r"(static_cast<intptr_t>(src_stride_abgr))
1202 : "memory", "cc"
1203#if defined(__SSE2__)
1204 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1205#endif
1206 );
1207}
1208
1209void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1210 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001211 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001212 "movdqa %0,%%xmm4 \n"
1213 "movdqa %1,%%xmm3 \n"
1214 "movdqa %2,%%xmm5 \n"
1215 :
1216 : "m"(kABGRToU), // %0
1217 "m"(kABGRToV), // %1
1218 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001219 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001220 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001221 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001222 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001223 "1: \n"
1224 "movdqu (%0),%%xmm0 \n"
1225 "movdqu 0x10(%0),%%xmm1 \n"
1226 "movdqu 0x20(%0),%%xmm2 \n"
1227 "movdqu 0x30(%0),%%xmm6 \n"
1228 "movdqu (%0,%4,1),%%xmm7 \n"
1229 "pavgb %%xmm7,%%xmm0 \n"
1230 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1231 "pavgb %%xmm7,%%xmm1 \n"
1232 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1233 "pavgb %%xmm7,%%xmm2 \n"
1234 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1235 "pavgb %%xmm7,%%xmm6 \n"
1236 "lea 0x40(%0),%0 \n"
1237 "movdqa %%xmm0,%%xmm7 \n"
1238 "shufps $0x88,%%xmm1,%%xmm0 \n"
1239 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1240 "pavgb %%xmm7,%%xmm0 \n"
1241 "movdqa %%xmm2,%%xmm7 \n"
1242 "shufps $0x88,%%xmm6,%%xmm2 \n"
1243 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1244 "pavgb %%xmm7,%%xmm2 \n"
1245 "movdqa %%xmm0,%%xmm1 \n"
1246 "movdqa %%xmm2,%%xmm6 \n"
1247 "pmaddubsw %%xmm4,%%xmm0 \n"
1248 "pmaddubsw %%xmm4,%%xmm2 \n"
1249 "pmaddubsw %%xmm3,%%xmm1 \n"
1250 "pmaddubsw %%xmm3,%%xmm6 \n"
1251 "phaddw %%xmm2,%%xmm0 \n"
1252 "phaddw %%xmm6,%%xmm1 \n"
1253 "psraw $0x8,%%xmm0 \n"
1254 "psraw $0x8,%%xmm1 \n"
1255 "packsswb %%xmm1,%%xmm0 \n"
1256 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001257 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001258 "movlps %%xmm0,(%1) \n"
1259 "movhps %%xmm0,(%1,%2,1) \n"
1260 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001261 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001262 : "+r"(src_abgr0), // %0
1263 "+r"(dst_u), // %1
1264 "+r"(dst_v), // %2
1265 "+rm"(width) // %3
1266 : "r"(static_cast<intptr_t>(src_stride_abgr))
1267 : "memory", "cc"
1268#if defined(__SSE2__)
1269 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1270#endif
1271 );
1272}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001273#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001274
fbarchard@google.come214fe32012-06-04 23:47:11 +00001275#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients in fixed point, scaled by 64 (the conversion macros
// shift right by 6 at the end). Negative values and sum expressions are
// parenthesized so each macro expands safely inside a larger expression.
#define UB 127 /* 2.018 * 64 = 129, clamped to the int8 maximum of 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the chroma contribution at U = V = 128. The conversion macros subtract
// it so that chroma is effectively centered on zero.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001290
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001291struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001292 vec8 kUVToB; // 0
1293 vec8 kUVToG; // 16
1294 vec8 kUVToR; // 32
1295 vec16 kUVBiasB; // 48
1296 vec16 kUVBiasG; // 64
1297 vec16 kUVBiasR; // 80
1298 vec16 kYSub16; // 96
1299 vec16 kYToRgb; // 112
1300 vec8 kVUToB; // 128
1301 vec8 kVUToG; // 144
1302 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001303} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001304 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1305 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1306 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1307 { BB, BB, BB, BB, BB, BB, BB, BB },
1308 { BG, BG, BG, BG, BG, BG, BG, BG },
1309 { BR, BR, BR, BR, BR, BR, BR, BR },
1310 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001311 { YG, YG, YG, YG, YG, YG, YG, YG },
1312 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1313 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1314 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001315};
1316
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001317
// Read 8 UV from 444.
// Loads 8 U bytes from u_buf and 8 V bytes from u_buf + v_buf (v_buf holds the
// V plane's offset from U), advances u_buf by 8, and leaves 8 interleaved U,V
// byte pairs in xmm0. Clobbers xmm1.
#define READYUV444                                                             \
    "movq       (%[u_buf]),%%xmm0              \n"                             \
    "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x8(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001324
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 U bytes from u_buf and 4 V bytes from u_buf + v_buf (v_buf holds the
// V plane's offset from U), advances u_buf by 4, and duplicates each U,V pair
// so xmm0 ends up with 8 interleaved pairs. Clobbers xmm1.
#define READYUV422                                                             \
    "movd       (%[u_buf]),%%xmm0              \n"                             \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x4(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001332
// Read 2 UV from 411, upsample to 8 UV.
// Loads exactly 2 U bytes from u_buf and 2 V bytes from u_buf + v_buf (v_buf
// holds the V plane's offset from U), advances u_buf by 2, and replicates each
// U,V pair four times so xmm0 ends up with 8 interleaved pairs. Clobbers xmm1.
//
// pinsrw with a memory operand reads only 16 bits, so nothing past the two
// consumed bytes is touched. It leaves the upper bytes of xmm0/xmm1 stale, but
// the unpack chain below only propagates the low two bytes of each register.
#define READYUV411                                                             \
    "pinsrw     $0x0,(%[u_buf]),%%xmm0         \n"                             \
    "pinsrw     $0x0,(%[u_buf],%[v_buf],1),%%xmm1 \n"                          \
    "lea        0x2(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
    "punpckldq  %%xmm0,%%xmm0                  \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001341
// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 bytes (4 interleaved chroma pairs) from uv_buf, advances uv_buf by
// 8, and duplicates each pair so xmm0 ends up with 8 pairs.
#define READNV12                                                               \
    "movq       (%[uv_buf]),%%xmm0             \n"                             \
    "lea        0x8(%[uv_buf]),%[uv_buf]       \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001347
// Convert 8 pixels: 8 UV and 8 Y.
// On entry xmm0 holds 8 interleaved U,V byte pairs and xmm4 must be zero (it
// zero-extends the Y bytes). Reads 8 Y bytes from y_buf and advances it by 8.
// The chroma terms use the kUVToB/G/R tables (offsets 0/16/32) minus the
// kUVBiasB/G/R values (48/64/80); the luma term is (Y - kYSub16) * kYToRgb
// (96/112). The sums are shifted right by 6 and saturated to bytes.
// On exit the low 8 bytes of xmm0, xmm1 and xmm2 hold B, G and R.
// Clobbers xmm3.
#define YUVTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                  \n"                             \
    "movdqa     %%xmm0,%%xmm2                  \n"                             \
    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n"                             \
    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n"                             \
    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n"                             \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
    "movq       (%[y_buf]),%%xmm3              \n"                             \
    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
    "paddsw     %%xmm3,%%xmm0                  \n"                             \
    "paddsw     %%xmm3,%%xmm1                  \n"                             \
    "paddsw     %%xmm3,%%xmm2                  \n"                             \
    "psraw      $0x6,%%xmm0                    \n"                             \
    "psraw      $0x6,%%xmm1                    \n"                             \
    "psraw      $0x6,%%xmm2                    \n"                             \
    "packuswb   %%xmm0,%%xmm0                  \n"                             \
    "packuswb   %%xmm1,%%xmm1                  \n"                             \
    "packuswb   %%xmm2,%%xmm2                  \n"
1372
// Convert 8 pixels: 8 VU and 8 Y.
// On entry xmm0 holds 8 interleaved V,U byte pairs and xmm4 must be zero (it
// zero-extends the Y bytes). Reads 8 Y bytes from y_buf and advances it by 8.
// The chroma terms use the swapped kVUToB/G/R tables (offsets 128/144/160)
// minus the kUVBiasB/G/R values (48/64/80); the luma term is
// (Y - kYSub16) * kYToRgb (96/112). The sums are shifted right by 6 and
// saturated to bytes.
// On exit the low 8 bytes of xmm0, xmm1 and xmm2 hold B, G and R.
// Clobbers xmm3.
#define YVUTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                  \n"                             \
    "movdqa     %%xmm0,%%xmm2                  \n"                             \
    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n"                             \
    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n"                             \
    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n"                             \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
    "movq       (%[y_buf]),%%xmm3              \n"                             \
    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
    "paddsw     %%xmm3,%%xmm0                  \n"                             \
    "paddsw     %%xmm3,%%xmm1                  \n"                             \
    "paddsw     %%xmm3,%%xmm2                  \n"                             \
    "psraw      $0x6,%%xmm0                    \n"                             \
    "psraw      $0x6,%%xmm1                    \n"                             \
    "psraw      $0x6,%%xmm2                    \n"                             \
    "packuswb   %%xmm0,%%xmm0                  \n"                             \
    "packuswb   %%xmm1,%%xmm1                  \n"                             \
    "packuswb   %%xmm2,%%xmm2                  \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001397
1398void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001399 const uint8* u_buf,
1400 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001401 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001402 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001403 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001404 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001405 "pcmpeqb %%xmm5,%%xmm5 \n"
1406 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001407 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001408 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001409 READYUV444
1410 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001411 "punpcklbw %%xmm1,%%xmm0 \n"
1412 "punpcklbw %%xmm5,%%xmm2 \n"
1413 "movdqa %%xmm0,%%xmm1 \n"
1414 "punpcklwd %%xmm2,%%xmm0 \n"
1415 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001416 "movdqa %%xmm0,(%[argb_buf]) \n"
1417 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1418 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1419 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001420 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001421 : [y_buf]"+r"(y_buf), // %[y_buf]
1422 [u_buf]"+r"(u_buf), // %[u_buf]
1423 [v_buf]"+r"(v_buf), // %[v_buf]
1424 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1425 [width]"+rm"(width) // %[width]
1426 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001427 : "memory", "cc"
1428#if defined(__SSE2__)
1429 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1430#endif
1431 );
1432}
1433
fbarchard@google.come214fe32012-06-04 23:47:11 +00001434void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001435 const uint8* u_buf,
1436 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001437 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001438 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001439 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001440 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001441 "pcmpeqb %%xmm5,%%xmm5 \n"
1442 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001443 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001444 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001445 READYUV422
1446 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001447 "punpcklbw %%xmm1,%%xmm0 \n"
1448 "punpcklbw %%xmm5,%%xmm2 \n"
1449 "movdqa %%xmm0,%%xmm1 \n"
1450 "punpcklwd %%xmm2,%%xmm0 \n"
1451 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001452 "movdqa %%xmm0,(%[argb_buf]) \n"
1453 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1454 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1455 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001456 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001457 : [y_buf]"+r"(y_buf), // %[y_buf]
1458 [u_buf]"+r"(u_buf), // %[u_buf]
1459 [v_buf]"+r"(v_buf), // %[v_buf]
1460 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1461 [width]"+rm"(width) // %[width]
1462 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001463 : "memory", "cc"
1464#if defined(__SSE2__)
1465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1466#endif
1467 );
1468}
1469
1470void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1471 const uint8* u_buf,
1472 const uint8* v_buf,
1473 uint8* argb_buf,
1474 int width) {
1475 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001476 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001477 "pcmpeqb %%xmm5,%%xmm5 \n"
1478 "pxor %%xmm4,%%xmm4 \n"
1479 ".p2align 4 \n"
1480 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001481 READYUV411
1482 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001483 "punpcklbw %%xmm1,%%xmm0 \n"
1484 "punpcklbw %%xmm5,%%xmm2 \n"
1485 "movdqa %%xmm0,%%xmm1 \n"
1486 "punpcklwd %%xmm2,%%xmm0 \n"
1487 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001488 "movdqa %%xmm0,(%[argb_buf]) \n"
1489 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1490 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1491 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001492 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001493 : [y_buf]"+r"(y_buf), // %[y_buf]
1494 [u_buf]"+r"(u_buf), // %[u_buf]
1495 [v_buf]"+r"(v_buf), // %[v_buf]
1496 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1497 [width]"+rm"(width) // %[width]
1498 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1499 : "memory", "cc"
1500#if defined(__SSE2__)
1501 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1502#endif
1503 );
1504}
1505
1506void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1507 const uint8* uv_buf,
1508 uint8* argb_buf,
1509 int width) {
1510 asm volatile (
1511 "pcmpeqb %%xmm5,%%xmm5 \n"
1512 "pxor %%xmm4,%%xmm4 \n"
1513 ".p2align 4 \n"
1514 "1: \n"
1515 READNV12
1516 YUVTORGB
1517 "punpcklbw %%xmm1,%%xmm0 \n"
1518 "punpcklbw %%xmm5,%%xmm2 \n"
1519 "movdqa %%xmm0,%%xmm1 \n"
1520 "punpcklwd %%xmm2,%%xmm0 \n"
1521 "punpckhwd %%xmm2,%%xmm1 \n"
1522 "movdqa %%xmm0,(%[argb_buf]) \n"
1523 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1524 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1525 "sub $0x8,%[width] \n"
1526 "jg 1b \n"
1527 : [y_buf]"+r"(y_buf), // %[y_buf]
1528 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1529 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1530 [width]"+rm"(width) // %[width]
1531 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1532 : "memory", "cc"
1533#if defined(__SSE2__)
1534 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1535#endif
1536 );
1537}
1538
1539void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1540 const uint8* vu_buf,
1541 uint8* argb_buf,
1542 int width) {
1543 asm volatile (
1544 "pcmpeqb %%xmm5,%%xmm5 \n"
1545 "pxor %%xmm4,%%xmm4 \n"
1546 ".p2align 4 \n"
1547 "1: \n"
1548 READNV12
1549 YVUTORGB
1550 "punpcklbw %%xmm1,%%xmm0 \n"
1551 "punpcklbw %%xmm5,%%xmm2 \n"
1552 "movdqa %%xmm0,%%xmm1 \n"
1553 "punpcklwd %%xmm2,%%xmm0 \n"
1554 "punpckhwd %%xmm2,%%xmm1 \n"
1555 "movdqa %%xmm0,(%[argb_buf]) \n"
1556 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1557 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1558 "sub $0x8,%[width] \n"
1559 "jg 1b \n"
1560 : [y_buf]"+r"(y_buf), // %[y_buf]
1561 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1562 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1563 [width]"+rm"(width) // %[width]
1564 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001565 : "memory", "cc"
1566#if defined(__SSE2__)
1567 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1568#endif
1569 );
1570}
1571
1572void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1573 const uint8* u_buf,
1574 const uint8* v_buf,
1575 uint8* argb_buf,
1576 int width) {
1577 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001578 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001579 "pcmpeqb %%xmm5,%%xmm5 \n"
1580 "pxor %%xmm4,%%xmm4 \n"
1581 ".p2align 4 \n"
1582 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001583 READYUV444
1584 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001585 "punpcklbw %%xmm1,%%xmm0 \n"
1586 "punpcklbw %%xmm5,%%xmm2 \n"
1587 "movdqa %%xmm0,%%xmm1 \n"
1588 "punpcklwd %%xmm2,%%xmm0 \n"
1589 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001590 "movdqu %%xmm0,(%[argb_buf]) \n"
1591 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1592 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1593 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001594 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001595 : [y_buf]"+r"(y_buf), // %[y_buf]
1596 [u_buf]"+r"(u_buf), // %[u_buf]
1597 [v_buf]"+r"(v_buf), // %[v_buf]
1598 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1599 [width]"+rm"(width) // %[width]
1600 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001601 : "memory", "cc"
1602#if defined(__SSE2__)
1603 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1604#endif
1605 );
1606}
1607
1608void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1609 const uint8* u_buf,
1610 const uint8* v_buf,
1611 uint8* argb_buf,
1612 int width) {
1613 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001614 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001615 "pcmpeqb %%xmm5,%%xmm5 \n"
1616 "pxor %%xmm4,%%xmm4 \n"
1617 ".p2align 4 \n"
1618 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001619 READYUV422
1620 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001621 "punpcklbw %%xmm1,%%xmm0 \n"
1622 "punpcklbw %%xmm5,%%xmm2 \n"
1623 "movdqa %%xmm0,%%xmm1 \n"
1624 "punpcklwd %%xmm2,%%xmm0 \n"
1625 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001626 "movdqu %%xmm0,(%[argb_buf]) \n"
1627 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1628 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1629 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001630 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001631 : [y_buf]"+r"(y_buf), // %[y_buf]
1632 [u_buf]"+r"(u_buf), // %[u_buf]
1633 [v_buf]"+r"(v_buf), // %[v_buf]
1634 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1635 [width]"+rm"(width) // %[width]
1636 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001637 : "memory", "cc"
1638#if defined(__SSE2__)
1639 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1640#endif
1641 );
1642}
1643
1644void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1645 const uint8* u_buf,
1646 const uint8* v_buf,
1647 uint8* argb_buf,
1648 int width) {
1649 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001650 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001651 "pcmpeqb %%xmm5,%%xmm5 \n"
1652 "pxor %%xmm4,%%xmm4 \n"
1653 ".p2align 4 \n"
1654 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001655 READYUV411
1656 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001657 "punpcklbw %%xmm1,%%xmm0 \n"
1658 "punpcklbw %%xmm5,%%xmm2 \n"
1659 "movdqa %%xmm0,%%xmm1 \n"
1660 "punpcklwd %%xmm2,%%xmm0 \n"
1661 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001662 "movdqu %%xmm0,(%[argb_buf]) \n"
1663 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1664 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1665 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001666 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001667 : [y_buf]"+r"(y_buf), // %[y_buf]
1668 [u_buf]"+r"(u_buf), // %[u_buf]
1669 [v_buf]"+r"(v_buf), // %[v_buf]
1670 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1671 [width]"+rm"(width) // %[width]
1672 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1673 : "memory", "cc"
1674#if defined(__SSE2__)
1675 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1676#endif
1677 );
1678}
1679
1680void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1681 const uint8* uv_buf,
1682 uint8* argb_buf,
1683 int width) {
1684 asm volatile (
1685 "pcmpeqb %%xmm5,%%xmm5 \n"
1686 "pxor %%xmm4,%%xmm4 \n"
1687 ".p2align 4 \n"
1688 "1: \n"
1689 READNV12
1690 YUVTORGB
1691 "punpcklbw %%xmm1,%%xmm0 \n"
1692 "punpcklbw %%xmm5,%%xmm2 \n"
1693 "movdqa %%xmm0,%%xmm1 \n"
1694 "punpcklwd %%xmm2,%%xmm0 \n"
1695 "punpckhwd %%xmm2,%%xmm1 \n"
1696 "movdqu %%xmm0,(%[argb_buf]) \n"
1697 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1698 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1699 "sub $0x8,%[width] \n"
1700 "jg 1b \n"
1701 : [y_buf]"+r"(y_buf), // %[y_buf]
1702 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1703 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1704 [width]"+rm"(width) // %[width]
1705 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1706 : "memory", "cc"
1707#if defined(__SSE2__)
1708 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1709#endif
1710 );
1711}
1712
1713void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1714 const uint8* vu_buf,
1715 uint8* argb_buf,
1716 int width) {
1717 asm volatile (
1718 "pcmpeqb %%xmm5,%%xmm5 \n"
1719 "pxor %%xmm4,%%xmm4 \n"
1720 ".p2align 4 \n"
1721 "1: \n"
1722 READNV12
1723 YVUTORGB
1724 "punpcklbw %%xmm1,%%xmm0 \n"
1725 "punpcklbw %%xmm5,%%xmm2 \n"
1726 "movdqa %%xmm0,%%xmm1 \n"
1727 "punpcklwd %%xmm2,%%xmm0 \n"
1728 "punpckhwd %%xmm2,%%xmm1 \n"
1729 "movdqu %%xmm0,(%[argb_buf]) \n"
1730 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1731 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1732 "sub $0x8,%[width] \n"
1733 "jg 1b \n"
1734 : [y_buf]"+r"(y_buf), // %[y_buf]
1735 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1736 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1737 [width]"+rm"(width) // %[width]
1738 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001739 : "memory", "cc"
1740#if defined(__SSE2__)
1741 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1742#endif
1743 );
1744}
1745
1746void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1747 const uint8* u_buf,
1748 const uint8* v_buf,
1749 uint8* bgra_buf,
1750 int width) {
1751 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001752 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001753 "pcmpeqb %%xmm5,%%xmm5 \n"
1754 "pxor %%xmm4,%%xmm4 \n"
1755 ".p2align 4 \n"
1756 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001757 READYUV422
1758 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001759 "pcmpeqb %%xmm5,%%xmm5 \n"
1760 "punpcklbw %%xmm0,%%xmm1 \n"
1761 "punpcklbw %%xmm2,%%xmm5 \n"
1762 "movdqa %%xmm5,%%xmm0 \n"
1763 "punpcklwd %%xmm1,%%xmm5 \n"
1764 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001765 "movdqa %%xmm5,(%[argb_buf]) \n"
1766 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1767 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1768 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001769 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001770 : [y_buf]"+r"(y_buf), // %[y_buf]
1771 [u_buf]"+r"(u_buf), // %[u_buf]
1772 [v_buf]"+r"(v_buf), // %[v_buf]
1773 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1774 [width]"+rm"(width) // %[width]
1775 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001776 : "memory", "cc"
1777#if defined(__SSE2__)
1778 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1779#endif
1780 );
1781}
1782
fbarchard@google.come214fe32012-06-04 23:47:11 +00001783void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001784 const uint8* u_buf,
1785 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001786 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001787 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001788 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001789 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001790 "pcmpeqb %%xmm5,%%xmm5 \n"
1791 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001792 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001793 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001794 READYUV422
1795 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001796 "punpcklbw %%xmm1,%%xmm2 \n"
1797 "punpcklbw %%xmm5,%%xmm0 \n"
1798 "movdqa %%xmm2,%%xmm1 \n"
1799 "punpcklwd %%xmm0,%%xmm2 \n"
1800 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001801 "movdqa %%xmm2,(%[argb_buf]) \n"
1802 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1803 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1804 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001805 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001806 : [y_buf]"+r"(y_buf), // %[y_buf]
1807 [u_buf]"+r"(u_buf), // %[u_buf]
1808 [v_buf]"+r"(v_buf), // %[v_buf]
1809 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1810 [width]"+rm"(width) // %[width]
1811 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001812 : "memory", "cc"
1813#if defined(__SSE2__)
1814 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1815#endif
1816 );
1817}
1818
fbarchard@google.come91bdac2012-10-09 21:09:33 +00001819void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1820 const uint8* u_buf,
1821 const uint8* v_buf,
1822 uint8* rgba_buf,
1823 int width) {
1824 asm volatile (
1825 "sub %[u_buf],%[v_buf] \n"
1826 "pcmpeqb %%xmm5,%%xmm5 \n"
1827 "pxor %%xmm4,%%xmm4 \n"
1828 ".p2align 4 \n"
1829 "1: \n"
1830 READYUV422
1831 YUVTORGB
1832 "pcmpeqb %%xmm5,%%xmm5 \n"
1833 "punpcklbw %%xmm2,%%xmm1 \n"
1834 "punpcklbw %%xmm0,%%xmm5 \n"
1835 "movdqa %%xmm5,%%xmm0 \n"
1836 "punpcklwd %%xmm1,%%xmm5 \n"
1837 "punpckhwd %%xmm1,%%xmm0 \n"
1838 "movdqa %%xmm5,(%[argb_buf]) \n"
1839 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1840 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1841 "sub $0x8,%[width] \n"
1842 "jg 1b \n"
1843 : [y_buf]"+r"(y_buf), // %[y_buf]
1844 [u_buf]"+r"(u_buf), // %[u_buf]
1845 [v_buf]"+r"(v_buf), // %[v_buf]
1846 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
1847 [width]"+rm"(width) // %[width]
1848 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1849 : "memory", "cc"
1850#if defined(__SSE2__)
1851 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1852#endif
1853 );
1854}
1855
fbarchard@google.come214fe32012-06-04 23:47:11 +00001856void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001857 const uint8* u_buf,
1858 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001859 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001860 int width) {
1861 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001862 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001863 "pcmpeqb %%xmm5,%%xmm5 \n"
1864 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001865 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001866 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001867 READYUV422
1868 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001869 "pcmpeqb %%xmm5,%%xmm5 \n"
1870 "punpcklbw %%xmm0,%%xmm1 \n"
1871 "punpcklbw %%xmm2,%%xmm5 \n"
1872 "movdqa %%xmm5,%%xmm0 \n"
1873 "punpcklwd %%xmm1,%%xmm5 \n"
1874 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001875 "movdqu %%xmm5,(%[argb_buf]) \n"
1876 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
1877 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1878 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001879 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001880 : [y_buf]"+r"(y_buf), // %[y_buf]
1881 [u_buf]"+r"(u_buf), // %[u_buf]
1882 [v_buf]"+r"(v_buf), // %[v_buf]
1883 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1884 [width]"+rm"(width) // %[width]
1885 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001886 : "memory", "cc"
1887#if defined(__SSE2__)
1888 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1889#endif
1890 );
1891}
1892
fbarchard@google.come214fe32012-06-04 23:47:11 +00001893void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001894 const uint8* u_buf,
1895 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001896 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001897 int width) {
1898 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001899 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001900 "pcmpeqb %%xmm5,%%xmm5 \n"
1901 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001902 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001903 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001904 READYUV422
1905 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001906 "punpcklbw %%xmm1,%%xmm2 \n"
1907 "punpcklbw %%xmm5,%%xmm0 \n"
1908 "movdqa %%xmm2,%%xmm1 \n"
1909 "punpcklwd %%xmm0,%%xmm2 \n"
1910 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001911 "movdqu %%xmm2,(%[argb_buf]) \n"
1912 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1913 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1914 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001915 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001916 : [y_buf]"+r"(y_buf), // %[y_buf]
1917 [u_buf]"+r"(u_buf), // %[u_buf]
1918 [v_buf]"+r"(v_buf), // %[v_buf]
1919 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1920 [width]"+rm"(width) // %[width]
1921 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00001922 : "memory", "cc"
1923#if defined(__SSE2__)
1924 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1925#endif
1926 );
1927}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00001928
1929void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
1930 const uint8* u_buf,
1931 const uint8* v_buf,
1932 uint8* rgba_buf,
1933 int width) {
1934 asm volatile (
1935 "sub %[u_buf],%[v_buf] \n"
1936 "pcmpeqb %%xmm5,%%xmm5 \n"
1937 "pxor %%xmm4,%%xmm4 \n"
1938 ".p2align 4 \n"
1939 "1: \n"
1940 READYUV422
1941 YUVTORGB
1942 "pcmpeqb %%xmm5,%%xmm5 \n"
1943 "punpcklbw %%xmm2,%%xmm1 \n"
1944 "punpcklbw %%xmm0,%%xmm5 \n"
1945 "movdqa %%xmm5,%%xmm0 \n"
1946 "punpcklwd %%xmm1,%%xmm5 \n"
1947 "punpckhwd %%xmm1,%%xmm0 \n"
1948 "movdqa %%xmm5,(%[argb_buf]) \n"
1949 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1950 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1951 "sub $0x8,%[width] \n"
1952 "jg 1b \n"
1953 : [y_buf]"+r"(y_buf), // %[y_buf]
1954 [u_buf]"+r"(u_buf), // %[u_buf]
1955 [v_buf]"+r"(v_buf), // %[v_buf]
1956 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
1957 [width]"+rm"(width) // %[width]
1958 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1959 : "memory", "cc"
1960#if defined(__SSE2__)
1961 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1962#endif
1963 );
1964}
1965
fbarchard@google.come214fe32012-06-04 23:47:11 +00001966#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001967
1968#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001969void YToARGBRow_SSE2(const uint8* y_buf,
1970 uint8* rgb_buf,
1971 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001972 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001973 "pcmpeqb %%xmm4,%%xmm4 \n"
1974 "pslld $0x18,%%xmm4 \n"
1975 "mov $0x10001000,%%eax \n"
1976 "movd %%eax,%%xmm3 \n"
1977 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1978 "mov $0x012a012a,%%eax \n"
1979 "movd %%eax,%%xmm2 \n"
1980 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001981 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001982 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001983 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001984 "movq (%0),%%xmm0 \n"
1985 "lea 0x8(%0),%0 \n"
1986 "punpcklbw %%xmm0,%%xmm0 \n"
1987 "psubusw %%xmm3,%%xmm0 \n"
1988 "pmulhuw %%xmm2,%%xmm0 \n"
1989 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001990
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001991 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001992 "punpcklbw %%xmm0,%%xmm0 \n"
1993 "movdqa %%xmm0,%%xmm1 \n"
1994 "punpcklwd %%xmm0,%%xmm0 \n"
1995 "punpckhwd %%xmm1,%%xmm1 \n"
1996 "por %%xmm4,%%xmm0 \n"
1997 "por %%xmm4,%%xmm1 \n"
1998 "movdqa %%xmm0,(%1) \n"
1999 "movdqa %%xmm1,16(%1) \n"
2000 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002001
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002002 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002003 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00002004 : "+r"(y_buf), // %0
2005 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00002006 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002007 :
2008 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002009#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002010 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002011#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002012 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002013}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002014#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002015
fbarchard@google.com42831e02012-01-21 02:54:17 +00002016#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002017// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002018CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002019 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2020};
2021
fbarchard@google.com42831e02012-01-21 02:54:17 +00002022void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002023 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002024 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002025 "movdqa %3,%%xmm5 \n"
2026 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002027 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002028 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002029 "movdqa (%0,%2),%%xmm0 \n"
2030 "pshufb %%xmm5,%%xmm0 \n"
2031 "sub $0x10,%2 \n"
2032 "movdqa %%xmm0,(%1) \n"
2033 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002034 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002035 : "+r"(src), // %0
2036 "+r"(dst), // %1
2037 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002038 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002039 : "memory", "cc"
2040#if defined(__SSE2__)
2041 , "xmm0", "xmm5"
2042#endif
2043 );
2044}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002045#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002046
fbarchard@google.com42831e02012-01-21 02:54:17 +00002047#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002048void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002049 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002050 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002051 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002052 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002053 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002054 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002055 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002056 "psllw $0x8,%%xmm0 \n"
2057 "psrlw $0x8,%%xmm1 \n"
2058 "por %%xmm1,%%xmm0 \n"
2059 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2060 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2061 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2062 "sub $0x10,%2 \n"
2063 "movdqu %%xmm0,(%1) \n"
2064 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002065 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002066 : "+r"(src), // %0
2067 "+r"(dst), // %1
2068 "+r"(temp_width) // %2
2069 :
2070 : "memory", "cc"
2071#if defined(__SSE2__)
2072 , "xmm0", "xmm1"
2073#endif
2074 );
2075}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002076#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002077
fbarchard@google.com16a96642012-03-02 22:38:09 +00002078#ifdef HAS_MIRRORROW_UV_SSSE3
2079// Shuffle table for reversing the bytes of UV channels.
2080CONST uvec8 kShuffleMirrorUV = {
2081 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2082};
2083void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2084 int width) {
2085 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002086 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002087 "movdqa %4,%%xmm1 \n"
2088 "lea -16(%0,%3,2),%0 \n"
2089 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002090 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002091 "1: \n"
2092 "movdqa (%0),%%xmm0 \n"
2093 "lea -16(%0),%0 \n"
2094 "pshufb %%xmm1,%%xmm0 \n"
2095 "sub $8,%3 \n"
2096 "movlpd %%xmm0,(%1) \n"
2097 "movhpd %%xmm0,(%1,%2) \n"
2098 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002099 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002100 : "+r"(src), // %0
2101 "+r"(dst_u), // %1
2102 "+r"(dst_v), // %2
2103 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002104 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002105 : "memory", "cc"
2106#if defined(__SSE2__)
2107 , "xmm0", "xmm1"
2108#endif
2109 );
2110}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002111#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002112
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002113#ifdef HAS_ARGBMIRRORROW_SSSE3
2114// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002115CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002116 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2117};
2118
2119void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2120 intptr_t temp_width = static_cast<intptr_t>(width);
2121 asm volatile (
2122 "movdqa %3,%%xmm5 \n"
2123 "lea -0x10(%0),%0 \n"
2124 ".p2align 4 \n"
2125 "1: \n"
2126 "movdqa (%0,%2,4),%%xmm0 \n"
2127 "pshufb %%xmm5,%%xmm0 \n"
2128 "sub $0x4,%2 \n"
2129 "movdqa %%xmm0,(%1) \n"
2130 "lea 0x10(%1),%1 \n"
2131 "jg 1b \n"
2132 : "+r"(src), // %0
2133 "+r"(dst), // %1
2134 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002135 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002136 : "memory", "cc"
2137#if defined(__SSE2__)
2138 , "xmm0", "xmm5"
2139#endif
2140 );
2141}
2142#endif // HAS_ARGBMIRRORROW_SSSE3
2143
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002144#ifdef HAS_SPLITUV_SSE2
2145void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002146 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002147 "pcmpeqb %%xmm5,%%xmm5 \n"
2148 "psrlw $0x8,%%xmm5 \n"
2149 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002150 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002151 "1: \n"
2152 "movdqa (%0),%%xmm0 \n"
2153 "movdqa 0x10(%0),%%xmm1 \n"
2154 "lea 0x20(%0),%0 \n"
2155 "movdqa %%xmm0,%%xmm2 \n"
2156 "movdqa %%xmm1,%%xmm3 \n"
2157 "pand %%xmm5,%%xmm0 \n"
2158 "pand %%xmm5,%%xmm1 \n"
2159 "packuswb %%xmm1,%%xmm0 \n"
2160 "psrlw $0x8,%%xmm2 \n"
2161 "psrlw $0x8,%%xmm3 \n"
2162 "packuswb %%xmm3,%%xmm2 \n"
2163 "movdqa %%xmm0,(%1) \n"
2164 "movdqa %%xmm2,(%1,%2) \n"
2165 "lea 0x10(%1),%1 \n"
2166 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002167 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002168 : "+r"(src_uv), // %0
2169 "+r"(dst_u), // %1
2170 "+r"(dst_v), // %2
2171 "+r"(pix) // %3
2172 :
2173 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002174#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002175 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002176#endif
2177 );
2178}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002179#endif // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002180
fbarchard@google.com19932f82012-02-16 22:19:14 +00002181#ifdef HAS_COPYROW_SSE2
2182void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002183 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002184 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002185 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002186 "1: \n"
2187 "movdqa (%0),%%xmm0 \n"
2188 "movdqa 0x10(%0),%%xmm1 \n"
2189 "movdqa %%xmm0,(%0,%1) \n"
2190 "movdqa %%xmm1,0x10(%0,%1) \n"
2191 "lea 0x20(%0),%0 \n"
2192 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002193 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002194 : "+r"(src), // %0
2195 "+r"(dst), // %1
2196 "+r"(count) // %2
2197 :
2198 : "memory", "cc"
2199#if defined(__SSE2__)
2200 , "xmm0", "xmm1"
2201#endif
2202 );
2203}
2204#endif // HAS_COPYROW_SSE2
2205
2206#ifdef HAS_COPYROW_X86
2207void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2208 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002209 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002210 "shr $0x2,%2 \n"
2211 "rep movsl \n"
2212 : "+S"(src), // %0
2213 "+D"(dst), // %1
2214 "+c"(width_tmp) // %2
2215 :
2216 : "memory", "cc"
2217 );
2218}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002219#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002220
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002221#ifdef HAS_SETROW_X86
2222void SetRow8_X86(uint8* dst, uint32 v32, int width) {
2223 size_t width_tmp = static_cast<size_t>(width);
2224 asm volatile (
2225 "shr $0x2,%1 \n"
2226 "rep stosl \n"
2227 : "+D"(dst), // %0
2228 "+c"(width_tmp) // %1
2229 : "a"(v32) // %2
2230 : "memory", "cc");
2231}
2232
2233void SetRows32_X86(uint8* dst, uint32 v32, int width,
2234 int dst_stride, int height) {
2235 for (int y = 0; y < height; ++y) {
2236 size_t width_tmp = static_cast<size_t>(width);
2237 uint32* d = reinterpret_cast<uint32*>(dst);
2238 asm volatile (
2239 "rep stosl \n"
2240 : "+D"(d), // %0
2241 "+c"(width_tmp) // %1
2242 : "a"(v32) // %2
2243 : "memory", "cc");
2244 dst += dst_stride;
2245 }
2246}
2247#endif // HAS_SETROW_X86
2248
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002249#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002250void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002251 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002252 "pcmpeqb %%xmm5,%%xmm5 \n"
2253 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002254 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002255 "1: \n"
2256 "movdqa (%0),%%xmm0 \n"
2257 "movdqa 0x10(%0),%%xmm1 \n"
2258 "lea 0x20(%0),%0 \n"
2259 "pand %%xmm5,%%xmm0 \n"
2260 "pand %%xmm5,%%xmm1 \n"
2261 "packuswb %%xmm1,%%xmm0 \n"
2262 "movdqa %%xmm0,(%1) \n"
2263 "lea 0x10(%1),%1 \n"
2264 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002265 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002266 : "+r"(src_yuy2), // %0
2267 "+r"(dst_y), // %1
2268 "+r"(pix) // %2
2269 :
2270 : "memory", "cc"
2271#if defined(__SSE2__)
2272 , "xmm0", "xmm1", "xmm5"
2273#endif
2274 );
2275}
2276
2277void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002278 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002279 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002280 "pcmpeqb %%xmm5,%%xmm5 \n"
2281 "psrlw $0x8,%%xmm5 \n"
2282 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002283 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002284 "1: \n"
2285 "movdqa (%0),%%xmm0 \n"
2286 "movdqa 0x10(%0),%%xmm1 \n"
2287 "movdqa (%0,%4,1),%%xmm2 \n"
2288 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2289 "lea 0x20(%0),%0 \n"
2290 "pavgb %%xmm2,%%xmm0 \n"
2291 "pavgb %%xmm3,%%xmm1 \n"
2292 "psrlw $0x8,%%xmm0 \n"
2293 "psrlw $0x8,%%xmm1 \n"
2294 "packuswb %%xmm1,%%xmm0 \n"
2295 "movdqa %%xmm0,%%xmm1 \n"
2296 "pand %%xmm5,%%xmm0 \n"
2297 "packuswb %%xmm0,%%xmm0 \n"
2298 "psrlw $0x8,%%xmm1 \n"
2299 "packuswb %%xmm1,%%xmm1 \n"
2300 "movq %%xmm0,(%1) \n"
2301 "movq %%xmm1,(%1,%2) \n"
2302 "lea 0x8(%1),%1 \n"
2303 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002304 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002305 : "+r"(src_yuy2), // %0
2306 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002307 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002308 "+r"(pix) // %3
2309 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2310 : "memory", "cc"
2311#if defined(__SSE2__)
2312 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2313#endif
2314 );
2315}
2316
fbarchard@google.comc704f782012-08-30 19:53:48 +00002317void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2318 uint8* dst_u, uint8* dst_v, int pix) {
2319 asm volatile (
2320 "pcmpeqb %%xmm5,%%xmm5 \n"
2321 "psrlw $0x8,%%xmm5 \n"
2322 "sub %1,%2 \n"
2323 ".p2align 4 \n"
2324 "1: \n"
2325 "movdqa (%0),%%xmm0 \n"
2326 "movdqa 0x10(%0),%%xmm1 \n"
2327 "lea 0x20(%0),%0 \n"
2328 "psrlw $0x8,%%xmm0 \n"
2329 "psrlw $0x8,%%xmm1 \n"
2330 "packuswb %%xmm1,%%xmm0 \n"
2331 "movdqa %%xmm0,%%xmm1 \n"
2332 "pand %%xmm5,%%xmm0 \n"
2333 "packuswb %%xmm0,%%xmm0 \n"
2334 "psrlw $0x8,%%xmm1 \n"
2335 "packuswb %%xmm1,%%xmm1 \n"
2336 "movq %%xmm0,(%1) \n"
2337 "movq %%xmm1,(%1,%2) \n"
2338 "lea 0x8(%1),%1 \n"
2339 "sub $0x10,%3 \n"
2340 "jg 1b \n"
2341 : "+r"(src_yuy2), // %0
2342 "+r"(dst_u), // %1
2343 "+r"(dst_v), // %2
2344 "+r"(pix) // %3
2345 :
2346 : "memory", "cc"
2347#if defined(__SSE2__)
2348 , "xmm0", "xmm1", "xmm5"
2349#endif
2350 );
2351}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002352
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002353void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2354 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002355 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002356 "pcmpeqb %%xmm5,%%xmm5 \n"
2357 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002358 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002359 "1: \n"
2360 "movdqu (%0),%%xmm0 \n"
2361 "movdqu 0x10(%0),%%xmm1 \n"
2362 "lea 0x20(%0),%0 \n"
2363 "pand %%xmm5,%%xmm0 \n"
2364 "pand %%xmm5,%%xmm1 \n"
2365 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002366 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002367 "movdqu %%xmm0,(%1) \n"
2368 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002369 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002370 : "+r"(src_yuy2), // %0
2371 "+r"(dst_y), // %1
2372 "+r"(pix) // %2
2373 :
2374 : "memory", "cc"
2375#if defined(__SSE2__)
2376 , "xmm0", "xmm1", "xmm5"
2377#endif
2378 );
2379}
2380
2381void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2382 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002383 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002384 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002385 "pcmpeqb %%xmm5,%%xmm5 \n"
2386 "psrlw $0x8,%%xmm5 \n"
2387 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002388 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002389 "1: \n"
2390 "movdqu (%0),%%xmm0 \n"
2391 "movdqu 0x10(%0),%%xmm1 \n"
2392 "movdqu (%0,%4,1),%%xmm2 \n"
2393 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2394 "lea 0x20(%0),%0 \n"
2395 "pavgb %%xmm2,%%xmm0 \n"
2396 "pavgb %%xmm3,%%xmm1 \n"
2397 "psrlw $0x8,%%xmm0 \n"
2398 "psrlw $0x8,%%xmm1 \n"
2399 "packuswb %%xmm1,%%xmm0 \n"
2400 "movdqa %%xmm0,%%xmm1 \n"
2401 "pand %%xmm5,%%xmm0 \n"
2402 "packuswb %%xmm0,%%xmm0 \n"
2403 "psrlw $0x8,%%xmm1 \n"
2404 "packuswb %%xmm1,%%xmm1 \n"
2405 "movq %%xmm0,(%1) \n"
2406 "movq %%xmm1,(%1,%2) \n"
2407 "lea 0x8(%1),%1 \n"
2408 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002409 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002410 : "+r"(src_yuy2), // %0
2411 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002412 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002413 "+r"(pix) // %3
2414 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2415 : "memory", "cc"
2416#if defined(__SSE2__)
2417 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2418#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002419 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002420}
2421
fbarchard@google.comc704f782012-08-30 19:53:48 +00002422void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2423 uint8* dst_u, uint8* dst_v, int pix) {
2424 asm volatile (
2425 "pcmpeqb %%xmm5,%%xmm5 \n"
2426 "psrlw $0x8,%%xmm5 \n"
2427 "sub %1,%2 \n"
2428 ".p2align 4 \n"
2429 "1: \n"
2430 "movdqu (%0),%%xmm0 \n"
2431 "movdqu 0x10(%0),%%xmm1 \n"
2432 "lea 0x20(%0),%0 \n"
2433 "psrlw $0x8,%%xmm0 \n"
2434 "psrlw $0x8,%%xmm1 \n"
2435 "packuswb %%xmm1,%%xmm0 \n"
2436 "movdqa %%xmm0,%%xmm1 \n"
2437 "pand %%xmm5,%%xmm0 \n"
2438 "packuswb %%xmm0,%%xmm0 \n"
2439 "psrlw $0x8,%%xmm1 \n"
2440 "packuswb %%xmm1,%%xmm1 \n"
2441 "movq %%xmm0,(%1) \n"
2442 "movq %%xmm1,(%1,%2) \n"
2443 "lea 0x8(%1),%1 \n"
2444 "sub $0x10,%3 \n"
2445 "jg 1b \n"
2446 : "+r"(src_yuy2), // %0
2447 "+r"(dst_u), // %1
2448 "+r"(dst_v), // %2
2449 "+r"(pix) // %3
2450 :
2451 : "memory", "cc"
2452#if defined(__SSE2__)
2453 , "xmm0", "xmm1", "xmm5"
2454#endif
2455 );
2456}
2457
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002458void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002459 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002460 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002461 "1: \n"
2462 "movdqa (%0),%%xmm0 \n"
2463 "movdqa 0x10(%0),%%xmm1 \n"
2464 "lea 0x20(%0),%0 \n"
2465 "psrlw $0x8,%%xmm0 \n"
2466 "psrlw $0x8,%%xmm1 \n"
2467 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002468 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002469 "movdqa %%xmm0,(%1) \n"
2470 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002471 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002472 : "+r"(src_uyvy), // %0
2473 "+r"(dst_y), // %1
2474 "+r"(pix) // %2
2475 :
2476 : "memory", "cc"
2477#if defined(__SSE2__)
2478 , "xmm0", "xmm1"
2479#endif
2480 );
2481}
2482
2483void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002484 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002485 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002486 "pcmpeqb %%xmm5,%%xmm5 \n"
2487 "psrlw $0x8,%%xmm5 \n"
2488 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002489 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002490 "1: \n"
2491 "movdqa (%0),%%xmm0 \n"
2492 "movdqa 0x10(%0),%%xmm1 \n"
2493 "movdqa (%0,%4,1),%%xmm2 \n"
2494 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2495 "lea 0x20(%0),%0 \n"
2496 "pavgb %%xmm2,%%xmm0 \n"
2497 "pavgb %%xmm3,%%xmm1 \n"
2498 "pand %%xmm5,%%xmm0 \n"
2499 "pand %%xmm5,%%xmm1 \n"
2500 "packuswb %%xmm1,%%xmm0 \n"
2501 "movdqa %%xmm0,%%xmm1 \n"
2502 "pand %%xmm5,%%xmm0 \n"
2503 "packuswb %%xmm0,%%xmm0 \n"
2504 "psrlw $0x8,%%xmm1 \n"
2505 "packuswb %%xmm1,%%xmm1 \n"
2506 "movq %%xmm0,(%1) \n"
2507 "movq %%xmm1,(%1,%2) \n"
2508 "lea 0x8(%1),%1 \n"
2509 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002510 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002511 : "+r"(src_uyvy), // %0
2512 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002513 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002514 "+r"(pix) // %3
2515 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2516 : "memory", "cc"
2517#if defined(__SSE2__)
2518 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2519#endif
2520 );
2521}
2522
fbarchard@google.comc704f782012-08-30 19:53:48 +00002523void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2524 uint8* dst_u, uint8* dst_v, int pix) {
2525 asm volatile (
2526 "pcmpeqb %%xmm5,%%xmm5 \n"
2527 "psrlw $0x8,%%xmm5 \n"
2528 "sub %1,%2 \n"
2529 ".p2align 4 \n"
2530 "1: \n"
2531 "movdqa (%0),%%xmm0 \n"
2532 "movdqa 0x10(%0),%%xmm1 \n"
2533 "lea 0x20(%0),%0 \n"
2534 "pand %%xmm5,%%xmm0 \n"
2535 "pand %%xmm5,%%xmm1 \n"
2536 "packuswb %%xmm1,%%xmm0 \n"
2537 "movdqa %%xmm0,%%xmm1 \n"
2538 "pand %%xmm5,%%xmm0 \n"
2539 "packuswb %%xmm0,%%xmm0 \n"
2540 "psrlw $0x8,%%xmm1 \n"
2541 "packuswb %%xmm1,%%xmm1 \n"
2542 "movq %%xmm0,(%1) \n"
2543 "movq %%xmm1,(%1,%2) \n"
2544 "lea 0x8(%1),%1 \n"
2545 "sub $0x10,%3 \n"
2546 "jg 1b \n"
2547 : "+r"(src_uyvy), // %0
2548 "+r"(dst_u), // %1
2549 "+r"(dst_v), // %2
2550 "+r"(pix) // %3
2551 :
2552 : "memory", "cc"
2553#if defined(__SSE2__)
2554 , "xmm0", "xmm1", "xmm5"
2555#endif
2556 );
2557}
2558
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002559void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2560 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002561 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002562 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002563 "1: \n"
2564 "movdqu (%0),%%xmm0 \n"
2565 "movdqu 0x10(%0),%%xmm1 \n"
2566 "lea 0x20(%0),%0 \n"
2567 "psrlw $0x8,%%xmm0 \n"
2568 "psrlw $0x8,%%xmm1 \n"
2569 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002570 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002571 "movdqu %%xmm0,(%1) \n"
2572 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002573 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002574 : "+r"(src_uyvy), // %0
2575 "+r"(dst_y), // %1
2576 "+r"(pix) // %2
2577 :
2578 : "memory", "cc"
2579#if defined(__SSE2__)
2580 , "xmm0", "xmm1"
2581#endif
2582 );
2583}
2584
2585void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002586 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002587 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002588 "pcmpeqb %%xmm5,%%xmm5 \n"
2589 "psrlw $0x8,%%xmm5 \n"
2590 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002591 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002592 "1: \n"
2593 "movdqu (%0),%%xmm0 \n"
2594 "movdqu 0x10(%0),%%xmm1 \n"
2595 "movdqu (%0,%4,1),%%xmm2 \n"
2596 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2597 "lea 0x20(%0),%0 \n"
2598 "pavgb %%xmm2,%%xmm0 \n"
2599 "pavgb %%xmm3,%%xmm1 \n"
2600 "pand %%xmm5,%%xmm0 \n"
2601 "pand %%xmm5,%%xmm1 \n"
2602 "packuswb %%xmm1,%%xmm0 \n"
2603 "movdqa %%xmm0,%%xmm1 \n"
2604 "pand %%xmm5,%%xmm0 \n"
2605 "packuswb %%xmm0,%%xmm0 \n"
2606 "psrlw $0x8,%%xmm1 \n"
2607 "packuswb %%xmm1,%%xmm1 \n"
2608 "movq %%xmm0,(%1) \n"
2609 "movq %%xmm1,(%1,%2) \n"
2610 "lea 0x8(%1),%1 \n"
2611 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002612 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002613 : "+r"(src_uyvy), // %0
2614 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002615 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002616 "+r"(pix) // %3
2617 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2618 : "memory", "cc"
2619#if defined(__SSE2__)
2620 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2621#endif
2622 );
2623}
fbarchard@google.comc704f782012-08-30 19:53:48 +00002624
2625void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
2626 uint8* dst_u, uint8* dst_v, int pix) {
2627 asm volatile (
2628 "pcmpeqb %%xmm5,%%xmm5 \n"
2629 "psrlw $0x8,%%xmm5 \n"
2630 "sub %1,%2 \n"
2631 ".p2align 4 \n"
2632 "1: \n"
2633 "movdqu (%0),%%xmm0 \n"
2634 "movdqu 0x10(%0),%%xmm1 \n"
2635 "lea 0x20(%0),%0 \n"
2636 "pand %%xmm5,%%xmm0 \n"
2637 "pand %%xmm5,%%xmm1 \n"
2638 "packuswb %%xmm1,%%xmm0 \n"
2639 "movdqa %%xmm0,%%xmm1 \n"
2640 "pand %%xmm5,%%xmm0 \n"
2641 "packuswb %%xmm0,%%xmm0 \n"
2642 "psrlw $0x8,%%xmm1 \n"
2643 "packuswb %%xmm1,%%xmm1 \n"
2644 "movq %%xmm0,(%1) \n"
2645 "movq %%xmm1,(%1,%2) \n"
2646 "lea 0x8(%1),%1 \n"
2647 "sub $0x10,%3 \n"
2648 "jg 1b \n"
2649 : "+r"(src_uyvy), // %0
2650 "+r"(dst_u), // %1
2651 "+r"(dst_v), // %2
2652 "+r"(pix) // %3
2653 :
2654 : "memory", "cc"
2655#if defined(__SSE2__)
2656 , "xmm0", "xmm1", "xmm5"
2657#endif
2658 );
2659}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002660#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002661
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend src_argb0 over src_argb1, 4 pixels at a time, with 1 pixel loops for
// destination alignment and for the remainder.
//
// Per pixel, with a = alpha byte of the src_argb0 pixel:
//   color = saturate(src0_color + ((src1_color * (256 - a)) >> 8))
//   alpha = 255
//
// Flow:
//   - width <= 0 writes nothing; width == 1 goes straight to the tail loop.
//   - Loop 10 handles one pixel at a time until dst_argb is 16-byte aligned
//     (or the row is exhausted).
//   - Loop 41 handles 4 pixels per iteration with unaligned loads from both
//     sources and an aligned store.
//   - Loop 91 handles the remaining 0-3 pixels.
// %3 holds (remaining pixels - 1) on entry to the 1 pixel loops and
// (remaining pixels - 4) on entry to the 4 pixel loop, so each loop can
// continue with jge.
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    // xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
    // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha mask).
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "sub $0x1,%3 \n"
    "je 91f \n"
    "jl 99f \n"

    // 1 pixel loop until destination pointer is aligned.
  "10: \n"
    "test $0xf,%2 \n"
    "je 19f \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    // Invert alpha (255 - a) and broadcast it to both words of the pixel.
    "pxor %%xmm4,%%xmm3 \n"
    "movd (%1),%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    // Even bytes of src1 times (256 - a).
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    // Odd bytes of src1 times (256 - a).
    "movd (%1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    // Force the result alpha to 255.
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 10b \n"

  "19: \n"
    // Convert the counter from (remaining - 1) to (remaining - 4).
    "add $1-4,%3 \n"
    "jl 49f \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "41: \n"
    "movdqu (%0),%%xmm3 \n"
    "lea 0x10(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu (%1),%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqu (%1),%%xmm1 \n"
    "lea 0x10(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jge 41b \n"

  "49: \n"
    // Convert the counter back from (remaining - 4) to (remaining - 1).
    "add $0x3,%3 \n"
    "jl 99f \n"

    // 1 pixel loop.
  "91: \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd (%1),%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd (%1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 91b \n"
  "99: \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002783
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha: copies the alpha byte of each pixel
// (bytes 3, 7, 11, 15) into the low byte of both 16-bit words of that pixel
// and zeroes the high bytes (0x80 selects zero in pshufb).
CONST uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blend src_argb0 over src_argb1, 4 pixels at a time, with 1 pixel loops for
// destination alignment and for the remainder.
//
// Same as SSE2, but replaces
//    psrlw      xmm3, 8          // alpha
//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
//    pshuflw    xmm3, xmm3,0F5h
// with..
//    pshufb     xmm3, kShuffleAlpha // alpha
//
// Per pixel, with a = alpha byte of the src_argb0 pixel:
//   color = saturate(src0_color + ((src1_color * (256 - a)) >> 8))
//   alpha = 255
//
// After dst_argb has been aligned, the 4 pixel loop at label 40 (aligned
// loads) is used when both source pointers are 16-byte aligned; otherwise
// the loop at label 41 (unaligned loads) is used.  Both store with movdqa.
// %3 holds (remaining pixels - 1) on entry to the 1 pixel loops and
// (remaining pixels - 4) on entry to the 4 pixel loops.
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    // xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
    // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha mask).
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "sub $0x1,%3 \n"
    "je 91f \n"
    "jl 99f \n"

    // 1 pixel loop until destination pointer is aligned.
  "10: \n"
    "test $0xf,%2 \n"
    "je 19f \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    // Invert alpha (255 - a) and broadcast it to both words of the pixel.
    "pxor %%xmm4,%%xmm3 \n"
    "movd (%1),%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    // Even bytes of src1 times (256 - a).
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    // Odd bytes of src1 times (256 - a).
    "movd (%1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    // Force the result alpha to 255.
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 10b \n"

  "19: \n"
    // Convert the counter from (remaining - 1) to (remaining - 4).
    "add $1-4,%3 \n"
    "jl 49f \n"
    // Use the unaligned loop unless both sources are 16-byte aligned.
    "test $0xf,%0 \n"
    "jne 41f \n"
    "test $0xf,%1 \n"
    "jne 41f \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "40: \n"
    "movdqa (%0),%%xmm3 \n"
    "lea 0x10(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqa (%1),%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqa (%1),%%xmm1 \n"
    "lea 0x10(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jge 40b \n"
    "jmp 49f \n"

    // 4 pixel unaligned loop.
    ".p2align 2 \n"
  "41: \n"
    "movdqu (%0),%%xmm3 \n"
    "lea 0x10(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu (%1),%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqu (%1),%%xmm1 \n"
    "lea 0x10(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jge 41b \n"

  "49: \n"
    // Convert the counter back from (remaining - 4) to (remaining - 1).
    "add $0x3,%3 \n"
    "jl 99f \n"

    // 1 pixel loop.
  "91: \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd (%1),%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd (%1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 91b \n"
  "99: \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW_SSSE3
2945
#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuate 4 pixels at a time: multiply the color channels of each pixel
// by that pixel's alpha and keep the alpha byte unchanged.
// src_argb and dst_argb must be aligned to 16 bytes (movdqa loads/stores).
//
// Per color byte c with pixel alpha a, the result is
//   (((c * 0x101) * (a * 0x101)) >> 16) >> 8
// The loop body runs before width is tested, so one full iteration (4
// pixels) is always executed.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // %1 becomes dst_argb - src_argb so that (%0,%1,1) addresses the
    // destination while only %0 has to be advanced.
    "sub %0,%1 \n"
    // xmm4 = 0xff000000 per dword (alpha), xmm5 = 0x00ffffff (color).
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrld $0x8,%%xmm5 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // Low 2 pixels: widen bytes to words (c * 0x101), broadcast the alpha
    // word of each pixel, and take the high half of the product.
    "movdqa (%0),%%xmm0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "pshufhw $0xff,%%xmm0,%%xmm2 \n"
    "pshuflw $0xff,%%xmm2,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    // High 2 pixels, same treatment.
    "movdqa (%0),%%xmm1 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "pshufhw $0xff,%%xmm1,%%xmm2 \n"
    "pshuflw $0xff,%%xmm2,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    // xmm2 = original alpha bytes only.
    "movdqa (%0),%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "pand %%xmm4,%%xmm2 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // Drop the scaled alpha and put the original alpha back.
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    // NOTE: xmm3 is listed although the code above never touches it.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATE_SSE2
2992
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle tables duplicating alpha.  For each of two pixels, the alpha byte
// is copied into both bytes of the three color words and the alpha word is
// zeroed (128u selects zero in pshufb).
// kShuffleAlpha0 covers pixels 0 and 1 (alpha at bytes 3 and 7);
// kShuffleAlpha1 covers pixels 2 and 3 (alpha at bytes 11 and 15).
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuate 4 pixels at a time: multiply the color channels of each pixel
// by that pixel's alpha and keep the alpha byte unchanged.
// src_argb and dst_argb must be aligned to 16 bytes (movdqa loads/stores).
//
// Per color byte c with pixel alpha a, the result is
//   (((a * 0x101) * (c * 0x101)) >> 16) >> 8
// The loop body runs before width is tested, so one full iteration (4
// pixels) is always executed.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // %1 becomes dst_argb - src_argb so that (%0,%1,1) addresses the
    // destination while only %0 has to be advanced.
    "sub %0,%1 \n"
    // xmm3 = 0xff000000 per dword (alpha mask).
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "pslld $0x18,%%xmm3 \n"
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // Low 2 pixels: xmm0 = alpha words (alpha lane zeroed), xmm1 = c * 0x101.
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa (%0),%%xmm1 \n"
    "punpcklbw %%xmm1,%%xmm1 \n"
    "pmulhuw %%xmm1,%%xmm0 \n"
    // High 2 pixels: xmm1 = alpha words, xmm2 = c * 0x101.
    "movdqa (%0),%%xmm1 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqa (%0),%%xmm2 \n"
    "punpckhbw %%xmm2,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    // xmm2 = original alpha bytes only.
    "movdqa (%0),%%xmm2 \n"
    "pand %%xmm3,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // The alpha lanes of xmm0 are zero here, so OR restores the alpha.
    "por %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003047
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// src_argb and dst_argb must be aligned to 16 bytes (movdqa loads/stores).
//
// For every pixel the alpha byte is used as an index into fixed_invtbl8
// (declared elsewhere; indexed here with 4-byte entries).  The low 16 bits
// of the entry are used as a multiplier m for the three color channels:
//   color = saturate_to_byte(((c * 0x101) * m) >> 16)
// The original alpha byte is kept.
// NOTE(review): presumably the table holds fixed-point reciprocals of alpha;
// confirm against the definition of fixed_invtbl8.
// The loop body runs before width is tested, so one full iteration (4
// pixels) is always executed.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  // Scratch register that receives the zero-extended alpha byte.
  uintptr_t alpha = 0;
  asm volatile (
    // %1 becomes dst_argb - src_argb so that (%0,%1,1) addresses the
    // destination while only %0 has to be advanced.
    "sub %0,%1 \n"
    // xmm4 = 0xff000000 per dword (alpha mask).
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // Pixels 0 and 1: xmm0 = c * 0x101 per channel.
    "movdqa (%0),%%xmm0 \n"
    "movzb 0x3(%0),%3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movd 0x0(%4,%3,4),%%xmm2 \n"
    "movzb 0x7(%0),%3 \n"
    "movd 0x0(%4,%3,4),%%xmm3 \n"
    // Replicate the low word of each table entry into the 3 color words;
    // the alpha word receives word 3, which movd has zeroed.
    "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
    "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    // Pixels 2 and 3, same treatment.
    "movdqa (%0),%%xmm1 \n"
    "movzb 0xb(%0),%3 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "movd 0x0(%4,%3,4),%%xmm2 \n"
    "movzb 0xf(%0),%3 \n"
    "movd 0x0(%4,%3,4),%%xmm3 \n"
    "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
    "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    // xmm2 = original alpha bytes only; OR them back after packing.
    "movdqa (%0),%%xmm2 \n"
    "pand %%xmm4,%%xmm2 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    // NOTE: xmm5 is listed although the code above never touches it.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003102
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale.  0.11 * B + 0.59 * G + 0.30 * R
// The weights are scaled by 128 (the sum is shifted right by 7 below).
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// B, G and R of each output pixel are set to the gray value; alpha is
// copied from the source.  src_argb and dst_argb must be aligned to 16 bytes
// (movdqa loads/stores).  The loop body runs before width is tested, so one
// full iteration (8 pixels) is always executed.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    // %1 becomes dst_argb - src_argb so that (%0,%1,1) addresses the
    // destination while only %0 has to be advanced.
    "sub %0,%1 \n"

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // xmm0 = 8 gray bytes: weighted sum of B, G, R shifted right by 7.
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    // xmm2 = 8 alpha bytes.
    "movdqa (%0),%%xmm2 \n"
    "movdqa 0x10(%0),%%xmm3 \n"
    "psrld $0x18,%%xmm2 \n"
    "psrld $0x18,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    // Interleave into (gray, gray) and (gray, alpha) byte pairs, then into
    // 4-byte pixels.
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm3 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm1 \n"
    "sub $0x8,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "movdqa %%xmm1,0x10(%0,%1,1) \n"
    "lea 0x20(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToGray)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003153
#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone.  Each table holds the weights for
// one output channel in B, G, R, A byte order.
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place on dst_argb, which must be aligned to 16 bytes (movdqa
// loads/stores).  Alpha is preserved; the color results are saturated to
// 255 by packuswb.  The loop body runs before width is tested, so one full
// iteration (8 pixels) is always executed.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %2,%%xmm2 \n"
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // xmm0 = 8 new B bytes.
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm6 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm6 \n"
    "phaddw %%xmm6,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    // xmm5 = 8 new G bytes; xmm0 becomes interleaved (B, G) pairs.
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm5 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    // xmm5 = 8 new R bytes.
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm5 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    // xmm6 = 8 original alpha bytes; xmm5 becomes (R, A) pairs.
    "movdqa (%0),%%xmm6 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "psrld $0x18,%%xmm6 \n"
    "psrld $0x18,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm5 \n"
    // Interleave (B, G) with (R, A) into 4-byte pixels.
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "punpckhwd %%xmm5,%%xmm1 \n"
    "sub $0x8,%1 \n"
    "movdqa %%xmm0,(%0) \n"
    "movdqa %%xmm1,0x10(%0) \n"
    "lea 0x20(%0),%0 \n"
    "jg 1b \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
3230
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
//
// matrix_argb is read as three groups of 4 signed bytes at offsets 0, 4 and
// 8; each group holds the weights (in B, G, R, A byte order of the source
// pixel) for one output channel: group 0 -> B, group 1 -> G, group 2 -> R.
// Each output channel is the saturating weighted sum shifted right
// arithmetically by 7 and clamped to 0..255 by packuswb.  Alpha is copied
// from the source pixel.
// Works in place on dst_argb, which must be aligned to 16 bytes (movdqa
// loads/stores).  The loop body runs before width is tested, so one full
// iteration (8 pixels) is always executed.
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  asm volatile (
    // Broadcast each 4-byte weight group to all four dwords of a register.
    "movd (%2),%%xmm2 \n"
    "movd 0x4(%2),%%xmm3 \n"
    "movd 0x8(%2),%%xmm4 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // xmm0 = 8 new B bytes, xmm5 = 8 new G bytes.
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm6 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm6 \n"
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm5 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddsw %%xmm6,%%xmm0 \n"
    "phaddsw %%xmm1,%%xmm5 \n"
    "psraw $0x7,%%xmm0 \n"
    "psraw $0x7,%%xmm5 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    // xmm0 becomes interleaved (B, G) pairs.
    "punpcklbw %%xmm5,%%xmm0 \n"
    // xmm5 = 8 new R bytes.
    "movdqa (%0),%%xmm5 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm5 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddsw %%xmm1,%%xmm5 \n"
    "psraw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    // xmm6 = 8 original alpha bytes; xmm5 becomes (R, A) pairs.
    "movdqa (%0),%%xmm6 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "psrld $0x18,%%xmm6 \n"
    "psrld $0x18,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm6,%%xmm5 \n"
    // Interleave (B, G) with (R, A) into 4-byte pixels.
    "punpcklwd %%xmm5,%%xmm0 \n"
    "punpckhwd %%xmm5,%%xmm1 \n"
    "sub $0x8,%1 \n"
    "movdqa %%xmm0,(%0) \n"
    "movdqa %%xmm1,0x10(%0) \n"
    "lea 0x20(%0),%0 \n"
    "jg 1b \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "r"(matrix_argb)     // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
3294
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003295#ifdef HAS_ARGBQUANTIZEROW_SSE2
3296// Quantize 4 ARGB pixels (16 bytes).
3297// aligned to 16 bytes
// Quantizes a row of 4-byte pixels in place. For bytes 0..2 of each pixel the
// loop computes
//   ((v * scale) >> 16) * interval_size + interval_offset
// in 16-bit lanes and clamps the result to 0..255 when packing.
// Byte 3 is preserved: its lane is multiplied/offset by the upper 16 bits of
// each parameter (zero for parameter values below 65536) and the original
// top byte is OR-ed back in.
//   dst_argb - read and overwritten with movdqa (16-byte aligned).
//   width    - pixel count; 4 pixels per pass, counter tested after each pass,
//              so at least 4 pixels are always processed (presumably a
//              positive multiple of 4 - TODO confirm).
3298void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3299 int interval_offset, int width) {
3300 asm volatile (
3301 "movd %2,%%xmm2 \n"
3302 "movd %3,%%xmm3 \n"
3303 "movd %4,%%xmm4 \n"
    // pshuflw $0x40 -> words (lo, lo, lo, hi) of the 32-bit value; pshufd $0x44
    // then copies that 4-word pattern into the upper half as well.
3304 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3305 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3306 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3307 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3308 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3309 "pshufd $0x44,%%xmm4,%%xmm4 \n"
    // xmm5 = 0 (for byte->word unpacking); xmm6 = 0xFF000000 per pixel (mask
    // for the top byte).
3310 "pxor %%xmm5,%%xmm5 \n"
3311 "pcmpeqb %%xmm6,%%xmm6 \n"
3312 "pslld $0x18,%%xmm6 \n"
3313
3314 // 4 pixel loop.
3315 ".p2align 2 \n"
3316 "1: \n"
    // Low two pixels: widen to words and take the high half of v * scale.
3317 "movdqa (%0),%%xmm0 \n"
3318 "punpcklbw %%xmm5,%%xmm0 \n"
3319 "pmulhuw %%xmm2,%%xmm0 \n"
    // High two pixels, same treatment.
3320 "movdqa (%0),%%xmm1 \n"
3321 "punpckhbw %%xmm5,%%xmm1 \n"
3322 "pmulhuw %%xmm2,%%xmm1 \n"
3323 "pmullw %%xmm3,%%xmm0 \n"
    // xmm7 keeps the original top byte of each pixel.
3324 "movdqa (%0),%%xmm7 \n"
3325 "pmullw %%xmm3,%%xmm1 \n"
3326 "pand %%xmm6,%%xmm7 \n"
3327 "paddw %%xmm4,%%xmm0 \n"
3328 "paddw %%xmm4,%%xmm1 \n"
3329 "packuswb %%xmm1,%%xmm0 \n"
3330 "por %%xmm7,%%xmm0 \n"
    // Flags from this sub drive the jg below (movdqa/lea leave flags alone).
3331 "sub $0x4,%1 \n"
3332 "movdqa %%xmm0,(%0) \n"
3333 "lea 0x10(%0),%0 \n"
3334 "jg 1b \n"
3335 : "+r"(dst_argb), // %0
3336 "+r"(width) // %1
3337 : "r"(scale), // %2
3338 "r"(interval_size), // %3
3339 "r"(interval_offset) // %4
3340 : "memory", "cc"
3341#if defined(__SSE2__)
3342 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3343#endif
3344 );
3345}
3346#endif // HAS_ARGBQUANTIZEROW_SSE2
3347
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003348#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
3349// Creates a table of cumulative sums where each value is a sum of all values
3350// above and to the left of the value, inclusive of the value.
// For each 4-byte pixel of `row`, widens the 4 bytes to 4 int32 values, adds
// them to a running per-channel sum for this row, adds the 4 int32 values at
// the same position in `previous_cumsum`, and stores the result to `cumsum`.
//   row             - source bytes, 4 per pixel (read unaligned).
//   cumsum          - output, 4 int32 (16 bytes) per pixel.
//   previous_cumsum - cumulative sums of the row above, same layout.
//   width           - number of pixels.
// The 4-pixel loop is used only when width >= 4 and cumsum is 16-byte aligned;
// otherwise, and for the remaining 0..3 pixels, the 1-pixel loop runs.
// NOTE(review): the 4-pixel loop also reads previous_cumsum with movdqa, but
// only cumsum's alignment is tested - confirm callers keep both aligned.
3351void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
fbarchard@google.com133adc42012-06-12 05:46:49 +00003352 const int32* previous_cumsum, int width) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003353 asm volatile (
    // %2 becomes (previous_cumsum - cumsum) so that (%1,%2,1) addresses the
    // previous row at the current output position.
3354 "sub %1,%2 \n"
    // xmm0 = running sum for this row, xmm1 = zero for unpacking.
3355 "pxor %%xmm0,%%xmm0 \n"
3356 "pxor %%xmm1,%%xmm1 \n"
3357 "sub $0x4,%3 \n"
3358 "jl 49f \n"
    // Fall back to the unaligned 1-pixel loop if cumsum is not 16-byte aligned.
3359 "test $0xf,%1 \n"
3360 "jne 49f \n"
3361
3362 // 4 pixel loop.
3363 ".p2align 2 \n"
3364 "40: \n"
3365 "movdqu (%0),%%xmm2 \n"
3366 "lea 0x10(%0),%0 \n"
    // Widen 16 bytes into four registers of 4 x int32 (one register per pixel):
    // xmm2, xmm3, xmm4, xmm5.
3367 "movdqa %%xmm2,%%xmm4 \n"
3368 "punpcklbw %%xmm1,%%xmm2 \n"
3369 "movdqa %%xmm2,%%xmm3 \n"
3370 "punpcklwd %%xmm1,%%xmm2 \n"
3371 "punpckhwd %%xmm1,%%xmm3 \n"
3372 "punpckhbw %%xmm1,%%xmm4 \n"
3373 "movdqa %%xmm4,%%xmm5 \n"
3374 "punpcklwd %%xmm1,%%xmm4 \n"
3375 "punpckhwd %%xmm1,%%xmm5 \n"
    // For each pixel: running sum += pixel; output = previous row + running sum.
3376 "paddd %%xmm2,%%xmm0 \n"
3377 "movdqa (%1,%2,1),%%xmm2 \n"
3378 "paddd %%xmm0,%%xmm2 \n"
3379 "paddd %%xmm3,%%xmm0 \n"
3380 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
3381 "paddd %%xmm0,%%xmm3 \n"
3382 "paddd %%xmm4,%%xmm0 \n"
3383 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
3384 "paddd %%xmm0,%%xmm4 \n"
3385 "paddd %%xmm5,%%xmm0 \n"
3386 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
3387 "paddd %%xmm0,%%xmm5 \n"
3388 "movdqa %%xmm2,(%1) \n"
3389 "movdqa %%xmm3,0x10(%1) \n"
3390 "movdqa %%xmm4,0x20(%1) \n"
3391 "movdqa %%xmm5,0x30(%1) \n"
3392 "lea 0x40(%1),%1 \n"
3393 "sub $0x4,%3 \n"
3394 "jge 40b \n"
3395
3396 "49: \n"
    // Undo the 4-pixel bias: %3 is now (remaining pixels - 1).
3397 "add $0x3,%3 \n"
3398 "jl 19f \n"
3399
3400 // 1 pixel loop.
3401 ".p2align 2 \n"
3402 "10: \n"
3403 "movd (%0),%%xmm2 \n"
3404 "lea 0x4(%0),%0 \n"
fbarchard@google.comf38aefe2012-06-12 00:29:29 +00003405 "punpcklbw %%xmm1,%%xmm2 \n"
3406 "punpcklwd %%xmm1,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003407 "paddd %%xmm2,%%xmm0 \n"
    // Unaligned load/store here, so this path works for any cumsum alignment.
3408 "movdqu (%1,%2,1),%%xmm2 \n"
3409 "paddd %%xmm0,%%xmm2 \n"
3410 "movdqu %%xmm2,(%1) \n"
3411 "lea 0x10(%1),%1 \n"
3412 "sub $0x1,%3 \n"
3413 "jge 10b \n"
3414
3415 "19: \n"
3416 : "+r"(row), // %0
3417 "+r"(cumsum), // %1
3418 "+r"(previous_cumsum), // %2
3419 "+r"(width) // %3
3420 :
3421 : "memory", "cc"
3422#if defined(__SSE2__)
3423 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3424#endif
3425 );
3426}
3427#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
3428
3429#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts cumulative sums into averaged output bytes. For each output pixel
// (4 int32 sums in, 4 bytes out) the code computes
//   topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// multiplies by an approximate reciprocal of `area` (rcpss), converts back to
// integers and packs with saturation to bytes.
//   topleft, botleft - int32 cumulative-sum rows, read with movdqa and as
//                      memory operands of psubd/paddd, so the addresses used
//                      are expected to be 16-byte aligned.
//   width            - offset, in int32 elements, to the right-hand samples
//                      (the addressing mode scales it by 4 bytes).
//   area             - divisor applied to every sum.
//   dst              - output bytes, 4 per pixel, written unaligned.
//   count            - number of output pixels.
3430void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3431 int width, int area, uint8* dst, int count) {
3432 asm volatile (
    // xmm4 = approximate 1.0f / area, broadcast to all four lanes.
3433 "movd %5,%%xmm4 \n"
3434 "cvtdq2ps %%xmm4,%%xmm4 \n"
3435 "rcpss %%xmm4,%%xmm4 \n"
3436 "pshufd $0x0,%%xmm4,%%xmm4 \n"
3437 "sub $0x4,%3 \n"
3438 "jl 49f \n"
3439
3440 // 4 pixel loop.
3441 ".p2align 2 \n"
3442 "40: \n"
    // Box sum for 4 pixels: top-left minus top-right...
3443 "movdqa (%0),%%xmm0 \n"
3444 "movdqa 0x10(%0),%%xmm1 \n"
3445 "movdqa 0x20(%0),%%xmm2 \n"
3446 "movdqa 0x30(%0),%%xmm3 \n"
3447 "psubd (%0,%4,4),%%xmm0 \n"
3448 "psubd 0x10(%0,%4,4),%%xmm1 \n"
3449 "psubd 0x20(%0,%4,4),%%xmm2 \n"
3450 "psubd 0x30(%0,%4,4),%%xmm3 \n"
3451 "lea 0x40(%0),%0 \n"
    // ...minus bottom-left plus bottom-right.
3452 "psubd (%1),%%xmm0 \n"
3453 "psubd 0x10(%1),%%xmm1 \n"
3454 "psubd 0x20(%1),%%xmm2 \n"
3455 "psubd 0x30(%1),%%xmm3 \n"
3456 "paddd (%1,%4,4),%%xmm0 \n"
3457 "paddd 0x10(%1,%4,4),%%xmm1 \n"
3458 "paddd 0x20(%1,%4,4),%%xmm2 \n"
3459 "paddd 0x30(%1,%4,4),%%xmm3 \n"
3460 "lea 0x40(%1),%1 \n"
    // Scale by 1/area in float, then convert back to int32.
3461 "cvtdq2ps %%xmm0,%%xmm0 \n"
3462 "cvtdq2ps %%xmm1,%%xmm1 \n"
3463 "mulps %%xmm4,%%xmm0 \n"
3464 "mulps %%xmm4,%%xmm1 \n"
3465 "cvtdq2ps %%xmm2,%%xmm2 \n"
3466 "cvtdq2ps %%xmm3,%%xmm3 \n"
3467 "mulps %%xmm4,%%xmm2 \n"
3468 "mulps %%xmm4,%%xmm3 \n"
3469 "cvtps2dq %%xmm0,%%xmm0 \n"
3470 "cvtps2dq %%xmm1,%%xmm1 \n"
3471 "cvtps2dq %%xmm2,%%xmm2 \n"
3472 "cvtps2dq %%xmm3,%%xmm3 \n"
    // Narrow 16 x int32 to 16 bytes with saturation and store 4 pixels.
3473 "packssdw %%xmm1,%%xmm0 \n"
3474 "packssdw %%xmm3,%%xmm2 \n"
3475 "packuswb %%xmm2,%%xmm0 \n"
3476 "movdqu %%xmm0,(%2) \n"
3477 "lea 0x10(%2),%2 \n"
3478 "sub $0x4,%3 \n"
3479 "jge 40b \n"
3480
3481 "49: \n"
    // Undo the 4-pixel bias: %3 is now (remaining pixels - 1).
3482 "add $0x3,%3 \n"
3483 "jl 19f \n"
3484
3485 // 1 pixel loop.
3486 ".p2align 2 \n"
3487 "10: \n"
3488 "movdqa (%0),%%xmm0 \n"
3489 "psubd (%0,%4,4),%%xmm0 \n"
3490 "lea 0x10(%0),%0 \n"
3491 "psubd (%1),%%xmm0 \n"
3492 "paddd (%1,%4,4),%%xmm0 \n"
3493 "lea 0x10(%1),%1 \n"
3494 "cvtdq2ps %%xmm0,%%xmm0 \n"
3495 "mulps %%xmm4,%%xmm0 \n"
3496 "cvtps2dq %%xmm0,%%xmm0 \n"
3497 "packssdw %%xmm0,%%xmm0 \n"
3498 "packuswb %%xmm0,%%xmm0 \n"
    // Only the low 4 bytes (one pixel) are written.
3499 "movd %%xmm0,(%2) \n"
3500 "lea 0x4(%2),%2 \n"
3501 "sub $0x1,%3 \n"
3502 "jge 10b \n"
3503 "19: \n"
3504 : "+r"(topleft), // %0
3505 "+r"(botleft), // %1
3506 "+r"(dst), // %2
3507 "+rm"(count) // %3
3508 : "r"(static_cast<intptr_t>(width)), // %4
3509 "rm"(area) // %5
3510 : "memory", "cc"
3511#if defined(__SSE2__)
3512 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3513#endif
3514 );
3515}
3516#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
fbarchard@google.com6398e1d2012-07-11 19:12:32 +00003517#ifdef HAS_ARGBSHADE_SSE2
3518// Shade 4 pixels at a time by specified value.
3519// Aligned to 16 bytes.
// Multiplies every byte of each 4-byte source pixel by the corresponding byte
// of `value` and writes the scaled pixel to dst_argb.
// Both the pixel byte v and the value byte s are replicated into 16-bit words
// (v * 0x101 and s * 0x101); the result byte is
//   ((v * 0x101) * (s * 0x101)) >> 24.
//   src_argb, dst_argb - accessed with movdqa (16-byte aligned).
//   width              - pixel count; 4 pixels per pass, counter tested after
//                        each pass, so at least 4 pixels are always processed
//                        (presumably a positive multiple of 4 - TODO confirm).
//   value              - 4 packed byte multipliers, one per pixel byte.
3520void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3521 uint32 value) {
3522 asm volatile (
3523 "movd %3,%%xmm2 \n"
    // %1 becomes (dst - src) so the store can be addressed as (%0,%1,1) while
    // only %0 is advanced.
3524 "sub %0,%1 \n"
    // Replicate each byte of value into a word, then copy to the upper half.
3525 "punpcklbw %%xmm2,%%xmm2 \n"
3526 "punpcklqdq %%xmm2,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003527
fbarchard@google.com6398e1d2012-07-11 19:12:32 +00003528 // 4 pixel loop.
3529 ".p2align 2 \n"
3530 "1: \n"
3531 "movdqa (%0),%%xmm0 \n"
3532 "movdqa %%xmm0,%%xmm1 \n"
    // Unpacking a register with itself replicates each byte into a word.
3533 "punpcklbw %%xmm0,%%xmm0 \n"
3534 "punpckhbw %%xmm1,%%xmm1 \n"
    // High 16 bits of the product, then drop a further 8 bits.
3535 "pmulhuw %%xmm2,%%xmm0 \n"
3536 "pmulhuw %%xmm2,%%xmm1 \n"
3537 "psrlw $0x8,%%xmm0 \n"
3538 "psrlw $0x8,%%xmm1 \n"
3539 "packuswb %%xmm1,%%xmm0 \n"
    // Flags from this sub drive the jg below (movdqa/lea leave flags alone).
3540 "sub $0x4,%2 \n"
3541 "movdqa %%xmm0,(%0,%1,1) \n"
3542 "lea 0x10(%0),%0 \n"
3543 "jg 1b \n"
3544 : "+r"(src_argb), // %0
3545 "+r"(dst_argb), // %1
3546 "+r"(width) // %2
3547 : "r"(value) // %3
3548 : "memory", "cc"
3549#if defined(__SSE2__)
3550 , "xmm0", "xmm1", "xmm2"
3551#endif
3552 );
3553}
3554#endif // HAS_ARGBSHADE_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003555
fbarchard@google.com73444402012-08-09 17:33:29 +00003556#ifdef HAS_ARGBAFFINEROW_SSE2
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003557// TODO(fbarchard): Find 64 bit way to avoid masking.
3558// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
fbarchard@google.com73444402012-08-09 17:33:29 +00003559// Copy ARGB pixels from source image with slope to a row of destination.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00003560// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003561// an error if movq is used. movd %%xmm0,%1
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00003562
// Samples 4-byte pixels from a source image along a line and writes them
// consecutively to dst_argb.
//   src_argb        - base of the source image.
//   src_argb_stride - bytes per source row.
//   dst_argb        - output row (written with movq/movd, no alignment needed).
//   uv_dudv         - 4 floats: starting (u, v) followed by per-pixel step
//                     (du, dv).
//   width           - number of pixels to produce.
// Coordinates are truncated to integers and saturated to int16 by packssdw;
// the byte offset of each sample is x * 4 + y * src_argb_stride, computed by
// pmaddwd against the packed word pair (4, stride).
fbarchard@google.comfc7314e2012-09-27 02:17:51 +00003563LIBYUV_API
fbarchard@google.com73444402012-08-09 17:33:29 +00003564void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
3565 uint8* dst_argb, const float* uv_dudv, int width) {
3566 intptr_t src_argb_stride_temp = src_argb_stride;
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003567 intptr_t temp = 0;
fbarchard@google.com73444402012-08-09 17:33:29 +00003568 asm volatile (
    // xmm2 = (u, v), xmm7 = (du, dv).
3569 "movq (%3),%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003570 "movq 0x8(%3),%%xmm7 \n"
    // %1 = (stride << 16) + 4, i.e. the word pair (4, stride) used by pmaddwd.
fbarchard@google.com73444402012-08-09 17:33:29 +00003571 "shl $0x10,%1 \n"
3572 "add $0x4,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003573 "movd %1,%%xmm5 \n"
3574 "sub $0x4,%4 \n"
3575 "jl 49f \n"
3576
    // Set up for 4 pixels per pass:
    //   xmm2 = coordinates of pixels 0 and 1, xmm3 = pixels 2 and 3,
    //   xmm4 = 4 * (du, dv) step, xmm5/xmm7 broadcast to all lanes.
3577 "pshufd $0x44,%%xmm7,%%xmm7 \n"
3578 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003579 "movdqa %%xmm2,%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003580 "addps %%xmm7,%%xmm0 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003581 "movlhps %%xmm0,%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003582 "movdqa %%xmm7,%%xmm4 \n"
3583 "addps %%xmm4,%%xmm4 \n"
3584 "movdqa %%xmm2,%%xmm3 \n"
3585 "addps %%xmm4,%%xmm3 \n"
3586 "addps %%xmm4,%%xmm4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003587
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003588 // 4 pixel loop.
3589 ".p2align 4 \n"
3590 "40: \n"
    // Truncate to integer coordinates and turn them into 4 byte offsets.
3591 "cvttps2dq %%xmm2,%%xmm0 \n"
3592 "cvttps2dq %%xmm3,%%xmm1 \n"
3593 "packssdw %%xmm1,%%xmm0 \n"
3594 "pmaddwd %%xmm5,%%xmm0 \n"
3595#if defined(__x86_64__)
    // 64-bit: one move fetches two 32-bit offsets. %1 keeps the first (masked
    // to its low 28 bits), %5 gets the second from the upper half. pshufd $0xEE
    // then brings the offsets of pixels 2 and 3 into the low half.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00003596 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003597 "mov %1,%5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003598 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003599 "shr $32,%5 \n"
3600 "pshufd $0xEE,%%xmm0,%%xmm0 \n"
3601#else
    // 32-bit: fetch one offset at a time, rotating the next one into lane 0.
3602 "movd %%xmm0,%1 \n"
3603 "pshufd $0x39,%%xmm0,%%xmm0 \n"
3604 "movd %%xmm0,%5 \n"
3605 "pshufd $0x39,%%xmm0,%%xmm0 \n"
3606#endif
    // Gather pixels 0 and 1 and store them as 8 bytes.
3607 "movd (%0,%1,1),%%xmm1 \n"
3608 "movd (%0,%5,1),%%xmm6 \n"
3609 "punpckldq %%xmm6,%%xmm1 \n"
3610 "addps %%xmm4,%%xmm2 \n"
3611 "movq %%xmm1,(%2) \n"
3612#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00003613 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003614 "mov %1,%5 \n"
3615 "and $0x0fffffff,%1 \n"
3616 "shr $32,%5 \n"
3617#else
3618 "movd %%xmm0,%1 \n"
3619 "pshufd $0x39,%%xmm0,%%xmm0 \n"
3620 "movd %%xmm0,%5 \n"
3621#endif
    // Gather pixels 2 and 3 and store them as the next 8 bytes.
fbarchard@google.com73444402012-08-09 17:33:29 +00003622 "movd (%0,%1,1),%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003623 "movd (%0,%5,1),%%xmm6 \n"
3624 "punpckldq %%xmm6,%%xmm0 \n"
3625 "addps %%xmm4,%%xmm3 \n"
    // Flags from this sub drive the jge below (movq/lea leave flags alone).
3626 "sub $0x4,%4 \n"
3627 "movq %%xmm0,0x08(%2) \n"
3628 "lea 0x10(%2),%2 \n"
3629 "jge 40b \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003630
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003631 "49: \n"
    // Undo the 4-pixel bias: %4 is now (remaining pixels - 1).
3632 "add $0x3,%4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003633 "jl 19f \n"
3634
3635 // 1 pixel loop.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003636 ".p2align 4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003637 "10: \n"
    // Only the low lanes of xmm2/xmm5/xmm7 are used here, so this loop is valid
    // whether or not the 4-pixel setup above was executed.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003638 "cvttps2dq %%xmm2,%%xmm0 \n"
3639 "packssdw %%xmm0,%%xmm0 \n"
3640 "pmaddwd %%xmm5,%%xmm0 \n"
3641 "addps %%xmm7,%%xmm2 \n"
3642 "movd %%xmm0,%1 \n"
3643#if defined(__x86_64__)
fbarchard@google.com73444402012-08-09 17:33:29 +00003644 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003645#endif
fbarchard@google.com73444402012-08-09 17:33:29 +00003646 "movd (%0,%1,1),%%xmm0 \n"
3647 "sub $0x1,%4 \n"
3648 "movd %%xmm0,(%2) \n"
3649 "lea 0x4(%2),%2 \n"
3650 "jge 10b \n"
3651 "19: \n"
3652 : "+r"(src_argb), // %0
3653 "+r"(src_argb_stride_temp), // %1
3654 "+r"(dst_argb), // %2
3655 "+r"(uv_dudv), // %3
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003656 "+rm"(width), // %4
3657 "+r"(temp) // %5
fbarchard@google.com73444402012-08-09 17:33:29 +00003658 :
3659 : "memory", "cc"
3660#if defined(__SSE2__)
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003661 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
fbarchard@google.com73444402012-08-09 17:33:29 +00003662#endif
3663 );
3664}
3665#endif // HAS_ARGBAFFINEROW_SSE2
3666
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00003667// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr, 16 bytes (4 pixels) per pass.
// source_y_fraction is halved to f on entry, then one of three loops runs:
//   f == 0    : plain copy of the first row.
//   f == 0x40 : rounded average of the two rows (pavgb).
//   otherwise : (row0 * (128 - f) + row1 * f) >> 7 per byte.
// All loads and stores use movdqa, so src_ptr, src_ptr + src_stride and
// dst_ptr must be 16-byte aligned. The counter is tested after each pass, so
// at least 4 pixels are always processed (presumably dst_width is a positive
// multiple of 4 - TODO confirm).
3668void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
3669 ptrdiff_t src_stride, int dst_width,
3670 int source_y_fraction) {
3671 asm volatile (
    // %0 becomes (dst - src) so stores can use (%1,%0,1) while only %1 advances.
3672 "sub %1,%0 \n"
3673 "shr %3 \n"
3674 "cmp $0x0,%3 \n"
3675 "je 2f \n"
3676 "cmp $0x40,%3 \n"
3677 "je 3f \n"
    // Build xmm5 = repeated byte pair (128 - f, f) for pmaddubsw.
3678 "movd %3,%%xmm0 \n"
3679 "neg %3 \n"
3680 "add $0x80,%3 \n"
3681 "movd %3,%%xmm5 \n"
3682 "punpcklbw %%xmm0,%%xmm5 \n"
3683 "punpcklwd %%xmm5,%%xmm5 \n"
3684 "pshufd $0x0,%%xmm5,%%xmm5 \n"
    // General blend loop.
3685 ".p2align 4 \n"
3686 "1: \n"
3687 "movdqa (%1),%%xmm0 \n"
3688 "movdqa (%1,%4,1),%%xmm2 \n"
3689 "movdqa %%xmm0,%%xmm1 \n"
    // Interleave (row0, row1) bytes so each word lane holds one byte pair.
3690 "punpcklbw %%xmm2,%%xmm0 \n"
3691 "punpckhbw %%xmm2,%%xmm1 \n"
3692 "pmaddubsw %%xmm5,%%xmm0 \n"
3693 "pmaddubsw %%xmm5,%%xmm1 \n"
3694 "psrlw $0x7,%%xmm0 \n"
3695 "psrlw $0x7,%%xmm1 \n"
3696 "packuswb %%xmm1,%%xmm0 \n"
3697 "sub $0x4,%2 \n"
3698 "movdqa %%xmm0,(%1,%0,1) \n"
3699 "lea 0x10(%1),%1 \n"
3700 "jg 1b \n"
3701 "jmp 4f \n"
    // Copy loop (fraction is zero after halving).
3702 ".p2align 4 \n"
3703 "2: \n"
3704 "movdqa (%1),%%xmm0 \n"
3705 "sub $0x4,%2 \n"
3706 "movdqa %%xmm0,(%1,%0,1) \n"
3707 "lea 0x10(%1),%1 \n"
3708 "jg 2b \n"
3709 "jmp 4f \n"
    // Average loop (halved fraction equals 0x40).
3710 ".p2align 4 \n"
3711 "3: \n"
3712 "movdqa (%1),%%xmm0 \n"
3713 "pavgb (%1,%4,1),%%xmm0 \n"
3714 "sub $0x4,%2 \n"
3715 "movdqa %%xmm0,(%1,%0,1) \n"
3716 "lea 0x10(%1),%1 \n"
3717 "jg 3b \n"
3718 "4: \n"
3719 ".p2align 4 \n"
3720 : "+r"(dst_ptr), // %0
3721 "+r"(src_ptr), // %1
3722 "+r"(dst_width), // %2
3723 "+r"(source_y_fraction) // %3
3724 : "r"(static_cast<intptr_t>(src_stride)) // %4
3725 : "memory", "cc"
3726#if defined(__SSE2__)
3727 , "xmm0", "xmm1", "xmm2", "xmm5"
3728#endif
3729 );
3730}
3731
// Writes to dst_uv the rounded byte-wise average (pavgb) of the row at src_uv
// and the row at src_uv + src_uv_stride.
//   src_uv, dst_uv - accessed with movdqa; pavgb also takes the second row as a
//                    memory operand, so all three addresses are expected to be
//                    16-byte aligned.
//   src_uv_stride  - byte offset from the first row to the second.
//   pix            - number of bytes to produce; 16 per pass, counter tested
//                    after each pass, so at least 16 bytes are always
//                    processed (presumably a positive multiple of 16 - TODO
//                    confirm).
fbarchard@google.come91bdac2012-10-09 21:09:33 +00003732void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
3733 uint8* dst_uv, int pix) {
3734 asm volatile (
    // %1 becomes (dst - src) so the store can use (%0,%1) while only %0 advances.
3735 "sub %0,%1 \n"
3736 ".p2align 4 \n"
3737 "1: \n"
3738 "movdqa (%0),%%xmm0 \n"
3739 "pavgb (%0,%3),%%xmm0 \n"
    // Flags from this sub drive the jg below (movdqa/lea leave flags alone).
3740 "sub $0x10,%2 \n"
3741 "movdqa %%xmm0,(%0,%1) \n"
3742 "lea 0x10(%0),%0 \n"
3743 "jg 1b \n"
3744 : "+r"(src_uv), // %0
3745 "+r"(dst_uv), // %1
3746 "+r"(pix) // %2
3747 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
3748 : "memory", "cc"
3749#if defined(__SSE2__)
3750 , "xmm0"
3751#endif
3752 );
3753}
fbarchard@google.com2d11d432012-02-16 02:50:39 +00003754#endif // defined(__x86_64__) || defined(__i386__)
3755
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003756#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003757} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003758} // namespace libyuv
3759#endif