blob: fa0c07ec697a69f5b1d18596ee4e341e126e2afb [file] [log] [blame]
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// GCC 4.2 on OSX has link error when passing static or const to inline.
// CONST is the qualifier put in front of every constant table in this file:
// it expands to nothing on Apple builds and to "static const" elsewhere.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
fbarchard@google.com4de0c432012-10-11 01:25:46 +000072// Constants for RGBA.
73CONST vec8 kRGBAToY = {
74 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
75};
76
77CONST vec8 kRGBAToU = {
78 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
79};
80
81CONST vec8 kRGBAToV = {
82 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
83};
84
fbarchard@google.com714050a2012-02-17 22:59:56 +000085CONST uvec8 kAddY16 = {
86 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000087};
fbarchard@google.com2430e042011-11-11 21:57:06 +000088
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000089CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
92};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000093
fbarchard@google.comba1f5262012-01-12 19:22:41 +000094// Shuffle table for converting RGB24 to ARGB.
95CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000096 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
97};
98
99// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000100CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000101 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
102};
103
fbarchard@google.comb6149762011-11-07 21:58:52 +0000104// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000105CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000106 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
107};
108
109// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000110CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
112};
113
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000114// Shuffle table for converting RGBA to ARGB.
115CONST uvec8 kShuffleMaskRGBAToARGB = {
116 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
117};
118
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000119// Shuffle table for converting ARGB to RGBA.
120CONST uvec8 kShuffleMaskARGBToRGBA = {
121 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
122};
123
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000124// Shuffle table for converting ARGB to RGB24.
125CONST uvec8 kShuffleMaskARGBToRGB24 = {
126 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
127};
128
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000129// Shuffle table for converting ARGB to RAW.
130CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000131 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000132};
133
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000134// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000135CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
137};
138
139// Shuffle table for converting ARGB to RAW.
140CONST uvec8 kShuffleMaskARGBToRAW_0 = {
141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142};
143
fbarchard@google.comb6149762011-11-07 21:58:52 +0000144void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000145 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pcmpeqb %%xmm5,%%xmm5 \n"
147 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000148 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "1: \n"
150 "movq (%0),%%xmm0 \n"
151 "lea 0x8(%0),%0 \n"
152 "punpcklbw %%xmm0,%%xmm0 \n"
153 "movdqa %%xmm0,%%xmm1 \n"
154 "punpcklwd %%xmm0,%%xmm0 \n"
155 "punpckhwd %%xmm1,%%xmm1 \n"
156 "por %%xmm5,%%xmm0 \n"
157 "por %%xmm5,%%xmm1 \n"
158 "movdqa %%xmm0,(%1) \n"
159 "movdqa %%xmm1,0x10(%1) \n"
160 "lea 0x20(%1),%1 \n"
161 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000162 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163 : "+r"(src_y), // %0
164 "+r"(dst_argb), // %1
165 "+r"(pix) // %2
166 :
167 : "memory", "cc"
168#if defined(__SSE2__)
169 , "xmm0", "xmm1", "xmm5"
170#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000171 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000172}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173
fbarchard@google.com00b69a22012-11-02 06:03:28 +0000174void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
175 int pix) {
176 asm volatile (
177 "pcmpeqb %%xmm5,%%xmm5 \n"
178 "pslld $0x18,%%xmm5 \n"
179 ".p2align 4 \n"
180 "1: \n"
181 "movq (%0),%%xmm0 \n"
182 "lea 0x8(%0),%0 \n"
183 "punpcklbw %%xmm0,%%xmm0 \n"
184 "movdqa %%xmm0,%%xmm1 \n"
185 "punpcklwd %%xmm0,%%xmm0 \n"
186 "punpckhwd %%xmm1,%%xmm1 \n"
187 "por %%xmm5,%%xmm0 \n"
188 "por %%xmm5,%%xmm1 \n"
189 "movdqu %%xmm0,(%1) \n"
190 "movdqu %%xmm1,0x10(%1) \n"
191 "lea 0x20(%1),%1 \n"
192 "sub $0x8,%2 \n"
193 "jg 1b \n"
194 : "+r"(src_y), // %0
195 "+r"(dst_argb), // %1
196 "+r"(pix) // %2
197 :
198 : "memory", "cc"
199#if defined(__SSE2__)
200 , "xmm0", "xmm1", "xmm5"
201#endif
202 );
203}
204
fbarchard@google.comb6149762011-11-07 21:58:52 +0000205void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000206 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000207 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000208 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000209 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000210 "1: \n"
211 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000212 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000213 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000214 "movdqa %%xmm0,(%0,%1,1) \n"
215 "lea 0x10(%0),%0 \n"
216 "jg 1b \n"
217
fbarchard@google.comb6149762011-11-07 21:58:52 +0000218 : "+r"(src_abgr), // %0
219 "+r"(dst_argb), // %1
220 "+r"(pix) // %2
221 : "m"(kShuffleMaskABGRToARGB) // %3
222 : "memory", "cc"
223#if defined(__SSE2__)
224 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000225#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000226 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000227}
228
229void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000230 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000231 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000232 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000233 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000234 "1: \n"
235 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000236 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000237 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000238 "movdqa %%xmm0,(%0,%1,1) \n"
239 "lea 0x10(%0),%0 \n"
240 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000241 : "+r"(src_bgra), // %0
242 "+r"(dst_argb), // %1
243 "+r"(pix) // %2
244 : "m"(kShuffleMaskBGRAToARGB) // %3
245 : "memory", "cc"
246#if defined(__SSE2__)
247 , "xmm0", "xmm5"
248#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000249 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000250}
251
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000252void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
253 asm volatile (
254 "movdqa %3,%%xmm5 \n"
255 "sub %0,%1 \n"
256 ".p2align 4 \n"
257 "1: \n"
258 "movdqa (%0),%%xmm0 \n"
259 "pshufb %%xmm5,%%xmm0 \n"
260 "sub $0x4,%2 \n"
261 "movdqa %%xmm0,(%0,%1,1) \n"
262 "lea 0x10(%0),%0 \n"
263 "jg 1b \n"
264
265 : "+r"(src_rgba), // %0
266 "+r"(dst_argb), // %1
267 "+r"(pix) // %2
268 : "m"(kShuffleMaskRGBAToARGB) // %3
269 : "memory", "cc"
270#if defined(__SSE2__)
271 , "xmm0", "xmm5"
272#endif
273 );
274}
275
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000276void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
277 asm volatile (
278 "movdqa %3,%%xmm5 \n"
279 "sub %0,%1 \n"
280 ".p2align 4 \n"
281 "1: \n"
282 "movdqa (%0),%%xmm0 \n"
283 "pshufb %%xmm5,%%xmm0 \n"
284 "sub $0x4,%2 \n"
285 "movdqa %%xmm0,(%0,%1,1) \n"
286 "lea 0x10(%0),%0 \n"
287 "jg 1b \n"
288
289 : "+r"(src_argb), // %0
290 "+r"(dst_rgba), // %1
291 "+r"(pix) // %2
292 : "m"(kShuffleMaskARGBToRGBA) // %3
293 : "memory", "cc"
294#if defined(__SSE2__)
295 , "xmm0", "xmm5"
296#endif
297 );
298}
299
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000300void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000301 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000302 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
303 "pslld $0x18,%%xmm5 \n"
304 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000305 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000306 "1: \n"
307 "movdqu (%0),%%xmm0 \n"
308 "movdqu 0x10(%0),%%xmm1 \n"
309 "movdqu 0x20(%0),%%xmm3 \n"
310 "lea 0x30(%0),%0 \n"
311 "movdqa %%xmm3,%%xmm2 \n"
312 "palignr $0x8,%%xmm1,%%xmm2 \n"
313 "pshufb %%xmm4,%%xmm2 \n"
314 "por %%xmm5,%%xmm2 \n"
315 "palignr $0xc,%%xmm0,%%xmm1 \n"
316 "pshufb %%xmm4,%%xmm0 \n"
317 "movdqa %%xmm2,0x20(%1) \n"
318 "por %%xmm5,%%xmm0 \n"
319 "pshufb %%xmm4,%%xmm1 \n"
320 "movdqa %%xmm0,(%1) \n"
321 "por %%xmm5,%%xmm1 \n"
322 "palignr $0x4,%%xmm3,%%xmm3 \n"
323 "pshufb %%xmm4,%%xmm3 \n"
324 "movdqa %%xmm1,0x10(%1) \n"
325 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000326 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000327 "movdqa %%xmm3,0x30(%1) \n"
328 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000329 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000330 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000331 "+r"(dst_argb), // %1
332 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000333 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000334 : "memory", "cc"
335#if defined(__SSE2__)
336 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
337#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000338 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000339}
340
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000341void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000342 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000343 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
344 "pslld $0x18,%%xmm5 \n"
345 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000346 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000347 "1: \n"
348 "movdqu (%0),%%xmm0 \n"
349 "movdqu 0x10(%0),%%xmm1 \n"
350 "movdqu 0x20(%0),%%xmm3 \n"
351 "lea 0x30(%0),%0 \n"
352 "movdqa %%xmm3,%%xmm2 \n"
353 "palignr $0x8,%%xmm1,%%xmm2 \n"
354 "pshufb %%xmm4,%%xmm2 \n"
355 "por %%xmm5,%%xmm2 \n"
356 "palignr $0xc,%%xmm0,%%xmm1 \n"
357 "pshufb %%xmm4,%%xmm0 \n"
358 "movdqa %%xmm2,0x20(%1) \n"
359 "por %%xmm5,%%xmm0 \n"
360 "pshufb %%xmm4,%%xmm1 \n"
361 "movdqa %%xmm0,(%1) \n"
362 "por %%xmm5,%%xmm1 \n"
363 "palignr $0x4,%%xmm3,%%xmm3 \n"
364 "pshufb %%xmm4,%%xmm3 \n"
365 "movdqa %%xmm1,0x10(%1) \n"
366 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000367 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000368 "movdqa %%xmm3,0x30(%1) \n"
369 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000370 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000371 : "+r"(src_raw), // %0
372 "+r"(dst_argb), // %1
373 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000374 : "m"(kShuffleMaskRAWToARGB) // %3
375 : "memory", "cc"
376#if defined(__SSE2__)
377 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
378#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000379 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000380}
381
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000382void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000383 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000384 "mov $0x1080108,%%eax \n"
385 "movd %%eax,%%xmm5 \n"
386 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000387 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000388 "movd %%eax,%%xmm6 \n"
389 "pshufd $0x0,%%xmm6,%%xmm6 \n"
390 "pcmpeqb %%xmm3,%%xmm3 \n"
391 "psllw $0xb,%%xmm3 \n"
392 "pcmpeqb %%xmm4,%%xmm4 \n"
393 "psllw $0xa,%%xmm4 \n"
394 "psrlw $0x5,%%xmm4 \n"
395 "pcmpeqb %%xmm7,%%xmm7 \n"
396 "psllw $0x8,%%xmm7 \n"
397 "sub %0,%1 \n"
398 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000399 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000400 "1: \n"
401 "movdqu (%0),%%xmm0 \n"
402 "movdqa %%xmm0,%%xmm1 \n"
403 "movdqa %%xmm0,%%xmm2 \n"
404 "pand %%xmm3,%%xmm1 \n"
405 "psllw $0xb,%%xmm2 \n"
406 "pmulhuw %%xmm5,%%xmm1 \n"
407 "pmulhuw %%xmm5,%%xmm2 \n"
408 "psllw $0x8,%%xmm1 \n"
409 "por %%xmm2,%%xmm1 \n"
410 "pand %%xmm4,%%xmm0 \n"
411 "pmulhuw %%xmm6,%%xmm0 \n"
412 "por %%xmm7,%%xmm0 \n"
413 "movdqa %%xmm1,%%xmm2 \n"
414 "punpcklbw %%xmm0,%%xmm1 \n"
415 "punpckhbw %%xmm0,%%xmm2 \n"
416 "movdqa %%xmm1,(%1,%0,2) \n"
417 "movdqa %%xmm2,0x10(%1,%0,2) \n"
418 "lea 0x10(%0),%0 \n"
419 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000420 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000421 : "+r"(src), // %0
422 "+r"(dst), // %1
423 "+r"(pix) // %2
424 :
425 : "memory", "cc", "eax"
426#if defined(__SSE2__)
427 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
428#endif
429 );
430}
431
432void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000433 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000434 "mov $0x1080108,%%eax \n"
435 "movd %%eax,%%xmm5 \n"
436 "pshufd $0x0,%%xmm5,%%xmm5 \n"
437 "mov $0x42004200,%%eax \n"
438 "movd %%eax,%%xmm6 \n"
439 "pshufd $0x0,%%xmm6,%%xmm6 \n"
440 "pcmpeqb %%xmm3,%%xmm3 \n"
441 "psllw $0xb,%%xmm3 \n"
442 "movdqa %%xmm3,%%xmm4 \n"
443 "psrlw $0x6,%%xmm4 \n"
444 "pcmpeqb %%xmm7,%%xmm7 \n"
445 "psllw $0x8,%%xmm7 \n"
446 "sub %0,%1 \n"
447 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000448 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000449 "1: \n"
450 "movdqu (%0),%%xmm0 \n"
451 "movdqa %%xmm0,%%xmm1 \n"
452 "movdqa %%xmm0,%%xmm2 \n"
453 "psllw $0x1,%%xmm1 \n"
454 "psllw $0xb,%%xmm2 \n"
455 "pand %%xmm3,%%xmm1 \n"
456 "pmulhuw %%xmm5,%%xmm2 \n"
457 "pmulhuw %%xmm5,%%xmm1 \n"
458 "psllw $0x8,%%xmm1 \n"
459 "por %%xmm2,%%xmm1 \n"
460 "movdqa %%xmm0,%%xmm2 \n"
461 "pand %%xmm4,%%xmm0 \n"
462 "psraw $0x8,%%xmm2 \n"
463 "pmulhuw %%xmm6,%%xmm0 \n"
464 "pand %%xmm7,%%xmm2 \n"
465 "por %%xmm2,%%xmm0 \n"
466 "movdqa %%xmm1,%%xmm2 \n"
467 "punpcklbw %%xmm0,%%xmm1 \n"
468 "punpckhbw %%xmm0,%%xmm2 \n"
469 "movdqa %%xmm1,(%1,%0,2) \n"
470 "movdqa %%xmm2,0x10(%1,%0,2) \n"
471 "lea 0x10(%0),%0 \n"
472 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000473 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000474 : "+r"(src), // %0
475 "+r"(dst), // %1
476 "+r"(pix) // %2
477 :
478 : "memory", "cc", "eax"
479#if defined(__SSE2__)
480 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
481#endif
482 );
483}
484
485void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000486 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000487 "mov $0xf0f0f0f,%%eax \n"
488 "movd %%eax,%%xmm4 \n"
489 "pshufd $0x0,%%xmm4,%%xmm4 \n"
490 "movdqa %%xmm4,%%xmm5 \n"
491 "pslld $0x4,%%xmm5 \n"
492 "sub %0,%1 \n"
493 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000494 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000495 "1: \n"
496 "movdqu (%0),%%xmm0 \n"
497 "movdqa %%xmm0,%%xmm2 \n"
498 "pand %%xmm4,%%xmm0 \n"
499 "pand %%xmm5,%%xmm2 \n"
500 "movdqa %%xmm0,%%xmm1 \n"
501 "movdqa %%xmm2,%%xmm3 \n"
502 "psllw $0x4,%%xmm1 \n"
503 "psrlw $0x4,%%xmm3 \n"
504 "por %%xmm1,%%xmm0 \n"
505 "por %%xmm3,%%xmm2 \n"
506 "movdqa %%xmm0,%%xmm1 \n"
507 "punpcklbw %%xmm2,%%xmm0 \n"
508 "punpckhbw %%xmm2,%%xmm1 \n"
509 "movdqa %%xmm0,(%1,%0,2) \n"
510 "movdqa %%xmm1,0x10(%1,%0,2) \n"
511 "lea 0x10(%0),%0 \n"
512 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000513 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000514 : "+r"(src), // %0
515 "+r"(dst), // %1
516 "+r"(pix) // %2
517 :
518 : "memory", "cc", "eax"
519#if defined(__SSE2__)
520 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
521#endif
522 );
523}
524
525void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000526 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000527 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000528 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000529 "1: \n"
530 "movdqa (%0),%%xmm0 \n"
531 "movdqa 0x10(%0),%%xmm1 \n"
532 "movdqa 0x20(%0),%%xmm2 \n"
533 "movdqa 0x30(%0),%%xmm3 \n"
534 "lea 0x40(%0),%0 \n"
535 "pshufb %%xmm6,%%xmm0 \n"
536 "pshufb %%xmm6,%%xmm1 \n"
537 "pshufb %%xmm6,%%xmm2 \n"
538 "pshufb %%xmm6,%%xmm3 \n"
539 "movdqa %%xmm1,%%xmm4 \n"
540 "psrldq $0x4,%%xmm1 \n"
541 "pslldq $0xc,%%xmm4 \n"
542 "movdqa %%xmm2,%%xmm5 \n"
543 "por %%xmm4,%%xmm0 \n"
544 "pslldq $0x8,%%xmm5 \n"
545 "movdqa %%xmm0,(%1) \n"
546 "por %%xmm5,%%xmm1 \n"
547 "psrldq $0x8,%%xmm2 \n"
548 "pslldq $0x4,%%xmm3 \n"
549 "por %%xmm3,%%xmm2 \n"
550 "movdqa %%xmm1,0x10(%1) \n"
551 "movdqa %%xmm2,0x20(%1) \n"
552 "lea 0x30(%1),%1 \n"
553 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000554 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000555 : "+r"(src), // %0
556 "+r"(dst), // %1
557 "+r"(pix) // %2
558 : "m"(kShuffleMaskARGBToRGB24) // %3
559 : "memory", "cc"
560#if defined(__SSE2__)
561 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
562#endif
563 );
564}
565
566void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000567 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000568 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000569 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000570 "1: \n"
571 "movdqa (%0),%%xmm0 \n"
572 "movdqa 0x10(%0),%%xmm1 \n"
573 "movdqa 0x20(%0),%%xmm2 \n"
574 "movdqa 0x30(%0),%%xmm3 \n"
575 "lea 0x40(%0),%0 \n"
576 "pshufb %%xmm6,%%xmm0 \n"
577 "pshufb %%xmm6,%%xmm1 \n"
578 "pshufb %%xmm6,%%xmm2 \n"
579 "pshufb %%xmm6,%%xmm3 \n"
580 "movdqa %%xmm1,%%xmm4 \n"
581 "psrldq $0x4,%%xmm1 \n"
582 "pslldq $0xc,%%xmm4 \n"
583 "movdqa %%xmm2,%%xmm5 \n"
584 "por %%xmm4,%%xmm0 \n"
585 "pslldq $0x8,%%xmm5 \n"
586 "movdqa %%xmm0,(%1) \n"
587 "por %%xmm5,%%xmm1 \n"
588 "psrldq $0x8,%%xmm2 \n"
589 "pslldq $0x4,%%xmm3 \n"
590 "por %%xmm3,%%xmm2 \n"
591 "movdqa %%xmm1,0x10(%1) \n"
592 "movdqa %%xmm2,0x20(%1) \n"
593 "lea 0x30(%1),%1 \n"
594 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000595 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000596 : "+r"(src), // %0
597 "+r"(dst), // %1
598 "+r"(pix) // %2
599 : "m"(kShuffleMaskARGBToRAW) // %3
600 : "memory", "cc"
601#if defined(__SSE2__)
602 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
603#endif
604 );
605}
606
fbarchard@google.comdbcabea2012-10-29 21:20:25 +0000607void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000608 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000609 "pcmpeqb %%xmm3,%%xmm3 \n"
610 "psrld $0x1b,%%xmm3 \n"
611 "pcmpeqb %%xmm4,%%xmm4 \n"
612 "psrld $0x1a,%%xmm4 \n"
613 "pslld $0x5,%%xmm4 \n"
614 "pcmpeqb %%xmm5,%%xmm5 \n"
615 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000616 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000617 "1: \n"
618 "movdqa (%0),%%xmm0 \n"
619 "movdqa %%xmm0,%%xmm1 \n"
620 "movdqa %%xmm0,%%xmm2 \n"
621 "pslld $0x8,%%xmm0 \n"
622 "psrld $0x3,%%xmm1 \n"
623 "psrld $0x5,%%xmm2 \n"
624 "psrad $0x10,%%xmm0 \n"
625 "pand %%xmm3,%%xmm1 \n"
626 "pand %%xmm4,%%xmm2 \n"
627 "pand %%xmm5,%%xmm0 \n"
628 "por %%xmm2,%%xmm1 \n"
629 "por %%xmm1,%%xmm0 \n"
630 "packssdw %%xmm0,%%xmm0 \n"
631 "lea 0x10(%0),%0 \n"
632 "movq %%xmm0,(%1) \n"
633 "lea 0x8(%1),%1 \n"
634 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000635 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000636 : "+r"(src), // %0
637 "+r"(dst), // %1
638 "+r"(pix) // %2
639 :
640 : "memory", "cc"
641#if defined(__SSE2__)
642 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
643#endif
644 );
645}
646
647void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000648 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000649 "pcmpeqb %%xmm4,%%xmm4 \n"
650 "psrld $0x1b,%%xmm4 \n"
651 "movdqa %%xmm4,%%xmm5 \n"
652 "pslld $0x5,%%xmm5 \n"
653 "movdqa %%xmm4,%%xmm6 \n"
654 "pslld $0xa,%%xmm6 \n"
655 "pcmpeqb %%xmm7,%%xmm7 \n"
656 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000657 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000658 "1: \n"
659 "movdqa (%0),%%xmm0 \n"
660 "movdqa %%xmm0,%%xmm1 \n"
661 "movdqa %%xmm0,%%xmm2 \n"
662 "movdqa %%xmm0,%%xmm3 \n"
663 "psrad $0x10,%%xmm0 \n"
664 "psrld $0x3,%%xmm1 \n"
665 "psrld $0x6,%%xmm2 \n"
666 "psrld $0x9,%%xmm3 \n"
667 "pand %%xmm7,%%xmm0 \n"
668 "pand %%xmm4,%%xmm1 \n"
669 "pand %%xmm5,%%xmm2 \n"
670 "pand %%xmm6,%%xmm3 \n"
671 "por %%xmm1,%%xmm0 \n"
672 "por %%xmm3,%%xmm2 \n"
673 "por %%xmm2,%%xmm0 \n"
674 "packssdw %%xmm0,%%xmm0 \n"
675 "lea 0x10(%0),%0 \n"
676 "movq %%xmm0,(%1) \n"
677 "lea 0x8(%1),%1 \n"
678 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000679 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000680 : "+r"(src), // %0
681 "+r"(dst), // %1
682 "+r"(pix) // %2
683 :
684 : "memory", "cc"
685#if defined(__SSE2__)
686 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
687#endif
688 );
689}
690
691void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000692 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000693 "pcmpeqb %%xmm4,%%xmm4 \n"
694 "psllw $0xc,%%xmm4 \n"
695 "movdqa %%xmm4,%%xmm3 \n"
696 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000697 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000698 "1: \n"
699 "movdqa (%0),%%xmm0 \n"
700 "movdqa %%xmm0,%%xmm1 \n"
701 "pand %%xmm3,%%xmm0 \n"
702 "pand %%xmm4,%%xmm1 \n"
703 "psrlq $0x4,%%xmm0 \n"
704 "psrlq $0x8,%%xmm1 \n"
705 "por %%xmm1,%%xmm0 \n"
706 "packuswb %%xmm0,%%xmm0 \n"
707 "lea 0x10(%0),%0 \n"
708 "movq %%xmm0,(%1) \n"
709 "lea 0x8(%1),%1 \n"
710 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000711 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000712 : "+r"(src), // %0
713 "+r"(dst), // %1
714 "+r"(pix) // %2
715 :
716 : "memory", "cc"
717#if defined(__SSE2__)
718 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
719#endif
720 );
721}
722
fbarchard@google.comb6149762011-11-07 21:58:52 +0000723void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000724 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000725 "movdqa %4,%%xmm5 \n"
726 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000727 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000728 "1: \n"
729 "movdqa (%0),%%xmm0 \n"
730 "movdqa 0x10(%0),%%xmm1 \n"
731 "movdqa 0x20(%0),%%xmm2 \n"
732 "movdqa 0x30(%0),%%xmm3 \n"
733 "pmaddubsw %%xmm4,%%xmm0 \n"
734 "pmaddubsw %%xmm4,%%xmm1 \n"
735 "pmaddubsw %%xmm4,%%xmm2 \n"
736 "pmaddubsw %%xmm4,%%xmm3 \n"
737 "lea 0x40(%0),%0 \n"
738 "phaddw %%xmm1,%%xmm0 \n"
739 "phaddw %%xmm3,%%xmm2 \n"
740 "psrlw $0x7,%%xmm0 \n"
741 "psrlw $0x7,%%xmm2 \n"
742 "packuswb %%xmm2,%%xmm0 \n"
743 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000744 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000745 "movdqa %%xmm0,(%1) \n"
746 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000747 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000748 : "+r"(src_argb), // %0
749 "+r"(dst_y), // %1
750 "+r"(pix) // %2
751 : "m"(kARGBToY), // %3
752 "m"(kAddY16) // %4
753 : "memory", "cc"
754#if defined(__SSE2__)
755 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
756#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000757 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000758}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000759
760void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000761 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000762 "movdqa %4,%%xmm5 \n"
763 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000764 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000765 "1: \n"
766 "movdqu (%0),%%xmm0 \n"
767 "movdqu 0x10(%0),%%xmm1 \n"
768 "movdqu 0x20(%0),%%xmm2 \n"
769 "movdqu 0x30(%0),%%xmm3 \n"
770 "pmaddubsw %%xmm4,%%xmm0 \n"
771 "pmaddubsw %%xmm4,%%xmm1 \n"
772 "pmaddubsw %%xmm4,%%xmm2 \n"
773 "pmaddubsw %%xmm4,%%xmm3 \n"
774 "lea 0x40(%0),%0 \n"
775 "phaddw %%xmm1,%%xmm0 \n"
776 "phaddw %%xmm3,%%xmm2 \n"
777 "psrlw $0x7,%%xmm0 \n"
778 "psrlw $0x7,%%xmm2 \n"
779 "packuswb %%xmm2,%%xmm0 \n"
780 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000781 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000782 "movdqu %%xmm0,(%1) \n"
783 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000784 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000785 : "+r"(src_argb), // %0
786 "+r"(dst_y), // %1
787 "+r"(pix) // %2
788 : "m"(kARGBToY), // %3
789 "m"(kAddY16) // %4
790 : "memory", "cc"
791#if defined(__SSE2__)
792 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
793#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000794 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000795}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000796
fbarchard@google.com714050a2012-02-17 22:59:56 +0000797// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000798// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
799// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
800// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000801// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000802void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
803 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000804 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000805 "movdqa %0,%%xmm4 \n"
806 "movdqa %1,%%xmm3 \n"
807 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000808 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000809 : "m"(kARGBToU), // %0
810 "m"(kARGBToV), // %1
811 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000812 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000813 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000814 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000815 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000816 "1: \n"
817 "movdqa (%0),%%xmm0 \n"
818 "movdqa 0x10(%0),%%xmm1 \n"
819 "movdqa 0x20(%0),%%xmm2 \n"
820 "movdqa 0x30(%0),%%xmm6 \n"
821 "pavgb (%0,%4,1),%%xmm0 \n"
822 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
823 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
824 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
825 "lea 0x40(%0),%0 \n"
826 "movdqa %%xmm0,%%xmm7 \n"
827 "shufps $0x88,%%xmm1,%%xmm0 \n"
828 "shufps $0xdd,%%xmm1,%%xmm7 \n"
829 "pavgb %%xmm7,%%xmm0 \n"
830 "movdqa %%xmm2,%%xmm7 \n"
831 "shufps $0x88,%%xmm6,%%xmm2 \n"
832 "shufps $0xdd,%%xmm6,%%xmm7 \n"
833 "pavgb %%xmm7,%%xmm2 \n"
834 "movdqa %%xmm0,%%xmm1 \n"
835 "movdqa %%xmm2,%%xmm6 \n"
836 "pmaddubsw %%xmm4,%%xmm0 \n"
837 "pmaddubsw %%xmm4,%%xmm2 \n"
838 "pmaddubsw %%xmm3,%%xmm1 \n"
839 "pmaddubsw %%xmm3,%%xmm6 \n"
840 "phaddw %%xmm2,%%xmm0 \n"
841 "phaddw %%xmm6,%%xmm1 \n"
842 "psraw $0x8,%%xmm0 \n"
843 "psraw $0x8,%%xmm1 \n"
844 "packsswb %%xmm1,%%xmm0 \n"
845 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000846 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000847 "movlps %%xmm0,(%1) \n"
848 "movhps %%xmm0,(%1,%2,1) \n"
849 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000850 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000851 : "+r"(src_argb0), // %0
852 "+r"(dst_u), // %1
853 "+r"(dst_v), // %2
854 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000855 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000856 : "memory", "cc"
857#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000858 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000859#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000860 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000861}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000862
863void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
864 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000865 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000866 "movdqa %0,%%xmm4 \n"
867 "movdqa %1,%%xmm3 \n"
868 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000869 :
870 : "m"(kARGBToU), // %0
871 "m"(kARGBToV), // %1
872 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000873 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000874 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000875 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000876 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000877 "1: \n"
878 "movdqu (%0),%%xmm0 \n"
879 "movdqu 0x10(%0),%%xmm1 \n"
880 "movdqu 0x20(%0),%%xmm2 \n"
881 "movdqu 0x30(%0),%%xmm6 \n"
882 "movdqu (%0,%4,1),%%xmm7 \n"
883 "pavgb %%xmm7,%%xmm0 \n"
884 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
885 "pavgb %%xmm7,%%xmm1 \n"
886 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
887 "pavgb %%xmm7,%%xmm2 \n"
888 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
889 "pavgb %%xmm7,%%xmm6 \n"
890 "lea 0x40(%0),%0 \n"
891 "movdqa %%xmm0,%%xmm7 \n"
892 "shufps $0x88,%%xmm1,%%xmm0 \n"
893 "shufps $0xdd,%%xmm1,%%xmm7 \n"
894 "pavgb %%xmm7,%%xmm0 \n"
895 "movdqa %%xmm2,%%xmm7 \n"
896 "shufps $0x88,%%xmm6,%%xmm2 \n"
897 "shufps $0xdd,%%xmm6,%%xmm7 \n"
898 "pavgb %%xmm7,%%xmm2 \n"
899 "movdqa %%xmm0,%%xmm1 \n"
900 "movdqa %%xmm2,%%xmm6 \n"
901 "pmaddubsw %%xmm4,%%xmm0 \n"
902 "pmaddubsw %%xmm4,%%xmm2 \n"
903 "pmaddubsw %%xmm3,%%xmm1 \n"
904 "pmaddubsw %%xmm3,%%xmm6 \n"
905 "phaddw %%xmm2,%%xmm0 \n"
906 "phaddw %%xmm6,%%xmm1 \n"
907 "psraw $0x8,%%xmm0 \n"
908 "psraw $0x8,%%xmm1 \n"
909 "packsswb %%xmm1,%%xmm0 \n"
910 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000911 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000912 "movlps %%xmm0,(%1) \n"
913 "movhps %%xmm0,(%1,%2,1) \n"
914 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000915 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000916 : "+r"(src_argb0), // %0
917 "+r"(dst_u), // %1
918 "+r"(dst_v), // %2
919 "+rm"(width) // %3
920 : "r"(static_cast<intptr_t>(src_stride_argb))
921 : "memory", "cc"
922#if defined(__SSE2__)
923 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
924#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000925 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000926}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000927
fbarchard@google.com714050a2012-02-17 22:59:56 +0000928void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000929 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000930 "movdqa %4,%%xmm5 \n"
931 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000932 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000933 "1: \n"
934 "movdqa (%0),%%xmm0 \n"
935 "movdqa 0x10(%0),%%xmm1 \n"
936 "movdqa 0x20(%0),%%xmm2 \n"
937 "movdqa 0x30(%0),%%xmm3 \n"
938 "pmaddubsw %%xmm4,%%xmm0 \n"
939 "pmaddubsw %%xmm4,%%xmm1 \n"
940 "pmaddubsw %%xmm4,%%xmm2 \n"
941 "pmaddubsw %%xmm4,%%xmm3 \n"
942 "lea 0x40(%0),%0 \n"
943 "phaddw %%xmm1,%%xmm0 \n"
944 "phaddw %%xmm3,%%xmm2 \n"
945 "psrlw $0x7,%%xmm0 \n"
946 "psrlw $0x7,%%xmm2 \n"
947 "packuswb %%xmm2,%%xmm0 \n"
948 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000949 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000950 "movdqa %%xmm0,(%1) \n"
951 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000952 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000953 : "+r"(src_bgra), // %0
954 "+r"(dst_y), // %1
955 "+r"(pix) // %2
956 : "m"(kBGRAToY), // %3
957 "m"(kAddY16) // %4
958 : "memory", "cc"
959#if defined(__SSE2__)
960 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000961#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000962 );
963}
964
965void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000966 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000967 "movdqa %4,%%xmm5 \n"
968 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000969 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000970 "1: \n"
971 "movdqu (%0),%%xmm0 \n"
972 "movdqu 0x10(%0),%%xmm1 \n"
973 "movdqu 0x20(%0),%%xmm2 \n"
974 "movdqu 0x30(%0),%%xmm3 \n"
975 "pmaddubsw %%xmm4,%%xmm0 \n"
976 "pmaddubsw %%xmm4,%%xmm1 \n"
977 "pmaddubsw %%xmm4,%%xmm2 \n"
978 "pmaddubsw %%xmm4,%%xmm3 \n"
979 "lea 0x40(%0),%0 \n"
980 "phaddw %%xmm1,%%xmm0 \n"
981 "phaddw %%xmm3,%%xmm2 \n"
982 "psrlw $0x7,%%xmm0 \n"
983 "psrlw $0x7,%%xmm2 \n"
984 "packuswb %%xmm2,%%xmm0 \n"
985 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000986 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000987 "movdqu %%xmm0,(%1) \n"
988 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000989 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000990 : "+r"(src_bgra), // %0
991 "+r"(dst_y), // %1
992 "+r"(pix) // %2
993 : "m"(kBGRAToY), // %3
994 "m"(kAddY16) // %4
995 : "memory", "cc"
996#if defined(__SSE2__)
997 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
998#endif
999 );
1000}
1001
1002void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1003 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001004 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001005 "movdqa %0,%%xmm4 \n"
1006 "movdqa %1,%%xmm3 \n"
1007 "movdqa %2,%%xmm5 \n"
1008 :
1009 : "m"(kBGRAToU), // %0
1010 "m"(kBGRAToV), // %1
1011 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001012 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001013 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001014 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001015 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001016 "1: \n"
1017 "movdqa (%0),%%xmm0 \n"
1018 "movdqa 0x10(%0),%%xmm1 \n"
1019 "movdqa 0x20(%0),%%xmm2 \n"
1020 "movdqa 0x30(%0),%%xmm6 \n"
1021 "pavgb (%0,%4,1),%%xmm0 \n"
1022 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1023 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1024 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1025 "lea 0x40(%0),%0 \n"
1026 "movdqa %%xmm0,%%xmm7 \n"
1027 "shufps $0x88,%%xmm1,%%xmm0 \n"
1028 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1029 "pavgb %%xmm7,%%xmm0 \n"
1030 "movdqa %%xmm2,%%xmm7 \n"
1031 "shufps $0x88,%%xmm6,%%xmm2 \n"
1032 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1033 "pavgb %%xmm7,%%xmm2 \n"
1034 "movdqa %%xmm0,%%xmm1 \n"
1035 "movdqa %%xmm2,%%xmm6 \n"
1036 "pmaddubsw %%xmm4,%%xmm0 \n"
1037 "pmaddubsw %%xmm4,%%xmm2 \n"
1038 "pmaddubsw %%xmm3,%%xmm1 \n"
1039 "pmaddubsw %%xmm3,%%xmm6 \n"
1040 "phaddw %%xmm2,%%xmm0 \n"
1041 "phaddw %%xmm6,%%xmm1 \n"
1042 "psraw $0x8,%%xmm0 \n"
1043 "psraw $0x8,%%xmm1 \n"
1044 "packsswb %%xmm1,%%xmm0 \n"
1045 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001046 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001047 "movlps %%xmm0,(%1) \n"
1048 "movhps %%xmm0,(%1,%2,1) \n"
1049 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001050 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001051 : "+r"(src_bgra0), // %0
1052 "+r"(dst_u), // %1
1053 "+r"(dst_v), // %2
1054 "+rm"(width) // %3
1055 : "r"(static_cast<intptr_t>(src_stride_bgra))
1056 : "memory", "cc"
1057#if defined(__SSE2__)
1058 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1059#endif
1060 );
1061}
1062
1063void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1064 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001065 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001066 "movdqa %0,%%xmm4 \n"
1067 "movdqa %1,%%xmm3 \n"
1068 "movdqa %2,%%xmm5 \n"
1069 :
1070 : "m"(kBGRAToU), // %0
1071 "m"(kBGRAToV), // %1
1072 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001073 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001074 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001075 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001076 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001077 "1: \n"
1078 "movdqu (%0),%%xmm0 \n"
1079 "movdqu 0x10(%0),%%xmm1 \n"
1080 "movdqu 0x20(%0),%%xmm2 \n"
1081 "movdqu 0x30(%0),%%xmm6 \n"
1082 "movdqu (%0,%4,1),%%xmm7 \n"
1083 "pavgb %%xmm7,%%xmm0 \n"
1084 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1085 "pavgb %%xmm7,%%xmm1 \n"
1086 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1087 "pavgb %%xmm7,%%xmm2 \n"
1088 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1089 "pavgb %%xmm7,%%xmm6 \n"
1090 "lea 0x40(%0),%0 \n"
1091 "movdqa %%xmm0,%%xmm7 \n"
1092 "shufps $0x88,%%xmm1,%%xmm0 \n"
1093 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1094 "pavgb %%xmm7,%%xmm0 \n"
1095 "movdqa %%xmm2,%%xmm7 \n"
1096 "shufps $0x88,%%xmm6,%%xmm2 \n"
1097 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1098 "pavgb %%xmm7,%%xmm2 \n"
1099 "movdqa %%xmm0,%%xmm1 \n"
1100 "movdqa %%xmm2,%%xmm6 \n"
1101 "pmaddubsw %%xmm4,%%xmm0 \n"
1102 "pmaddubsw %%xmm4,%%xmm2 \n"
1103 "pmaddubsw %%xmm3,%%xmm1 \n"
1104 "pmaddubsw %%xmm3,%%xmm6 \n"
1105 "phaddw %%xmm2,%%xmm0 \n"
1106 "phaddw %%xmm6,%%xmm1 \n"
1107 "psraw $0x8,%%xmm0 \n"
1108 "psraw $0x8,%%xmm1 \n"
1109 "packsswb %%xmm1,%%xmm0 \n"
1110 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001111 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001112 "movlps %%xmm0,(%1) \n"
1113 "movhps %%xmm0,(%1,%2,1) \n"
1114 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001115 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001116 : "+r"(src_bgra0), // %0
1117 "+r"(dst_u), // %1
1118 "+r"(dst_v), // %2
1119 "+rm"(width) // %3
1120 : "r"(static_cast<intptr_t>(src_stride_bgra))
1121 : "memory", "cc"
1122#if defined(__SSE2__)
1123 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1124#endif
1125 );
1126}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001127
1128void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001129 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001130 "movdqa %4,%%xmm5 \n"
1131 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001132 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001133 "1: \n"
1134 "movdqa (%0),%%xmm0 \n"
1135 "movdqa 0x10(%0),%%xmm1 \n"
1136 "movdqa 0x20(%0),%%xmm2 \n"
1137 "movdqa 0x30(%0),%%xmm3 \n"
1138 "pmaddubsw %%xmm4,%%xmm0 \n"
1139 "pmaddubsw %%xmm4,%%xmm1 \n"
1140 "pmaddubsw %%xmm4,%%xmm2 \n"
1141 "pmaddubsw %%xmm4,%%xmm3 \n"
1142 "lea 0x40(%0),%0 \n"
1143 "phaddw %%xmm1,%%xmm0 \n"
1144 "phaddw %%xmm3,%%xmm2 \n"
1145 "psrlw $0x7,%%xmm0 \n"
1146 "psrlw $0x7,%%xmm2 \n"
1147 "packuswb %%xmm2,%%xmm0 \n"
1148 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001149 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001150 "movdqa %%xmm0,(%1) \n"
1151 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001152 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001153 : "+r"(src_abgr), // %0
1154 "+r"(dst_y), // %1
1155 "+r"(pix) // %2
1156 : "m"(kABGRToY), // %3
1157 "m"(kAddY16) // %4
1158 : "memory", "cc"
1159#if defined(__SSE2__)
1160 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1161#endif
1162 );
1163}
1164
1165void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001166 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001167 "movdqa %4,%%xmm5 \n"
1168 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001169 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001170 "1: \n"
1171 "movdqu (%0),%%xmm0 \n"
1172 "movdqu 0x10(%0),%%xmm1 \n"
1173 "movdqu 0x20(%0),%%xmm2 \n"
1174 "movdqu 0x30(%0),%%xmm3 \n"
1175 "pmaddubsw %%xmm4,%%xmm0 \n"
1176 "pmaddubsw %%xmm4,%%xmm1 \n"
1177 "pmaddubsw %%xmm4,%%xmm2 \n"
1178 "pmaddubsw %%xmm4,%%xmm3 \n"
1179 "lea 0x40(%0),%0 \n"
1180 "phaddw %%xmm1,%%xmm0 \n"
1181 "phaddw %%xmm3,%%xmm2 \n"
1182 "psrlw $0x7,%%xmm0 \n"
1183 "psrlw $0x7,%%xmm2 \n"
1184 "packuswb %%xmm2,%%xmm0 \n"
1185 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001186 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001187 "movdqu %%xmm0,(%1) \n"
1188 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001189 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001190 : "+r"(src_abgr), // %0
1191 "+r"(dst_y), // %1
1192 "+r"(pix) // %2
1193 : "m"(kABGRToY), // %3
1194 "m"(kAddY16) // %4
1195 : "memory", "cc"
1196#if defined(__SSE2__)
1197 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1198#endif
1199 );
1200}
1201
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001202void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1203 asm volatile (
1204 "movdqa %4,%%xmm5 \n"
1205 "movdqa %3,%%xmm4 \n"
1206 ".p2align 4 \n"
1207 "1: \n"
1208 "movdqa (%0),%%xmm0 \n"
1209 "movdqa 0x10(%0),%%xmm1 \n"
1210 "movdqa 0x20(%0),%%xmm2 \n"
1211 "movdqa 0x30(%0),%%xmm3 \n"
1212 "pmaddubsw %%xmm4,%%xmm0 \n"
1213 "pmaddubsw %%xmm4,%%xmm1 \n"
1214 "pmaddubsw %%xmm4,%%xmm2 \n"
1215 "pmaddubsw %%xmm4,%%xmm3 \n"
1216 "lea 0x40(%0),%0 \n"
1217 "phaddw %%xmm1,%%xmm0 \n"
1218 "phaddw %%xmm3,%%xmm2 \n"
1219 "psrlw $0x7,%%xmm0 \n"
1220 "psrlw $0x7,%%xmm2 \n"
1221 "packuswb %%xmm2,%%xmm0 \n"
1222 "paddb %%xmm5,%%xmm0 \n"
1223 "sub $0x10,%2 \n"
1224 "movdqa %%xmm0,(%1) \n"
1225 "lea 0x10(%1),%1 \n"
1226 "jg 1b \n"
1227 : "+r"(src_rgba), // %0
1228 "+r"(dst_y), // %1
1229 "+r"(pix) // %2
1230 : "m"(kRGBAToY), // %3
1231 "m"(kAddY16) // %4
1232 : "memory", "cc"
1233#if defined(__SSE2__)
1234 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1235#endif
1236 );
1237}
1238
1239void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1240 asm volatile (
1241 "movdqa %4,%%xmm5 \n"
1242 "movdqa %3,%%xmm4 \n"
1243 ".p2align 4 \n"
1244 "1: \n"
1245 "movdqu (%0),%%xmm0 \n"
1246 "movdqu 0x10(%0),%%xmm1 \n"
1247 "movdqu 0x20(%0),%%xmm2 \n"
1248 "movdqu 0x30(%0),%%xmm3 \n"
1249 "pmaddubsw %%xmm4,%%xmm0 \n"
1250 "pmaddubsw %%xmm4,%%xmm1 \n"
1251 "pmaddubsw %%xmm4,%%xmm2 \n"
1252 "pmaddubsw %%xmm4,%%xmm3 \n"
1253 "lea 0x40(%0),%0 \n"
1254 "phaddw %%xmm1,%%xmm0 \n"
1255 "phaddw %%xmm3,%%xmm2 \n"
1256 "psrlw $0x7,%%xmm0 \n"
1257 "psrlw $0x7,%%xmm2 \n"
1258 "packuswb %%xmm2,%%xmm0 \n"
1259 "paddb %%xmm5,%%xmm0 \n"
1260 "sub $0x10,%2 \n"
1261 "movdqu %%xmm0,(%1) \n"
1262 "lea 0x10(%1),%1 \n"
1263 "jg 1b \n"
1264 : "+r"(src_rgba), // %0
1265 "+r"(dst_y), // %1
1266 "+r"(pix) // %2
1267 : "m"(kRGBAToY), // %3
1268 "m"(kAddY16) // %4
1269 : "memory", "cc"
1270#if defined(__SSE2__)
1271 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1272#endif
1273 );
1274}
1275
fbarchard@google.com714050a2012-02-17 22:59:56 +00001276void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1277 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001278 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001279 "movdqa %0,%%xmm4 \n"
1280 "movdqa %1,%%xmm3 \n"
1281 "movdqa %2,%%xmm5 \n"
1282 :
1283 : "m"(kABGRToU), // %0
1284 "m"(kABGRToV), // %1
1285 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001286 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001287 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001288 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001289 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001290 "1: \n"
1291 "movdqa (%0),%%xmm0 \n"
1292 "movdqa 0x10(%0),%%xmm1 \n"
1293 "movdqa 0x20(%0),%%xmm2 \n"
1294 "movdqa 0x30(%0),%%xmm6 \n"
1295 "pavgb (%0,%4,1),%%xmm0 \n"
1296 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1297 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1298 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1299 "lea 0x40(%0),%0 \n"
1300 "movdqa %%xmm0,%%xmm7 \n"
1301 "shufps $0x88,%%xmm1,%%xmm0 \n"
1302 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1303 "pavgb %%xmm7,%%xmm0 \n"
1304 "movdqa %%xmm2,%%xmm7 \n"
1305 "shufps $0x88,%%xmm6,%%xmm2 \n"
1306 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1307 "pavgb %%xmm7,%%xmm2 \n"
1308 "movdqa %%xmm0,%%xmm1 \n"
1309 "movdqa %%xmm2,%%xmm6 \n"
1310 "pmaddubsw %%xmm4,%%xmm0 \n"
1311 "pmaddubsw %%xmm4,%%xmm2 \n"
1312 "pmaddubsw %%xmm3,%%xmm1 \n"
1313 "pmaddubsw %%xmm3,%%xmm6 \n"
1314 "phaddw %%xmm2,%%xmm0 \n"
1315 "phaddw %%xmm6,%%xmm1 \n"
1316 "psraw $0x8,%%xmm0 \n"
1317 "psraw $0x8,%%xmm1 \n"
1318 "packsswb %%xmm1,%%xmm0 \n"
1319 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001320 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001321 "movlps %%xmm0,(%1) \n"
1322 "movhps %%xmm0,(%1,%2,1) \n"
1323 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001324 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001325 : "+r"(src_abgr0), // %0
1326 "+r"(dst_u), // %1
1327 "+r"(dst_v), // %2
1328 "+rm"(width) // %3
1329 : "r"(static_cast<intptr_t>(src_stride_abgr))
1330 : "memory", "cc"
1331#if defined(__SSE2__)
1332 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1333#endif
1334 );
1335}
1336
1337void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1338 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001339 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001340 "movdqa %0,%%xmm4 \n"
1341 "movdqa %1,%%xmm3 \n"
1342 "movdqa %2,%%xmm5 \n"
1343 :
1344 : "m"(kABGRToU), // %0
1345 "m"(kABGRToV), // %1
1346 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001347 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001348 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001349 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001350 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001351 "1: \n"
1352 "movdqu (%0),%%xmm0 \n"
1353 "movdqu 0x10(%0),%%xmm1 \n"
1354 "movdqu 0x20(%0),%%xmm2 \n"
1355 "movdqu 0x30(%0),%%xmm6 \n"
1356 "movdqu (%0,%4,1),%%xmm7 \n"
1357 "pavgb %%xmm7,%%xmm0 \n"
1358 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1359 "pavgb %%xmm7,%%xmm1 \n"
1360 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1361 "pavgb %%xmm7,%%xmm2 \n"
1362 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1363 "pavgb %%xmm7,%%xmm6 \n"
1364 "lea 0x40(%0),%0 \n"
1365 "movdqa %%xmm0,%%xmm7 \n"
1366 "shufps $0x88,%%xmm1,%%xmm0 \n"
1367 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1368 "pavgb %%xmm7,%%xmm0 \n"
1369 "movdqa %%xmm2,%%xmm7 \n"
1370 "shufps $0x88,%%xmm6,%%xmm2 \n"
1371 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1372 "pavgb %%xmm7,%%xmm2 \n"
1373 "movdqa %%xmm0,%%xmm1 \n"
1374 "movdqa %%xmm2,%%xmm6 \n"
1375 "pmaddubsw %%xmm4,%%xmm0 \n"
1376 "pmaddubsw %%xmm4,%%xmm2 \n"
1377 "pmaddubsw %%xmm3,%%xmm1 \n"
1378 "pmaddubsw %%xmm3,%%xmm6 \n"
1379 "phaddw %%xmm2,%%xmm0 \n"
1380 "phaddw %%xmm6,%%xmm1 \n"
1381 "psraw $0x8,%%xmm0 \n"
1382 "psraw $0x8,%%xmm1 \n"
1383 "packsswb %%xmm1,%%xmm0 \n"
1384 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001385 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001386 "movlps %%xmm0,(%1) \n"
1387 "movhps %%xmm0,(%1,%2,1) \n"
1388 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001389 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001390 : "+r"(src_abgr0), // %0
1391 "+r"(dst_u), // %1
1392 "+r"(dst_v), // %2
1393 "+rm"(width) // %3
1394 : "r"(static_cast<intptr_t>(src_stride_abgr))
1395 : "memory", "cc"
1396#if defined(__SSE2__)
1397 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1398#endif
1399 );
1400}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001401
1402void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1403 uint8* dst_u, uint8* dst_v, int width) {
1404 asm volatile (
1405 "movdqa %0,%%xmm4 \n"
1406 "movdqa %1,%%xmm3 \n"
1407 "movdqa %2,%%xmm5 \n"
1408 :
1409 : "m"(kRGBAToU), // %0
1410 "m"(kRGBAToV), // %1
1411 "m"(kAddUV128) // %2
1412 );
1413 asm volatile (
1414 "sub %1,%2 \n"
1415 ".p2align 4 \n"
1416 "1: \n"
1417 "movdqa (%0),%%xmm0 \n"
1418 "movdqa 0x10(%0),%%xmm1 \n"
1419 "movdqa 0x20(%0),%%xmm2 \n"
1420 "movdqa 0x30(%0),%%xmm6 \n"
1421 "pavgb (%0,%4,1),%%xmm0 \n"
1422 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1423 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1424 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1425 "lea 0x40(%0),%0 \n"
1426 "movdqa %%xmm0,%%xmm7 \n"
1427 "shufps $0x88,%%xmm1,%%xmm0 \n"
1428 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1429 "pavgb %%xmm7,%%xmm0 \n"
1430 "movdqa %%xmm2,%%xmm7 \n"
1431 "shufps $0x88,%%xmm6,%%xmm2 \n"
1432 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1433 "pavgb %%xmm7,%%xmm2 \n"
1434 "movdqa %%xmm0,%%xmm1 \n"
1435 "movdqa %%xmm2,%%xmm6 \n"
1436 "pmaddubsw %%xmm4,%%xmm0 \n"
1437 "pmaddubsw %%xmm4,%%xmm2 \n"
1438 "pmaddubsw %%xmm3,%%xmm1 \n"
1439 "pmaddubsw %%xmm3,%%xmm6 \n"
1440 "phaddw %%xmm2,%%xmm0 \n"
1441 "phaddw %%xmm6,%%xmm1 \n"
1442 "psraw $0x8,%%xmm0 \n"
1443 "psraw $0x8,%%xmm1 \n"
1444 "packsswb %%xmm1,%%xmm0 \n"
1445 "paddb %%xmm5,%%xmm0 \n"
1446 "sub $0x10,%3 \n"
1447 "movlps %%xmm0,(%1) \n"
1448 "movhps %%xmm0,(%1,%2,1) \n"
1449 "lea 0x8(%1),%1 \n"
1450 "jg 1b \n"
1451 : "+r"(src_rgba0), // %0
1452 "+r"(dst_u), // %1
1453 "+r"(dst_v), // %2
1454 "+rm"(width) // %3
1455 : "r"(static_cast<intptr_t>(src_stride_rgba))
1456 : "memory", "cc"
1457#if defined(__SSE2__)
1458 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1459#endif
1460 );
1461}
1462
1463void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1464 uint8* dst_u, uint8* dst_v, int width) {
1465 asm volatile (
1466 "movdqa %0,%%xmm4 \n"
1467 "movdqa %1,%%xmm3 \n"
1468 "movdqa %2,%%xmm5 \n"
1469 :
1470 : "m"(kRGBAToU), // %0
1471 "m"(kRGBAToV), // %1
1472 "m"(kAddUV128) // %2
1473 );
1474 asm volatile (
1475 "sub %1,%2 \n"
1476 ".p2align 4 \n"
1477 "1: \n"
1478 "movdqu (%0),%%xmm0 \n"
1479 "movdqu 0x10(%0),%%xmm1 \n"
1480 "movdqu 0x20(%0),%%xmm2 \n"
1481 "movdqu 0x30(%0),%%xmm6 \n"
1482 "movdqu (%0,%4,1),%%xmm7 \n"
1483 "pavgb %%xmm7,%%xmm0 \n"
1484 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1485 "pavgb %%xmm7,%%xmm1 \n"
1486 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1487 "pavgb %%xmm7,%%xmm2 \n"
1488 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1489 "pavgb %%xmm7,%%xmm6 \n"
1490 "lea 0x40(%0),%0 \n"
1491 "movdqa %%xmm0,%%xmm7 \n"
1492 "shufps $0x88,%%xmm1,%%xmm0 \n"
1493 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1494 "pavgb %%xmm7,%%xmm0 \n"
1495 "movdqa %%xmm2,%%xmm7 \n"
1496 "shufps $0x88,%%xmm6,%%xmm2 \n"
1497 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1498 "pavgb %%xmm7,%%xmm2 \n"
1499 "movdqa %%xmm0,%%xmm1 \n"
1500 "movdqa %%xmm2,%%xmm6 \n"
1501 "pmaddubsw %%xmm4,%%xmm0 \n"
1502 "pmaddubsw %%xmm4,%%xmm2 \n"
1503 "pmaddubsw %%xmm3,%%xmm1 \n"
1504 "pmaddubsw %%xmm3,%%xmm6 \n"
1505 "phaddw %%xmm2,%%xmm0 \n"
1506 "phaddw %%xmm6,%%xmm1 \n"
1507 "psraw $0x8,%%xmm0 \n"
1508 "psraw $0x8,%%xmm1 \n"
1509 "packsswb %%xmm1,%%xmm0 \n"
1510 "paddb %%xmm5,%%xmm0 \n"
1511 "sub $0x10,%3 \n"
1512 "movlps %%xmm0,(%1) \n"
1513 "movhps %%xmm0,(%1,%2,1) \n"
1514 "lea 0x8(%1),%1 \n"
1515 "jg 1b \n"
1516 : "+r"(src_rgba0), // %0
1517 "+r"(dst_u), // %1
1518 "+r"(dst_v), // %2
1519 "+rm"(width) // %3
1520 : "r"(static_cast<intptr_t>(src_stride_rgba))
1521 : "memory", "cc"
1522#if defined(__SSE2__)
1523 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1524#endif
1525 );
1526}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001527#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001528
fbarchard@google.come214fe32012-06-04 23:47:11 +00001529#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients, scaled by 64 and stored as int8.
// Every expansion is parenthesized so the macros stay correct when used
// inside a larger expression (for example -BG or BB - BG).
#define UB (127) /* min(127, 2.018 * 64): clamped to the int8 maximum */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR (0)

#define VB (0)
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR (102) /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the weighted sum of U and V evaluated at U = V = 128.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG (74) /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001544
// Constant table read by the YUVTORGB / YVUTORGB asm macros through a single
// base pointer. Those macros hard-code the byte offset shown next to each
// member, so the member order and sizes must stay in sync with them.
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001545struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001546 vec8 kUVToB; // 0: U,V weights for B (UV byte order)
1547 vec8 kUVToG; // 16: U,V weights for G (UV byte order)
1548 vec8 kUVToR; // 32: U,V weights for R (UV byte order)
1549 vec16 kUVBiasB; // 48: bias subtracted from the B sum
1550 vec16 kUVBiasG; // 64: bias subtracted from the G sum
1551 vec16 kUVBiasR; // 80: bias subtracted from the R sum
1552 vec16 kYSub16; // 96: 16, subtracted from Y
1553 vec16 kYToRgb; // 112: YG, multiplier applied to Y - 16
1554 vec8 kVUToB; // 128: V,U weights for B (VU byte order)
1555 vec8 kVUToG; // 144: V,U weights for G (VU byte order)
1556 vec8 kVUToR; // 160: V,U weights for R (VU byte order)
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001557} CONST SIMD_ALIGNED(kYuvConstants) = {
// Initializers follow the member order above.
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001558 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1559 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1560 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1561 { BB, BB, BB, BB, BB, BB, BB, BB },
1562 { BG, BG, BG, BG, BG, BG, BG, BG },
1563 { BR, BR, BR, BR, BR, BR, BR, BR },
1564 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001565 { YG, YG, YG, YG, YG, YG, YG, YG },
1566 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1567 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1568 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001569};
1570
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001571
// Chroma loaders. Each macro leaves 8 U,V byte pairs in the low and high
// halves of xmm0 and advances the chroma pointer. The planar variants address
// V as (%[u_buf],%[v_buf],1), i.e. %[v_buf] must already hold v - u.
// The last line of each macro deliberately has no line continuation, so the
// macro body can never absorb whatever line follows it.

// Read 8 UV from 444
#define READYUV444 \
    "movq       (%[u_buf]),%%xmm0              \n" \
    "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n" \
    "lea        0x8(%[u_buf]),%[u_buf]         \n" \
    "punpcklbw  %%xmm1,%%xmm0                  \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
    "movd       (%[u_buf]),%%xmm0              \n" \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n" \
    "lea        0x4(%[u_buf]),%[u_buf]         \n" \
    "punpcklbw  %%xmm1,%%xmm0                  \n" \
    "punpcklwd  %%xmm0,%%xmm0                  \n"

// Read 2 UV from 411, upsample to 8 UV
// NOTE(review): movd loads 4 bytes from each plane although the pointer only
// advances by 2, so each pass reads 2 bytes beyond the samples it consumes.
#define READYUV411 \
    "movd       (%[u_buf]),%%xmm0              \n" \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n" \
    "lea        0x2(%[u_buf]),%[u_buf]         \n" \
    "punpcklbw  %%xmm1,%%xmm0                  \n" \
    "punpcklwd  %%xmm0,%%xmm0                  \n" \
    "punpckldq  %%xmm0,%%xmm0                  \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
    "movq       (%[uv_buf]),%%xmm0             \n" \
    "lea        0x8(%[uv_buf]),%[uv_buf]       \n" \
    "punpcklwd  %%xmm0,%%xmm0                  \n"

// Convert 8 pixels: 8 UV and 8 Y
// In:  xmm0 = 8 U,V byte pairs, xmm4 = zero (used to widen Y to words),
//      %[y_buf] = luma pointer (advanced by 8),
//      %[kYuvConstants] = base address of the kYuvConstants table.
// Out: xmm0 = 8 B bytes, xmm1 = 8 G bytes, xmm2 = 8 R bytes (each duplicated
//      in both halves by packuswb). xmm3 is used as scratch.
// The last line has no line continuation, so the macro body can never absorb
// whatever line follows it.
#define YUVTORGB \
    "movdqa     %%xmm0,%%xmm1                  \n" \
    "movdqa     %%xmm0,%%xmm2                  \n" \
    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n" \
    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n" \
    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n" \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n" \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n" \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n" \
    "movq       (%[y_buf]),%%xmm3              \n" \
    "lea        0x8(%[y_buf]),%[y_buf]         \n" \
    "punpcklbw  %%xmm4,%%xmm3                  \n" \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n" \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n" \
    "paddsw     %%xmm3,%%xmm0                  \n" \
    "paddsw     %%xmm3,%%xmm1                  \n" \
    "paddsw     %%xmm3,%%xmm2                  \n" \
    "psraw      $0x6,%%xmm0                    \n" \
    "psraw      $0x6,%%xmm1                    \n" \
    "psraw      $0x6,%%xmm2                    \n" \
    "packuswb   %%xmm0,%%xmm0                  \n" \
    "packuswb   %%xmm1,%%xmm1                  \n" \
    "packuswb   %%xmm2,%%xmm2                  \n"

// Convert 8 pixels: 8 VU and 8 Y
// Same contract as YUVTORGB, except that xmm0 holds V,U byte pairs, so the
// chroma weights come from the VU-ordered vectors at offsets 128/144/160.
#define YVUTORGB \
    "movdqa     %%xmm0,%%xmm1                  \n" \
    "movdqa     %%xmm0,%%xmm2                  \n" \
    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n" \
    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n" \
    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n" \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n" \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n" \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n" \
    "movq       (%[y_buf]),%%xmm3              \n" \
    "lea        0x8(%[y_buf]),%[y_buf]         \n" \
    "punpcklbw  %%xmm4,%%xmm3                  \n" \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n" \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n" \
    "paddsw     %%xmm3,%%xmm0                  \n" \
    "paddsw     %%xmm3,%%xmm1                  \n" \
    "paddsw     %%xmm3,%%xmm2                  \n" \
    "psraw      $0x6,%%xmm0                    \n" \
    "psraw      $0x6,%%xmm1                    \n" \
    "psraw      $0x6,%%xmm2                    \n" \
    "packuswb   %%xmm0,%%xmm0                  \n" \
    "packuswb   %%xmm1,%%xmm1                  \n" \
    "packuswb   %%xmm2,%%xmm2                  \n"

// Converts one row of planar Y/U/V to 32-bit pixels, 8 pixels per loop pass,
// using the READYUV444 and YUVTORGB asm fragments defined earlier in the file.
// Each output pixel is written as the four bytes xmm0, xmm1, xmm2, 0xff, i.e.
// the three YUVTORGB results followed by an all-ones alpha byte.
//  - argb_buf is stored with movdqa and therefore must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV444 addresses V relative to u_buf - confirm in the macro.
1652void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001653 const uint8* u_buf,
1654 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001655 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001656 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001657 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001658 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001659 "pcmpeqb %%xmm5,%%xmm5 \n"
1660 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001661 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001662 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001663 READYUV444
1664 YUVTORGB
    // Interleave xmm0/xmm1 and xmm2/alpha bytes, then interleave the 16-bit
    // pairs to form 8 four-byte pixels in xmm0 (first 4) and xmm1 (last 4).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001665 "punpcklbw %%xmm1,%%xmm0 \n"
1666 "punpcklbw %%xmm5,%%xmm2 \n"
1667 "movdqa %%xmm0,%%xmm1 \n"
1668 "punpcklwd %%xmm2,%%xmm0 \n"
1669 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001670 "movdqa %%xmm0,(%[argb_buf]) \n"
1671 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1672 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1673 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001674 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001675 : [y_buf]"+r"(y_buf), // %[y_buf]
1676 [u_buf]"+r"(u_buf), // %[u_buf]
1677 [v_buf]"+r"(v_buf), // %[v_buf]
1678 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1679 [width]"+rm"(width) // %[width]
1680 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001681 : "memory", "cc"
1682#if defined(__SSE2__)
1683 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1684#endif
1685 );
1686}
1687
// Converts one row of planar Y/U/V (read via the READYUV422 fragment) to
// packed 3-byte pixels, 8 pixels (24 bytes) per loop pass.
// The colour channels from YUVTORGB are first woven into 4-byte pixels (the
// fourth byte is a copy of xmm2, not alpha), then the two shuffle masks and
// palignr squeeze them down to 24 contiguous bytes.
//  - rgb24_buf is written with movq + movdqu, so no alignment is required.
//  - The loop body runs at least once and always emits whole groups of
//    8 pixels.
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV422 addresses V relative to u_buf - confirm in the macro.
// NOTE(review): on __APPLE__ the masks are loaded into xmm5/xmm6 by a
// separate asm statement that declares no clobbers; the main loop relies on
// the compiler not touching those registers in between - confirm this is
// still acceptable for the supported toolchains.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001688void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1689 const uint8* u_buf,
1690 const uint8* v_buf,
1691 uint8* rgb24_buf,
1692 int width) {
1693// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
1694#ifdef __APPLE__
1695 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001696 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1697 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1698 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1699 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001700#endif
1701
1702 asm volatile (
1703#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001704 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1705 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001706#endif
1707 "sub %[u_buf],%[v_buf] \n"
1708 "pxor %%xmm4,%%xmm4 \n"
1709 ".p2align 4 \n"
1710 "1: \n"
1711 READYUV422
1712 YUVTORGB
1713 "punpcklbw %%xmm1,%%xmm0 \n"
1714 "punpcklbw %%xmm2,%%xmm2 \n"
1715 "movdqa %%xmm0,%%xmm1 \n"
1716 "punpcklwd %%xmm2,%%xmm0 \n"
1717 "punpckhwd %%xmm2,%%xmm1 \n"
    // Shuffle each half, then palignr forms (xmm1:xmm0) >> 12 bytes so xmm1
    // becomes bytes 12-15 of xmm0 followed by bytes 0-11 of xmm1. Together
    // with the low 8 bytes of xmm0 that is the 24 output bytes.
1718 "pshufb %%xmm5,%%xmm0 \n"
1719 "pshufb %%xmm6,%%xmm1 \n"
1720 "palignr $0xc,%%xmm0,%%xmm1 \n"
1721 "movq %%xmm0,(%[rgb24_buf]) \n"
1722 "movdqu %%xmm1,0x8(%[rgb24_buf]) \n"
1723 "lea 0x18(%[rgb24_buf]),%[rgb24_buf] \n"
1724 "sub $0x8,%[width] \n"
1725 "jg 1b \n"
1726 : [y_buf]"+r"(y_buf), // %[y_buf]
1727 [u_buf]"+r"(u_buf), // %[u_buf]
1728 [v_buf]"+r"(v_buf), // %[v_buf]
1729 [rgb24_buf]"+r"(rgb24_buf), // %[rgb24_buf]
1730 [width]"+rm"(width) // %[width]
1731 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1732#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001733 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1734 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001735#endif
1736 : "memory", "cc"
1737#if defined(__SSE2__)
1738 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1739#endif
1740 );
1741}
1742
// Converts one row of planar Y/U/V (read via the READYUV422 fragment) to
// packed 3-byte pixels, 8 pixels (24 bytes) per loop pass. The code is the
// same as I422ToRGB24Row_SSSE3 except that it uses the
// kShuffleMaskARGBToRAW_0 / kShuffleMaskARGBToRAW shuffle tables, which
// decide the byte order of the output (tables are defined elsewhere).
//  - raw_buf is written with movq + movdqu, so no alignment is required.
//  - The loop body runs at least once and always emits whole groups of
//    8 pixels.
// NOTE(review): on __APPLE__ the masks are loaded into xmm5/xmm6 by a
// separate asm statement that declares no clobbers; the main loop relies on
// the compiler not touching those registers in between - confirm this is
// still acceptable for the supported toolchains.
1743void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1744 const uint8* u_buf,
1745 const uint8* v_buf,
1746 uint8* raw_buf,
1747 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001748// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001749#ifdef __APPLE__
1750 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001751 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1752 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1753 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1754 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001755#endif
1756
1757 asm volatile (
1758#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001759 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1760 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001761#endif
1762 "sub %[u_buf],%[v_buf] \n"
1763 "pxor %%xmm4,%%xmm4 \n"
1764 ".p2align 4 \n"
1765 "1: \n"
1766 READYUV422
1767 YUVTORGB
1768 "punpcklbw %%xmm1,%%xmm0 \n"
1769 "punpcklbw %%xmm2,%%xmm2 \n"
1770 "movdqa %%xmm0,%%xmm1 \n"
1771 "punpcklwd %%xmm2,%%xmm0 \n"
1772 "punpckhwd %%xmm2,%%xmm1 \n"
    // Shuffle each half, then palignr forms (xmm1:xmm0) >> 12 bytes so xmm1
    // becomes bytes 12-15 of xmm0 followed by bytes 0-11 of xmm1. Together
    // with the low 8 bytes of xmm0 that is the 24 output bytes.
1773 "pshufb %%xmm5,%%xmm0 \n"
1774 "pshufb %%xmm6,%%xmm1 \n"
1775 "palignr $0xc,%%xmm0,%%xmm1 \n"
1776 "movq %%xmm0,(%[raw_buf]) \n"
1777 "movdqu %%xmm1,0x8(%[raw_buf]) \n"
1778 "lea 0x18(%[raw_buf]),%[raw_buf] \n"
1779 "sub $0x8,%[width] \n"
1780 "jg 1b \n"
1781 : [y_buf]"+r"(y_buf), // %[y_buf]
1782 [u_buf]"+r"(u_buf), // %[u_buf]
1783 [v_buf]"+r"(v_buf), // %[v_buf]
1784 [raw_buf]"+r"(raw_buf), // %[raw_buf]
1785 [width]"+rm"(width) // %[width]
1786 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1787#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001788 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1789 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001790#endif
1791 : "memory", "cc"
1792#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001793 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001794#endif
1795 );
1796}
1797
// Converts one row of planar Y/U/V (read via the READYUV422 fragment) to
// 32-bit pixels, 8 pixels per loop pass. Each output pixel is the four bytes
// xmm0, xmm1, xmm2, 0xff: the three YUVTORGB results plus an all-ones alpha.
//  - argb_buf is stored with movdqa and therefore must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV422 addresses V relative to u_buf - confirm in the macro.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001798void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001799 const uint8* u_buf,
1800 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001801 uint8* argb_buf,
fbarchard@google.comdbcabea2012-10-29 21:20:25 +00001802 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001803 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001804 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001805 "pcmpeqb %%xmm5,%%xmm5 \n"
1806 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001807 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001808 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001809 READYUV422
1810 YUVTORGB
    // Weave the three channels and alpha into 8 four-byte pixels:
    // xmm0 receives pixels 0-3, xmm1 pixels 4-7.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001811 "punpcklbw %%xmm1,%%xmm0 \n"
1812 "punpcklbw %%xmm5,%%xmm2 \n"
1813 "movdqa %%xmm0,%%xmm1 \n"
1814 "punpcklwd %%xmm2,%%xmm0 \n"
1815 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001816 "movdqa %%xmm0,(%[argb_buf]) \n"
1817 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1818 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1819 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001820 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001821 : [y_buf]"+r"(y_buf), // %[y_buf]
1822 [u_buf]"+r"(u_buf), // %[u_buf]
1823 [v_buf]"+r"(v_buf), // %[v_buf]
1824 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1825 [width]"+rm"(width) // %[width]
1826 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001827 : "memory", "cc"
1828#if defined(__SSE2__)
1829 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1830#endif
1831 );
1832}
1833
// Converts one row of planar Y/U/V (read via the READYUV411 fragment) to
// 32-bit pixels, 8 pixels per loop pass. Each output pixel is the four bytes
// xmm0, xmm1, xmm2, 0xff: the three YUVTORGB results plus an all-ones alpha.
//  - argb_buf is stored with movdqa and therefore must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV411 addresses V relative to u_buf - confirm in the macro.
1834void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1835 const uint8* u_buf,
1836 const uint8* v_buf,
1837 uint8* argb_buf,
1838 int width) {
1839 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001840 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001841 "pcmpeqb %%xmm5,%%xmm5 \n"
1842 "pxor %%xmm4,%%xmm4 \n"
1843 ".p2align 4 \n"
1844 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001845 READYUV411
1846 YUVTORGB
    // Weave the three channels and alpha into 8 four-byte pixels:
    // xmm0 receives pixels 0-3, xmm1 pixels 4-7.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001847 "punpcklbw %%xmm1,%%xmm0 \n"
1848 "punpcklbw %%xmm5,%%xmm2 \n"
1849 "movdqa %%xmm0,%%xmm1 \n"
1850 "punpcklwd %%xmm2,%%xmm0 \n"
1851 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001852 "movdqa %%xmm0,(%[argb_buf]) \n"
1853 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1854 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1855 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001856 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001857 : [y_buf]"+r"(y_buf), // %[y_buf]
1858 [u_buf]"+r"(u_buf), // %[u_buf]
1859 [v_buf]"+r"(v_buf), // %[v_buf]
1860 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1861 [width]"+rm"(width) // %[width]
1862 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1863 : "memory", "cc"
1864#if defined(__SSE2__)
1865 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1866#endif
1867 );
1868}
1869
// Converts one row of Y plus an interleaved chroma buffer (read via the
// READNV12 fragment) to 32-bit pixels, 8 pixels per loop pass. Each output
// pixel is the four bytes xmm0, xmm1, xmm2, 0xff: the three YUVTORGB results
// plus an all-ones alpha.
//  - argb_buf is stored with movdqa and therefore must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
1870void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1871 const uint8* uv_buf,
1872 uint8* argb_buf,
1873 int width) {
1874 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
1875 "pcmpeqb %%xmm5,%%xmm5 \n"
1876 "pxor %%xmm4,%%xmm4 \n"
1877 ".p2align 4 \n"
1878 "1: \n"
1879 READNV12
1880 YUVTORGB
    // Weave the three channels and alpha into 8 four-byte pixels:
    // xmm0 receives pixels 0-3, xmm1 pixels 4-7.
1881 "punpcklbw %%xmm1,%%xmm0 \n"
1882 "punpcklbw %%xmm5,%%xmm2 \n"
1883 "movdqa %%xmm0,%%xmm1 \n"
1884 "punpcklwd %%xmm2,%%xmm0 \n"
1885 "punpckhwd %%xmm2,%%xmm1 \n"
1886 "movdqa %%xmm0,(%[argb_buf]) \n"
1887 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1888 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1889 "sub $0x8,%[width] \n"
1890 "jg 1b \n"
1891 : [y_buf]"+r"(y_buf), // %[y_buf]
1892 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1893 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1894 [width]"+rm"(width) // %[width]
1895 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1896 : "memory", "cc"
1897#if defined(__SSE2__)
1898 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1899#endif
1900 );
1901}
1902
// Converts one row of Y plus an interleaved VU chroma buffer to 32-bit
// pixels, 8 pixels per loop pass. It reuses the READNV12 loader (vu_buf is
// bound to the %[uv_buf] operand) and accounts for the swapped chroma order
// by running YVUTORGB instead of YUVTORGB. Each output pixel is the four
// bytes xmm0, xmm1, xmm2, 0xff.
//  - argb_buf is stored with movdqa and therefore must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
1903void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1904 const uint8* vu_buf,
1905 uint8* argb_buf,
1906 int width) {
1907 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YVUTORGB to widen Y).
1908 "pcmpeqb %%xmm5,%%xmm5 \n"
1909 "pxor %%xmm4,%%xmm4 \n"
1910 ".p2align 4 \n"
1911 "1: \n"
1912 READNV12
1913 YVUTORGB
    // Weave the three channels and alpha into 8 four-byte pixels:
    // xmm0 receives pixels 0-3, xmm1 pixels 4-7.
1914 "punpcklbw %%xmm1,%%xmm0 \n"
1915 "punpcklbw %%xmm5,%%xmm2 \n"
1916 "movdqa %%xmm0,%%xmm1 \n"
1917 "punpcklwd %%xmm2,%%xmm0 \n"
1918 "punpckhwd %%xmm2,%%xmm1 \n"
1919 "movdqa %%xmm0,(%[argb_buf]) \n"
1920 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1921 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1922 "sub $0x8,%[width] \n"
1923 "jg 1b \n"
1924 : [y_buf]"+r"(y_buf), // %[y_buf]
1925 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1926 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1927 [width]"+rm"(width) // %[width]
1928 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001929 : "memory", "cc"
1930#if defined(__SSE2__)
1931 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1932#endif
1933 );
1934}
1935
// Same conversion as I444ToARGBRow_SSSE3 (READYUV444 + YUVTORGB, 8 pixels per
// loop pass, output bytes xmm0, xmm1, xmm2, 0xff per pixel), but the output
// is stored with movdqu so argb_buf needs no particular alignment.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV444 addresses V relative to u_buf - confirm in the macro.
1936void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1937 const uint8* u_buf,
1938 const uint8* v_buf,
1939 uint8* argb_buf,
1940 int width) {
1941 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001942 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001943 "pcmpeqb %%xmm5,%%xmm5 \n"
1944 "pxor %%xmm4,%%xmm4 \n"
1945 ".p2align 4 \n"
1946 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001947 READYUV444
1948 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001949 "punpcklbw %%xmm1,%%xmm0 \n"
1950 "punpcklbw %%xmm5,%%xmm2 \n"
1951 "movdqa %%xmm0,%%xmm1 \n"
1952 "punpcklwd %%xmm2,%%xmm0 \n"
1953 "punpckhwd %%xmm2,%%xmm1 \n"
    // Unaligned stores: this is the only difference from the aligned variant.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001954 "movdqu %%xmm0,(%[argb_buf]) \n"
1955 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1956 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1957 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001958 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001959 : [y_buf]"+r"(y_buf), // %[y_buf]
1960 [u_buf]"+r"(u_buf), // %[u_buf]
1961 [v_buf]"+r"(v_buf), // %[v_buf]
1962 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1963 [width]"+rm"(width) // %[width]
1964 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001965 : "memory", "cc"
1966#if defined(__SSE2__)
1967 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1968#endif
1969 );
1970}
1971
// Same conversion as I422ToARGBRow_SSSE3 (READYUV422 + YUVTORGB, 8 pixels per
// loop pass, output bytes xmm0, xmm1, xmm2, 0xff per pixel), but the output
// is stored with movdqu so argb_buf needs no particular alignment.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV422 addresses V relative to u_buf - confirm in the macro.
1972void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1973 const uint8* u_buf,
1974 const uint8* v_buf,
1975 uint8* argb_buf,
1976 int width) {
1977 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001978 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001979 "pcmpeqb %%xmm5,%%xmm5 \n"
1980 "pxor %%xmm4,%%xmm4 \n"
1981 ".p2align 4 \n"
1982 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001983 READYUV422
1984 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001985 "punpcklbw %%xmm1,%%xmm0 \n"
1986 "punpcklbw %%xmm5,%%xmm2 \n"
1987 "movdqa %%xmm0,%%xmm1 \n"
1988 "punpcklwd %%xmm2,%%xmm0 \n"
1989 "punpckhwd %%xmm2,%%xmm1 \n"
    // Unaligned stores: this is the only difference from the aligned variant.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001990 "movdqu %%xmm0,(%[argb_buf]) \n"
1991 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1992 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1993 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001994 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001995 : [y_buf]"+r"(y_buf), // %[y_buf]
1996 [u_buf]"+r"(u_buf), // %[u_buf]
1997 [v_buf]"+r"(v_buf), // %[v_buf]
1998 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1999 [width]"+rm"(width) // %[width]
2000 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002001 : "memory", "cc"
2002#if defined(__SSE2__)
2003 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2004#endif
2005 );
2006}
2007
// Same conversion as I411ToARGBRow_SSSE3 (READYUV411 + YUVTORGB, 8 pixels per
// loop pass, output bytes xmm0, xmm1, xmm2, 0xff per pixel), but the output
// is stored with movdqu so argb_buf needs no particular alignment.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV411 addresses V relative to u_buf - confirm in the macro.
2008void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2009 const uint8* u_buf,
2010 const uint8* v_buf,
2011 uint8* argb_buf,
2012 int width) {
2013 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002014 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002015 "pcmpeqb %%xmm5,%%xmm5 \n"
2016 "pxor %%xmm4,%%xmm4 \n"
2017 ".p2align 4 \n"
2018 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002019 READYUV411
2020 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002021 "punpcklbw %%xmm1,%%xmm0 \n"
2022 "punpcklbw %%xmm5,%%xmm2 \n"
2023 "movdqa %%xmm0,%%xmm1 \n"
2024 "punpcklwd %%xmm2,%%xmm0 \n"
2025 "punpckhwd %%xmm2,%%xmm1 \n"
    // Unaligned stores: this is the only difference from the aligned variant.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002026 "movdqu %%xmm0,(%[argb_buf]) \n"
2027 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2028 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2029 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002030 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002031 : [y_buf]"+r"(y_buf), // %[y_buf]
2032 [u_buf]"+r"(u_buf), // %[u_buf]
2033 [v_buf]"+r"(v_buf), // %[v_buf]
2034 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2035 [width]"+rm"(width) // %[width]
2036 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2037 : "memory", "cc"
2038#if defined(__SSE2__)
2039 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2040#endif
2041 );
2042}
2043
// Same conversion as NV12ToARGBRow_SSSE3 (READNV12 + YUVTORGB, 8 pixels per
// loop pass, output bytes xmm0, xmm1, xmm2, 0xff per pixel), but the output
// is stored with movdqu so argb_buf needs no particular alignment.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
2044void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2045 const uint8* uv_buf,
2046 uint8* argb_buf,
2047 int width) {
2048 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
2049 "pcmpeqb %%xmm5,%%xmm5 \n"
2050 "pxor %%xmm4,%%xmm4 \n"
2051 ".p2align 4 \n"
2052 "1: \n"
2053 READNV12
2054 YUVTORGB
2055 "punpcklbw %%xmm1,%%xmm0 \n"
2056 "punpcklbw %%xmm5,%%xmm2 \n"
2057 "movdqa %%xmm0,%%xmm1 \n"
2058 "punpcklwd %%xmm2,%%xmm0 \n"
2059 "punpckhwd %%xmm2,%%xmm1 \n"
    // Unaligned stores: this is the only difference from the aligned variant.
2060 "movdqu %%xmm0,(%[argb_buf]) \n"
2061 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2062 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2063 "sub $0x8,%[width] \n"
2064 "jg 1b \n"
2065 : [y_buf]"+r"(y_buf), // %[y_buf]
2066 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2067 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2068 [width]"+rm"(width) // %[width]
2069 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2070 : "memory", "cc"
2071#if defined(__SSE2__)
2072 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2073#endif
2074 );
2075}
2076
// Same conversion as NV21ToARGBRow_SSSE3 (READNV12 loader with vu_buf bound
// to %[uv_buf], followed by YVUTORGB; 8 pixels per loop pass, output bytes
// xmm0, xmm1, xmm2, 0xff per pixel), but the output is stored with movdqu so
// argb_buf needs no particular alignment.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
2077void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2078 const uint8* vu_buf,
2079 uint8* argb_buf,
2080 int width) {
2081 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YVUTORGB to widen Y).
2082 "pcmpeqb %%xmm5,%%xmm5 \n"
2083 "pxor %%xmm4,%%xmm4 \n"
2084 ".p2align 4 \n"
2085 "1: \n"
2086 READNV12
2087 YVUTORGB
2088 "punpcklbw %%xmm1,%%xmm0 \n"
2089 "punpcklbw %%xmm5,%%xmm2 \n"
2090 "movdqa %%xmm0,%%xmm1 \n"
2091 "punpcklwd %%xmm2,%%xmm0 \n"
2092 "punpckhwd %%xmm2,%%xmm1 \n"
    // Unaligned stores: this is the only difference from the aligned variant.
2093 "movdqu %%xmm0,(%[argb_buf]) \n"
2094 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2095 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2096 "sub $0x8,%[width] \n"
2097 "jg 1b \n"
2098 : [y_buf]"+r"(y_buf), // %[y_buf]
2099 [uv_buf]"+r"(vu_buf), // %[uv_buf]
2100 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2101 [width]"+rm"(width) // %[width]
2102 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002103 : "memory", "cc"
2104#if defined(__SSE2__)
2105 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2106#endif
2107 );
2108}
2109
// Converts one row of planar Y/U/V (read via the READYUV422 fragment) to
// 32-bit pixels, 8 pixels per loop pass. Each output pixel is written as the
// four bytes 0xff, xmm2, xmm1, xmm0, i.e. the reverse of the byte order that
// the ...ToARGBRow functions in this file produce.
//  - bgra_buf is bound to the operand named %[argb_buf] and is stored with
//    movdqa, so it must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV422 addresses V relative to u_buf - confirm in the macro.
2110void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2111 const uint8* u_buf,
2112 const uint8* v_buf,
2113 uint8* bgra_buf,
2114 int width) {
2115 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002116 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002117 "pcmpeqb %%xmm5,%%xmm5 \n"
2118 "pxor %%xmm4,%%xmm4 \n"
2119 ".p2align 4 \n"
2120 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002121 READYUV422
2122 YUVTORGB
    // xmm5 is the destination of the interleave below and is overwritten on
    // every pass, so the all-ones alpha value is regenerated inside the loop.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002123 "pcmpeqb %%xmm5,%%xmm5 \n"
2124 "punpcklbw %%xmm0,%%xmm1 \n"
2125 "punpcklbw %%xmm2,%%xmm5 \n"
2126 "movdqa %%xmm5,%%xmm0 \n"
2127 "punpcklwd %%xmm1,%%xmm5 \n"
2128 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002129 "movdqa %%xmm5,(%[argb_buf]) \n"
2130 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2131 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2132 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002133 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002134 : [y_buf]"+r"(y_buf), // %[y_buf]
2135 [u_buf]"+r"(u_buf), // %[u_buf]
2136 [v_buf]"+r"(v_buf), // %[v_buf]
2137 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2138 [width]"+rm"(width) // %[width]
2139 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002140 : "memory", "cc"
2141#if defined(__SSE2__)
2142 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2143#endif
2144 );
2145}
2146
// Converts one row of planar Y/U/V (read via the READYUV422 fragment) to
// 32-bit pixels, 8 pixels per loop pass. Each output pixel is written as the
// four bytes xmm2, xmm1, xmm0, 0xff, i.e. the first and third colour bytes
// are swapped relative to the ...ToARGBRow functions in this file.
//  - abgr_buf is bound to the operand named %[argb_buf] and is stored with
//    movdqa, so it must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV422 addresses V relative to u_buf - confirm in the macro.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002147void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002148 const uint8* u_buf,
2149 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002150 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002151 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002152 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002153 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002154 "pcmpeqb %%xmm5,%%xmm5 \n"
2155 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002156 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002157 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002158 READYUV422
2159 YUVTORGB
    // xmm2/xmm1 form the first byte pair and xmm0/alpha the second; xmm5 is
    // only a source here, so it does not need to be reloaded per pass.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002160 "punpcklbw %%xmm1,%%xmm2 \n"
2161 "punpcklbw %%xmm5,%%xmm0 \n"
2162 "movdqa %%xmm2,%%xmm1 \n"
2163 "punpcklwd %%xmm0,%%xmm2 \n"
2164 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002165 "movdqa %%xmm2,(%[argb_buf]) \n"
2166 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
2167 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2168 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002169 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002170 : [y_buf]"+r"(y_buf), // %[y_buf]
2171 [u_buf]"+r"(u_buf), // %[u_buf]
2172 [v_buf]"+r"(v_buf), // %[v_buf]
2173 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2174 [width]"+rm"(width) // %[width]
2175 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002176 : "memory", "cc"
2177#if defined(__SSE2__)
2178 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2179#endif
2180 );
2181}
2182
// Converts one row of planar Y/U/V (read via the READYUV422 fragment) to
// 32-bit pixels, 8 pixels per loop pass. Each output pixel is written as the
// four bytes 0xff, xmm0, xmm1, xmm2, i.e. the alpha byte comes first and is
// followed by the three YUVTORGB results in register order.
//  - rgba_buf is bound to the operand named %[argb_buf] and is stored with
//    movdqa, so it must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV422 addresses V relative to u_buf - confirm in the macro.
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002183void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2184 const uint8* u_buf,
2185 const uint8* v_buf,
2186 uint8* rgba_buf,
2187 int width) {
2188 asm volatile (
2189 "sub %[u_buf],%[v_buf] \n"
2190 "pcmpeqb %%xmm5,%%xmm5 \n"
2191 "pxor %%xmm4,%%xmm4 \n"
2192 ".p2align 4 \n"
2193 "1: \n"
2194 READYUV422
2195 YUVTORGB
    // xmm5 is the destination of the interleave below and is overwritten on
    // every pass, so the all-ones alpha value is regenerated inside the loop.
2196 "pcmpeqb %%xmm5,%%xmm5 \n"
2197 "punpcklbw %%xmm2,%%xmm1 \n"
2198 "punpcklbw %%xmm0,%%xmm5 \n"
2199 "movdqa %%xmm5,%%xmm0 \n"
2200 "punpcklwd %%xmm1,%%xmm5 \n"
2201 "punpckhwd %%xmm1,%%xmm0 \n"
2202 "movdqa %%xmm5,(%[argb_buf]) \n"
2203 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2204 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2205 "sub $0x8,%[width] \n"
2206 "jg 1b \n"
2207 : [y_buf]"+r"(y_buf), // %[y_buf]
2208 [u_buf]"+r"(u_buf), // %[u_buf]
2209 [v_buf]"+r"(v_buf), // %[v_buf]
2210 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2211 [width]"+rm"(width) // %[width]
2212 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2213 : "memory", "cc"
2214#if defined(__SSE2__)
2215 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2216#endif
2217 );
2218}
2219
// Same conversion as I422ToBGRARow_SSSE3 (READYUV422 + YUVTORGB, 8 pixels per
// loop pass, output bytes 0xff, xmm2, xmm1, xmm0 per pixel), but the output
// is stored with movdqu so bgra_buf needs no particular alignment.
//  - bgra_buf is bound to the operand named %[argb_buf].
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV422 addresses V relative to u_buf - confirm in the macro.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002220void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002221 const uint8* u_buf,
2222 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002223 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002224 int width) {
2225 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002226 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002227 "pcmpeqb %%xmm5,%%xmm5 \n"
2228 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002229 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002230 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002231 READYUV422
2232 YUVTORGB
    // xmm5 is the destination of the interleave below and is overwritten on
    // every pass, so the all-ones alpha value is regenerated inside the loop.
fbarchard@google.com952a5072012-03-30 18:10:50 +00002233 "pcmpeqb %%xmm5,%%xmm5 \n"
2234 "punpcklbw %%xmm0,%%xmm1 \n"
2235 "punpcklbw %%xmm2,%%xmm5 \n"
2236 "movdqa %%xmm5,%%xmm0 \n"
2237 "punpcklwd %%xmm1,%%xmm5 \n"
2238 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002239 "movdqu %%xmm5,(%[argb_buf]) \n"
2240 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
2241 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2242 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002243 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002244 : [y_buf]"+r"(y_buf), // %[y_buf]
2245 [u_buf]"+r"(u_buf), // %[u_buf]
2246 [v_buf]"+r"(v_buf), // %[v_buf]
2247 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2248 [width]"+rm"(width) // %[width]
2249 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002250 : "memory", "cc"
2251#if defined(__SSE2__)
2252 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2253#endif
2254 );
2255}
2256
// Same conversion as I422ToABGRRow_SSSE3 (READYUV422 + YUVTORGB, 8 pixels per
// loop pass, output bytes xmm2, xmm1, xmm0, 0xff per pixel), but the output
// is stored with movdqu so abgr_buf needs no particular alignment.
//  - abgr_buf is bound to the operand named %[argb_buf].
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - v_buf is replaced by (v_buf - u_buf) before the loop; presumably
//    READYUV422 addresses V relative to u_buf - confirm in the macro.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002257void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002258 const uint8* u_buf,
2259 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002260 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002261 int width) {
2262 asm volatile (
    // xmm5 = all ones (alpha bytes), xmm4 = zero (used by YUVTORGB to widen Y).
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002263 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002264 "pcmpeqb %%xmm5,%%xmm5 \n"
2265 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002266 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002267 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002268 READYUV422
2269 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002270 "punpcklbw %%xmm1,%%xmm2 \n"
2271 "punpcklbw %%xmm5,%%xmm0 \n"
2272 "movdqa %%xmm2,%%xmm1 \n"
2273 "punpcklwd %%xmm0,%%xmm2 \n"
2274 "punpckhwd %%xmm0,%%xmm1 \n"
    // Unaligned stores: this is the only difference from the aligned variant.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002275 "movdqu %%xmm2,(%[argb_buf]) \n"
2276 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2277 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2278 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002279 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002280 : [y_buf]"+r"(y_buf), // %[y_buf]
2281 [u_buf]"+r"(u_buf), // %[u_buf]
2282 [v_buf]"+r"(v_buf), // %[v_buf]
2283 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2284 [width]"+rm"(width) // %[width]
2285 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002286 : "memory", "cc"
2287#if defined(__SSE2__)
2288 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2289#endif
2290 );
2291}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002292
2293void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2294 const uint8* u_buf,
2295 const uint8* v_buf,
2296 uint8* rgba_buf,
2297 int width) {
2298 asm volatile (
2299 "sub %[u_buf],%[v_buf] \n"
2300 "pcmpeqb %%xmm5,%%xmm5 \n"
2301 "pxor %%xmm4,%%xmm4 \n"
2302 ".p2align 4 \n"
2303 "1: \n"
2304 READYUV422
2305 YUVTORGB
2306 "pcmpeqb %%xmm5,%%xmm5 \n"
2307 "punpcklbw %%xmm2,%%xmm1 \n"
2308 "punpcklbw %%xmm0,%%xmm5 \n"
2309 "movdqa %%xmm5,%%xmm0 \n"
2310 "punpcklwd %%xmm1,%%xmm5 \n"
2311 "punpckhwd %%xmm1,%%xmm0 \n"
2312 "movdqa %%xmm5,(%[argb_buf]) \n"
2313 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2314 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2315 "sub $0x8,%[width] \n"
2316 "jg 1b \n"
2317 : [y_buf]"+r"(y_buf), // %[y_buf]
2318 [u_buf]"+r"(u_buf), // %[u_buf]
2319 [v_buf]"+r"(v_buf), // %[v_buf]
2320 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2321 [width]"+rm"(width) // %[width]
2322 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2323 : "memory", "cc"
2324#if defined(__SSE2__)
2325 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2326#endif
2327 );
2328}
2329
fbarchard@google.come214fe32012-06-04 23:47:11 +00002330#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002331
2332#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of Y bytes to 32-bit grey pixels, 8 pixels per loop pass.
// The scaled Y value is replicated into the three low bytes of every pixel
// and the top byte is forced to 0xff.
//  - rgb_buf is stored with movdqa and therefore must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and output is always written in whole groups of 8 pixels (32 bytes).
//  - Uses eax as a scratch register to build the constants (listed as a
//    clobber).
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002333void YToARGBRow_SSE2(const uint8* y_buf,
2334 uint8* rgb_buf,
2335 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002336 asm volatile (
    // xmm4 = 0xff000000 in every dword (alpha mask).
    // xmm3 = 0x1000 in every word: 16 in the high byte, matching the
    //        y * 0x101 word that "punpcklbw xmm0,xmm0" produces below.
    // xmm2 = 0x012a (298) in every word: multiplier used with pmulhuw.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002337 "pcmpeqb %%xmm4,%%xmm4 \n"
2338 "pslld $0x18,%%xmm4 \n"
2339 "mov $0x10001000,%%eax \n"
2340 "movd %%eax,%%xmm3 \n"
2341 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2342 "mov $0x012a012a,%%eax \n"
2343 "movd %%eax,%%xmm2 \n"
2344 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002345 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002346 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002347 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    // psubusw saturates at zero, pmulhuw keeps the high 16 bits of the
    // unsigned product and packuswb saturates the result to 0..255.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002348 "movq (%0),%%xmm0 \n"
2349 "lea 0x8(%0),%0 \n"
2350 "punpcklbw %%xmm0,%%xmm0 \n"
2351 "psubusw %%xmm3,%%xmm0 \n"
2352 "pmulhuw %%xmm2,%%xmm0 \n"
2353 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002354
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002355 // Step 2: Weave into ARGB
    // Each grey byte is duplicated to 4 bytes, then the alpha mask is OR-ed in.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002356 "punpcklbw %%xmm0,%%xmm0 \n"
2357 "movdqa %%xmm0,%%xmm1 \n"
2358 "punpcklwd %%xmm0,%%xmm0 \n"
2359 "punpckhwd %%xmm1,%%xmm1 \n"
2360 "por %%xmm4,%%xmm0 \n"
2361 "por %%xmm4,%%xmm1 \n"
2362 "movdqa %%xmm0,(%1) \n"
2363 "movdqa %%xmm1,16(%1) \n"
2364 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002365
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002366 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002367 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00002368 : "+r"(y_buf), // %0
2369 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00002370 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002371 :
2372 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002373#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002374 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002375#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002376 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002377}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002378#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002379
fbarchard@google.com42831e02012-01-21 02:54:17 +00002380#ifdef HAS_MIRRORROW_SSSE3
// kShuffleMirror is a pshufb control that maps output byte i to input byte
// 15 - i, i.e. it reverses the 16 bytes of an XMM register.
//
// MirrorRow_SSSE3 writes the bytes of src to dst in reverse order, 16 bytes
// per loop pass: it reads the last not-yet-processed 16 bytes of src,
// reverses them with pshufb and appends them to dst.
//  - Both the loads and the stores use movdqa, so dst and every address
//    src + k*16 that is read must be 16-byte aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and 16 bytes are handled per pass.
//  - width is widened to intptr_t because it is used as an index register in
//    the (%0,%2) address.
fbarchard@google.com12d04832011-11-21 23:54:38 +00002381// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002382CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002383 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2384};
2385
fbarchard@google.com42831e02012-01-21 02:54:17 +00002386void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002387 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002388 asm volatile (
    // src is biased by -16 so that (src + remaining width) addresses the
    // last unread 16-byte group.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002389 "movdqa %3,%%xmm5 \n"
2390 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002391 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002392 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002393 "movdqa (%0,%2),%%xmm0 \n"
2394 "pshufb %%xmm5,%%xmm0 \n"
    // The flags tested by jg come from this sub; the movdqa and lea that
    // follow it do not modify flags.
2395 "sub $0x10,%2 \n"
2396 "movdqa %%xmm0,(%1) \n"
2397 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002398 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002399 : "+r"(src), // %0
2400 "+r"(dst), // %1
2401 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002402 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002403 : "memory", "cc"
2404#if defined(__SSE2__)
2405 , "xmm0", "xmm5"
2406#endif
2407 );
2408}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002409#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002410
fbarchard@google.com42831e02012-01-21 02:54:17 +00002411#ifdef HAS_MIRRORROW_SSE2
// SSE2 version of the byte-wise row mirror: writes the bytes of src to dst in
// reverse order, 16 bytes per loop pass, without using pshufb.
//  - Loads and stores use movdqu, so neither buffer needs to be aligned.
//  - The loop condition is tested after the body, so the body runs at least
//    once and 16 bytes are handled per pass.
//  - width is widened to intptr_t because it is used as an index register in
//    the (%0,%2) address.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002412void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002413 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002414 asm volatile (
    // src is biased by -16 so that (src + remaining width) addresses the
    // last unread 16-byte group.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002415 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002416 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002417 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002418 "movdqu (%0,%2),%%xmm0 \n"
    // Reverse 16 bytes in four steps: swap the two bytes of every word
    // (shift left/right by 8 and OR), reverse the 4 words of the low and of
    // the high half (0x1b), then swap the two 64-bit halves (0x4e).
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002419 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002420 "psllw $0x8,%%xmm0 \n"
2421 "psrlw $0x8,%%xmm1 \n"
2422 "por %%xmm1,%%xmm0 \n"
2423 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2424 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2425 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
    // The flags tested by jg come from this sub; the movdqu and lea that
    // follow it do not modify flags.
2426 "sub $0x10,%2 \n"
2427 "movdqu %%xmm0,(%1) \n"
2428 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002429 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002430 : "+r"(src), // %0
2431 "+r"(dst), // %1
2432 "+r"(temp_width) // %2
2433 :
2434 : "memory", "cc"
2435#if defined(__SSE2__)
2436 , "xmm0", "xmm1"
2437#endif
2438 );
2439}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002440#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002441
fbarchard@google.com16a96642012-03-02 22:38:09 +00002442#ifdef HAS_MIRRORROW_UV_SSSE3
2443// Shuffle table for reversing the bytes of UV channels.
2444CONST uvec8 kShuffleMirrorUV = {
2445 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2446};
2447void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2448 int width) {
2449 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002450 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002451 "movdqa %4,%%xmm1 \n"
2452 "lea -16(%0,%3,2),%0 \n"
2453 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002454 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002455 "1: \n"
2456 "movdqa (%0),%%xmm0 \n"
2457 "lea -16(%0),%0 \n"
2458 "pshufb %%xmm1,%%xmm0 \n"
2459 "sub $8,%3 \n"
2460 "movlpd %%xmm0,(%1) \n"
2461 "movhpd %%xmm0,(%1,%2) \n"
2462 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002463 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002464 : "+r"(src), // %0
2465 "+r"(dst_u), // %1
2466 "+r"(dst_v), // %2
2467 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002468 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002469 : "memory", "cc"
2470#if defined(__SSE2__)
2471 , "xmm0", "xmm1"
2472#endif
2473 );
2474}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002475#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002476
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002477#ifdef HAS_ARGBMIRRORROW_SSSE3
2478// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002479CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002480 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2481};
2482
2483void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2484 intptr_t temp_width = static_cast<intptr_t>(width);
2485 asm volatile (
2486 "movdqa %3,%%xmm5 \n"
2487 "lea -0x10(%0),%0 \n"
2488 ".p2align 4 \n"
2489 "1: \n"
2490 "movdqa (%0,%2,4),%%xmm0 \n"
2491 "pshufb %%xmm5,%%xmm0 \n"
2492 "sub $0x4,%2 \n"
2493 "movdqa %%xmm0,(%1) \n"
2494 "lea 0x10(%1),%1 \n"
2495 "jg 1b \n"
2496 : "+r"(src), // %0
2497 "+r"(dst), // %1
2498 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002499 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002500 : "memory", "cc"
2501#if defined(__SSE2__)
2502 , "xmm0", "xmm5"
2503#endif
2504 );
2505}
2506#endif // HAS_ARGBMIRRORROW_SSSE3
2507
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002508#ifdef HAS_SPLITUV_SSE2
2509void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002510 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002511 "pcmpeqb %%xmm5,%%xmm5 \n"
2512 "psrlw $0x8,%%xmm5 \n"
2513 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002514 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002515 "1: \n"
2516 "movdqa (%0),%%xmm0 \n"
2517 "movdqa 0x10(%0),%%xmm1 \n"
2518 "lea 0x20(%0),%0 \n"
2519 "movdqa %%xmm0,%%xmm2 \n"
2520 "movdqa %%xmm1,%%xmm3 \n"
2521 "pand %%xmm5,%%xmm0 \n"
2522 "pand %%xmm5,%%xmm1 \n"
2523 "packuswb %%xmm1,%%xmm0 \n"
2524 "psrlw $0x8,%%xmm2 \n"
2525 "psrlw $0x8,%%xmm3 \n"
2526 "packuswb %%xmm3,%%xmm2 \n"
2527 "movdqa %%xmm0,(%1) \n"
2528 "movdqa %%xmm2,(%1,%2) \n"
2529 "lea 0x10(%1),%1 \n"
2530 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002531 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002532 : "+r"(src_uv), // %0
2533 "+r"(dst_u), // %1
2534 "+r"(dst_v), // %2
2535 "+r"(pix) // %3
2536 :
2537 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002538#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002539 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002540#endif
2541 );
2542}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002543
2544void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2545 int pix) {
2546 asm volatile (
2547 "pcmpeqb %%xmm5,%%xmm5 \n"
2548 "psrlw $0x8,%%xmm5 \n"
2549 "sub %1,%2 \n"
2550 ".p2align 4 \n"
2551 "1: \n"
2552 "movdqu (%0),%%xmm0 \n"
2553 "movdqu 0x10(%0),%%xmm1 \n"
2554 "lea 0x20(%0),%0 \n"
2555 "movdqa %%xmm0,%%xmm2 \n"
2556 "movdqa %%xmm1,%%xmm3 \n"
2557 "pand %%xmm5,%%xmm0 \n"
2558 "pand %%xmm5,%%xmm1 \n"
2559 "packuswb %%xmm1,%%xmm0 \n"
2560 "psrlw $0x8,%%xmm2 \n"
2561 "psrlw $0x8,%%xmm3 \n"
2562 "packuswb %%xmm3,%%xmm2 \n"
2563 "movdqu %%xmm0,(%1) \n"
2564 "movdqu %%xmm2,(%1,%2) \n"
2565 "lea 0x10(%1),%1 \n"
2566 "sub $0x10,%3 \n"
2567 "jg 1b \n"
2568 : "+r"(src_uv), // %0
2569 "+r"(dst_u), // %1
2570 "+r"(dst_v), // %2
2571 "+r"(pix) // %3
2572 :
2573 : "memory", "cc"
2574#if defined(__SSE2__)
2575 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2576#endif
2577 );
2578}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002579#endif // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002580
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002581#ifdef HAS_MERGEUV_SSE2
2582void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2583 int width) {
2584 asm volatile (
2585 "sub %0,%1 \n"
2586 ".p2align 4 \n"
2587 "1: \n"
2588 "movdqa (%0),%%xmm0 \n"
2589 "movdqa (%0,%1,1),%%xmm1 \n"
2590 "lea 0x10(%0),%0 \n"
2591 "movdqa %%xmm0,%%xmm2 \n"
2592 "punpcklbw %%xmm1,%%xmm0 \n"
2593 "punpckhbw %%xmm1,%%xmm2 \n"
2594 "movdqa %%xmm0,(%2) \n"
2595 "movdqa %%xmm2,0x10(%2) \n"
2596 "lea 0x20(%2),%2 \n"
2597 "sub $0x10,%3 \n"
2598 "jg 1b \n"
2599 : "+r"(src_u), // %0
2600 "+r"(src_v), // %1
2601 "+r"(dst_uv), // %2
2602 "+r"(width) // %3
2603 :
2604 : "memory", "cc"
2605#if defined(__SSE2__)
2606 , "xmm0", "xmm1", "xmm2"
2607#endif
2608 );
2609}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002610
2611void MergeUV_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2612 uint8* dst_uv, int width) {
2613 asm volatile (
2614 "sub %0,%1 \n"
2615 ".p2align 4 \n"
2616 "1: \n"
2617 "movdqu (%0),%%xmm0 \n"
2618 "movdqu (%0,%1,1),%%xmm1 \n"
2619 "lea 0x10(%0),%0 \n"
2620 "movdqa %%xmm0,%%xmm2 \n"
2621 "punpcklbw %%xmm1,%%xmm0 \n"
2622 "punpckhbw %%xmm1,%%xmm2 \n"
2623 "movdqu %%xmm0,(%2) \n"
2624 "movdqu %%xmm2,0x10(%2) \n"
2625 "lea 0x20(%2),%2 \n"
2626 "sub $0x10,%3 \n"
2627 "jg 1b \n"
2628 : "+r"(src_u), // %0
2629 "+r"(src_v), // %1
2630 "+r"(dst_uv), // %2
2631 "+r"(width) // %3
2632 :
2633 : "memory", "cc"
2634#if defined(__SSE2__)
2635 , "xmm0", "xmm1", "xmm2"
2636#endif
2637 );
2638}
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002639#endif // HAS_MERGEUV_SSE2
2640
fbarchard@google.com19932f82012-02-16 22:19:14 +00002641#ifdef HAS_COPYROW_SSE2
2642void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002643 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002644 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002645 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002646 "1: \n"
2647 "movdqa (%0),%%xmm0 \n"
2648 "movdqa 0x10(%0),%%xmm1 \n"
2649 "movdqa %%xmm0,(%0,%1) \n"
2650 "movdqa %%xmm1,0x10(%0,%1) \n"
2651 "lea 0x20(%0),%0 \n"
2652 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002653 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002654 : "+r"(src), // %0
2655 "+r"(dst), // %1
2656 "+r"(count) // %2
2657 :
2658 : "memory", "cc"
2659#if defined(__SSE2__)
2660 , "xmm0", "xmm1"
2661#endif
2662 );
2663}
2664#endif // HAS_COPYROW_SSE2
2665
2666#ifdef HAS_COPYROW_X86
2667void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2668 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002669 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002670 "shr $0x2,%2 \n"
2671 "rep movsl \n"
2672 : "+S"(src), // %0
2673 "+D"(dst), // %1
2674 "+c"(width_tmp) // %2
2675 :
2676 : "memory", "cc"
2677 );
2678}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002679#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002680
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002681#ifdef HAS_SETROW_X86
2682void SetRow8_X86(uint8* dst, uint32 v32, int width) {
2683 size_t width_tmp = static_cast<size_t>(width);
2684 asm volatile (
2685 "shr $0x2,%1 \n"
2686 "rep stosl \n"
2687 : "+D"(dst), // %0
2688 "+c"(width_tmp) // %1
2689 : "a"(v32) // %2
2690 : "memory", "cc");
2691}
2692
2693void SetRows32_X86(uint8* dst, uint32 v32, int width,
2694 int dst_stride, int height) {
2695 for (int y = 0; y < height; ++y) {
2696 size_t width_tmp = static_cast<size_t>(width);
2697 uint32* d = reinterpret_cast<uint32*>(dst);
2698 asm volatile (
2699 "rep stosl \n"
2700 : "+D"(d), // %0
2701 "+c"(width_tmp) // %1
2702 : "a"(v32) // %2
2703 : "memory", "cc");
2704 dst += dst_stride;
2705 }
2706}
2707#endif // HAS_SETROW_X86
2708
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002709#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002710void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002711 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002712 "pcmpeqb %%xmm5,%%xmm5 \n"
2713 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002714 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002715 "1: \n"
2716 "movdqa (%0),%%xmm0 \n"
2717 "movdqa 0x10(%0),%%xmm1 \n"
2718 "lea 0x20(%0),%0 \n"
2719 "pand %%xmm5,%%xmm0 \n"
2720 "pand %%xmm5,%%xmm1 \n"
2721 "packuswb %%xmm1,%%xmm0 \n"
2722 "movdqa %%xmm0,(%1) \n"
2723 "lea 0x10(%1),%1 \n"
2724 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002725 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002726 : "+r"(src_yuy2), // %0
2727 "+r"(dst_y), // %1
2728 "+r"(pix) // %2
2729 :
2730 : "memory", "cc"
2731#if defined(__SSE2__)
2732 , "xmm0", "xmm1", "xmm5"
2733#endif
2734 );
2735}
2736
2737void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002738 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002739 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002740 "pcmpeqb %%xmm5,%%xmm5 \n"
2741 "psrlw $0x8,%%xmm5 \n"
2742 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002743 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002744 "1: \n"
2745 "movdqa (%0),%%xmm0 \n"
2746 "movdqa 0x10(%0),%%xmm1 \n"
2747 "movdqa (%0,%4,1),%%xmm2 \n"
2748 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2749 "lea 0x20(%0),%0 \n"
2750 "pavgb %%xmm2,%%xmm0 \n"
2751 "pavgb %%xmm3,%%xmm1 \n"
2752 "psrlw $0x8,%%xmm0 \n"
2753 "psrlw $0x8,%%xmm1 \n"
2754 "packuswb %%xmm1,%%xmm0 \n"
2755 "movdqa %%xmm0,%%xmm1 \n"
2756 "pand %%xmm5,%%xmm0 \n"
2757 "packuswb %%xmm0,%%xmm0 \n"
2758 "psrlw $0x8,%%xmm1 \n"
2759 "packuswb %%xmm1,%%xmm1 \n"
2760 "movq %%xmm0,(%1) \n"
2761 "movq %%xmm1,(%1,%2) \n"
2762 "lea 0x8(%1),%1 \n"
2763 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002764 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002765 : "+r"(src_yuy2), // %0
2766 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002767 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002768 "+r"(pix) // %3
2769 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2770 : "memory", "cc"
2771#if defined(__SSE2__)
2772 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2773#endif
2774 );
2775}
2776
fbarchard@google.comc704f782012-08-30 19:53:48 +00002777void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2778 uint8* dst_u, uint8* dst_v, int pix) {
2779 asm volatile (
2780 "pcmpeqb %%xmm5,%%xmm5 \n"
2781 "psrlw $0x8,%%xmm5 \n"
2782 "sub %1,%2 \n"
2783 ".p2align 4 \n"
2784 "1: \n"
2785 "movdqa (%0),%%xmm0 \n"
2786 "movdqa 0x10(%0),%%xmm1 \n"
2787 "lea 0x20(%0),%0 \n"
2788 "psrlw $0x8,%%xmm0 \n"
2789 "psrlw $0x8,%%xmm1 \n"
2790 "packuswb %%xmm1,%%xmm0 \n"
2791 "movdqa %%xmm0,%%xmm1 \n"
2792 "pand %%xmm5,%%xmm0 \n"
2793 "packuswb %%xmm0,%%xmm0 \n"
2794 "psrlw $0x8,%%xmm1 \n"
2795 "packuswb %%xmm1,%%xmm1 \n"
2796 "movq %%xmm0,(%1) \n"
2797 "movq %%xmm1,(%1,%2) \n"
2798 "lea 0x8(%1),%1 \n"
2799 "sub $0x10,%3 \n"
2800 "jg 1b \n"
2801 : "+r"(src_yuy2), // %0
2802 "+r"(dst_u), // %1
2803 "+r"(dst_v), // %2
2804 "+r"(pix) // %3
2805 :
2806 : "memory", "cc"
2807#if defined(__SSE2__)
2808 , "xmm0", "xmm1", "xmm5"
2809#endif
2810 );
2811}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002812
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002813void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2814 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002815 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002816 "pcmpeqb %%xmm5,%%xmm5 \n"
2817 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002818 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002819 "1: \n"
2820 "movdqu (%0),%%xmm0 \n"
2821 "movdqu 0x10(%0),%%xmm1 \n"
2822 "lea 0x20(%0),%0 \n"
2823 "pand %%xmm5,%%xmm0 \n"
2824 "pand %%xmm5,%%xmm1 \n"
2825 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002826 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002827 "movdqu %%xmm0,(%1) \n"
2828 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002829 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002830 : "+r"(src_yuy2), // %0
2831 "+r"(dst_y), // %1
2832 "+r"(pix) // %2
2833 :
2834 : "memory", "cc"
2835#if defined(__SSE2__)
2836 , "xmm0", "xmm1", "xmm5"
2837#endif
2838 );
2839}
2840
2841void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2842 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002843 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002844 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002845 "pcmpeqb %%xmm5,%%xmm5 \n"
2846 "psrlw $0x8,%%xmm5 \n"
2847 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002848 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002849 "1: \n"
2850 "movdqu (%0),%%xmm0 \n"
2851 "movdqu 0x10(%0),%%xmm1 \n"
2852 "movdqu (%0,%4,1),%%xmm2 \n"
2853 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2854 "lea 0x20(%0),%0 \n"
2855 "pavgb %%xmm2,%%xmm0 \n"
2856 "pavgb %%xmm3,%%xmm1 \n"
2857 "psrlw $0x8,%%xmm0 \n"
2858 "psrlw $0x8,%%xmm1 \n"
2859 "packuswb %%xmm1,%%xmm0 \n"
2860 "movdqa %%xmm0,%%xmm1 \n"
2861 "pand %%xmm5,%%xmm0 \n"
2862 "packuswb %%xmm0,%%xmm0 \n"
2863 "psrlw $0x8,%%xmm1 \n"
2864 "packuswb %%xmm1,%%xmm1 \n"
2865 "movq %%xmm0,(%1) \n"
2866 "movq %%xmm1,(%1,%2) \n"
2867 "lea 0x8(%1),%1 \n"
2868 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002869 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002870 : "+r"(src_yuy2), // %0
2871 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002872 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002873 "+r"(pix) // %3
2874 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2875 : "memory", "cc"
2876#if defined(__SSE2__)
2877 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2878#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002879 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002880}
2881
fbarchard@google.comc704f782012-08-30 19:53:48 +00002882void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2883 uint8* dst_u, uint8* dst_v, int pix) {
2884 asm volatile (
2885 "pcmpeqb %%xmm5,%%xmm5 \n"
2886 "psrlw $0x8,%%xmm5 \n"
2887 "sub %1,%2 \n"
2888 ".p2align 4 \n"
2889 "1: \n"
2890 "movdqu (%0),%%xmm0 \n"
2891 "movdqu 0x10(%0),%%xmm1 \n"
2892 "lea 0x20(%0),%0 \n"
2893 "psrlw $0x8,%%xmm0 \n"
2894 "psrlw $0x8,%%xmm1 \n"
2895 "packuswb %%xmm1,%%xmm0 \n"
2896 "movdqa %%xmm0,%%xmm1 \n"
2897 "pand %%xmm5,%%xmm0 \n"
2898 "packuswb %%xmm0,%%xmm0 \n"
2899 "psrlw $0x8,%%xmm1 \n"
2900 "packuswb %%xmm1,%%xmm1 \n"
2901 "movq %%xmm0,(%1) \n"
2902 "movq %%xmm1,(%1,%2) \n"
2903 "lea 0x8(%1),%1 \n"
2904 "sub $0x10,%3 \n"
2905 "jg 1b \n"
2906 : "+r"(src_yuy2), // %0
2907 "+r"(dst_u), // %1
2908 "+r"(dst_v), // %2
2909 "+r"(pix) // %3
2910 :
2911 : "memory", "cc"
2912#if defined(__SSE2__)
2913 , "xmm0", "xmm1", "xmm5"
2914#endif
2915 );
2916}
2917
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002918void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002919 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002920 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002921 "1: \n"
2922 "movdqa (%0),%%xmm0 \n"
2923 "movdqa 0x10(%0),%%xmm1 \n"
2924 "lea 0x20(%0),%0 \n"
2925 "psrlw $0x8,%%xmm0 \n"
2926 "psrlw $0x8,%%xmm1 \n"
2927 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002928 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002929 "movdqa %%xmm0,(%1) \n"
2930 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002931 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002932 : "+r"(src_uyvy), // %0
2933 "+r"(dst_y), // %1
2934 "+r"(pix) // %2
2935 :
2936 : "memory", "cc"
2937#if defined(__SSE2__)
2938 , "xmm0", "xmm1"
2939#endif
2940 );
2941}
2942
2943void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002944 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002945 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002946 "pcmpeqb %%xmm5,%%xmm5 \n"
2947 "psrlw $0x8,%%xmm5 \n"
2948 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002949 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002950 "1: \n"
2951 "movdqa (%0),%%xmm0 \n"
2952 "movdqa 0x10(%0),%%xmm1 \n"
2953 "movdqa (%0,%4,1),%%xmm2 \n"
2954 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2955 "lea 0x20(%0),%0 \n"
2956 "pavgb %%xmm2,%%xmm0 \n"
2957 "pavgb %%xmm3,%%xmm1 \n"
2958 "pand %%xmm5,%%xmm0 \n"
2959 "pand %%xmm5,%%xmm1 \n"
2960 "packuswb %%xmm1,%%xmm0 \n"
2961 "movdqa %%xmm0,%%xmm1 \n"
2962 "pand %%xmm5,%%xmm0 \n"
2963 "packuswb %%xmm0,%%xmm0 \n"
2964 "psrlw $0x8,%%xmm1 \n"
2965 "packuswb %%xmm1,%%xmm1 \n"
2966 "movq %%xmm0,(%1) \n"
2967 "movq %%xmm1,(%1,%2) \n"
2968 "lea 0x8(%1),%1 \n"
2969 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002970 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002971 : "+r"(src_uyvy), // %0
2972 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002973 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002974 "+r"(pix) // %3
2975 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2976 : "memory", "cc"
2977#if defined(__SSE2__)
2978 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2979#endif
2980 );
2981}
2982
fbarchard@google.comc704f782012-08-30 19:53:48 +00002983void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2984 uint8* dst_u, uint8* dst_v, int pix) {
2985 asm volatile (
2986 "pcmpeqb %%xmm5,%%xmm5 \n"
2987 "psrlw $0x8,%%xmm5 \n"
2988 "sub %1,%2 \n"
2989 ".p2align 4 \n"
2990 "1: \n"
2991 "movdqa (%0),%%xmm0 \n"
2992 "movdqa 0x10(%0),%%xmm1 \n"
2993 "lea 0x20(%0),%0 \n"
2994 "pand %%xmm5,%%xmm0 \n"
2995 "pand %%xmm5,%%xmm1 \n"
2996 "packuswb %%xmm1,%%xmm0 \n"
2997 "movdqa %%xmm0,%%xmm1 \n"
2998 "pand %%xmm5,%%xmm0 \n"
2999 "packuswb %%xmm0,%%xmm0 \n"
3000 "psrlw $0x8,%%xmm1 \n"
3001 "packuswb %%xmm1,%%xmm1 \n"
3002 "movq %%xmm0,(%1) \n"
3003 "movq %%xmm1,(%1,%2) \n"
3004 "lea 0x8(%1),%1 \n"
3005 "sub $0x10,%3 \n"
3006 "jg 1b \n"
3007 : "+r"(src_uyvy), // %0
3008 "+r"(dst_u), // %1
3009 "+r"(dst_v), // %2
3010 "+r"(pix) // %3
3011 :
3012 : "memory", "cc"
3013#if defined(__SSE2__)
3014 , "xmm0", "xmm1", "xmm5"
3015#endif
3016 );
3017}
3018
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003019void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3020 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003021 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003022 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003023 "1: \n"
3024 "movdqu (%0),%%xmm0 \n"
3025 "movdqu 0x10(%0),%%xmm1 \n"
3026 "lea 0x20(%0),%0 \n"
3027 "psrlw $0x8,%%xmm0 \n"
3028 "psrlw $0x8,%%xmm1 \n"
3029 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003030 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003031 "movdqu %%xmm0,(%1) \n"
3032 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003033 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003034 : "+r"(src_uyvy), // %0
3035 "+r"(dst_y), // %1
3036 "+r"(pix) // %2
3037 :
3038 : "memory", "cc"
3039#if defined(__SSE2__)
3040 , "xmm0", "xmm1"
3041#endif
3042 );
3043}
3044
3045void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003046 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003047 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003048 "pcmpeqb %%xmm5,%%xmm5 \n"
3049 "psrlw $0x8,%%xmm5 \n"
3050 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003051 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003052 "1: \n"
3053 "movdqu (%0),%%xmm0 \n"
3054 "movdqu 0x10(%0),%%xmm1 \n"
3055 "movdqu (%0,%4,1),%%xmm2 \n"
3056 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3057 "lea 0x20(%0),%0 \n"
3058 "pavgb %%xmm2,%%xmm0 \n"
3059 "pavgb %%xmm3,%%xmm1 \n"
3060 "pand %%xmm5,%%xmm0 \n"
3061 "pand %%xmm5,%%xmm1 \n"
3062 "packuswb %%xmm1,%%xmm0 \n"
3063 "movdqa %%xmm0,%%xmm1 \n"
3064 "pand %%xmm5,%%xmm0 \n"
3065 "packuswb %%xmm0,%%xmm0 \n"
3066 "psrlw $0x8,%%xmm1 \n"
3067 "packuswb %%xmm1,%%xmm1 \n"
3068 "movq %%xmm0,(%1) \n"
3069 "movq %%xmm1,(%1,%2) \n"
3070 "lea 0x8(%1),%1 \n"
3071 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003072 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003073 : "+r"(src_uyvy), // %0
3074 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003075 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003076 "+r"(pix) // %3
3077 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3078 : "memory", "cc"
3079#if defined(__SSE2__)
3080 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3081#endif
3082 );
3083}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003084
3085void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3086 uint8* dst_u, uint8* dst_v, int pix) {
3087 asm volatile (
3088 "pcmpeqb %%xmm5,%%xmm5 \n"
3089 "psrlw $0x8,%%xmm5 \n"
3090 "sub %1,%2 \n"
3091 ".p2align 4 \n"
3092 "1: \n"
3093 "movdqu (%0),%%xmm0 \n"
3094 "movdqu 0x10(%0),%%xmm1 \n"
3095 "lea 0x20(%0),%0 \n"
3096 "pand %%xmm5,%%xmm0 \n"
3097 "pand %%xmm5,%%xmm1 \n"
3098 "packuswb %%xmm1,%%xmm0 \n"
3099 "movdqa %%xmm0,%%xmm1 \n"
3100 "pand %%xmm5,%%xmm0 \n"
3101 "packuswb %%xmm0,%%xmm0 \n"
3102 "psrlw $0x8,%%xmm1 \n"
3103 "packuswb %%xmm1,%%xmm1 \n"
3104 "movq %%xmm0,(%1) \n"
3105 "movq %%xmm1,(%1,%2) \n"
3106 "lea 0x8(%1),%1 \n"
3107 "sub $0x10,%3 \n"
3108 "jg 1b \n"
3109 : "+r"(src_uyvy), // %0
3110 "+r"(dst_u), // %1
3111 "+r"(dst_v), // %2
3112 "+r"(pix) // %3
3113 :
3114 : "memory", "cc"
3115#if defined(__SSE2__)
3116 , "xmm0", "xmm1", "xmm5"
3117#endif
3118 );
3119}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003120#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003121
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003122#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003123// Blend 8 pixels at a time.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003124void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3125 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003126 asm volatile (
3127 "pcmpeqb %%xmm7,%%xmm7 \n"
3128 "psrlw $0xf,%%xmm7 \n"
3129 "pcmpeqb %%xmm6,%%xmm6 \n"
3130 "psrlw $0x8,%%xmm6 \n"
3131 "pcmpeqb %%xmm5,%%xmm5 \n"
3132 "psllw $0x8,%%xmm5 \n"
3133 "pcmpeqb %%xmm4,%%xmm4 \n"
3134 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003135 "sub $0x1,%3 \n"
3136 "je 91f \n"
3137 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003138
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003139 // 1 pixel loop until destination pointer is aligned.
3140 "10: \n"
3141 "test $0xf,%2 \n"
3142 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003143 "movd (%0),%%xmm3 \n"
3144 "lea 0x4(%0),%0 \n"
3145 "movdqa %%xmm3,%%xmm0 \n"
3146 "pxor %%xmm4,%%xmm3 \n"
3147 "movd (%1),%%xmm2 \n"
3148 "psrlw $0x8,%%xmm3 \n"
3149 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3150 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3151 "pand %%xmm6,%%xmm2 \n"
3152 "paddw %%xmm7,%%xmm3 \n"
3153 "pmullw %%xmm3,%%xmm2 \n"
3154 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003155 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003156 "psrlw $0x8,%%xmm1 \n"
3157 "por %%xmm4,%%xmm0 \n"
3158 "pmullw %%xmm3,%%xmm1 \n"
3159 "psrlw $0x8,%%xmm2 \n"
3160 "paddusb %%xmm2,%%xmm0 \n"
3161 "pand %%xmm5,%%xmm1 \n"
3162 "paddusb %%xmm1,%%xmm0 \n"
3163 "sub $0x1,%3 \n"
3164 "movd %%xmm0,(%2) \n"
3165 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003166 "jge 10b \n"
3167
3168 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00003169 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003170 "jl 49f \n"
3171
fbarchard@google.com794fe122012-06-15 01:05:01 +00003172 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003173 ".p2align 2 \n"
3174 "41: \n"
3175 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003176 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003177 "movdqa %%xmm3,%%xmm0 \n"
3178 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003179 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003180 "psrlw $0x8,%%xmm3 \n"
3181 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3182 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003183 "pand %%xmm6,%%xmm2 \n"
3184 "paddw %%xmm7,%%xmm3 \n"
3185 "pmullw %%xmm3,%%xmm2 \n"
3186 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003187 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003188 "psrlw $0x8,%%xmm1 \n"
3189 "por %%xmm4,%%xmm0 \n"
3190 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003191 "psrlw $0x8,%%xmm2 \n"
3192 "paddusb %%xmm2,%%xmm0 \n"
3193 "pand %%xmm5,%%xmm1 \n"
3194 "paddusb %%xmm1,%%xmm0 \n"
3195 "sub $0x4,%3 \n"
3196 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003197 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003198 "jge 41b \n"
3199
3200 "49: \n"
3201 "add $0x3,%3 \n"
3202 "jl 99f \n"
3203
fbarchard@google.com794fe122012-06-15 01:05:01 +00003204 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003205 "91: \n"
3206 "movd (%0),%%xmm3 \n"
3207 "lea 0x4(%0),%0 \n"
3208 "movdqa %%xmm3,%%xmm0 \n"
3209 "pxor %%xmm4,%%xmm3 \n"
3210 "movd (%1),%%xmm2 \n"
3211 "psrlw $0x8,%%xmm3 \n"
3212 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3213 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3214 "pand %%xmm6,%%xmm2 \n"
3215 "paddw %%xmm7,%%xmm3 \n"
3216 "pmullw %%xmm3,%%xmm2 \n"
3217 "movd (%1),%%xmm1 \n"
3218 "lea 0x4(%1),%1 \n"
3219 "psrlw $0x8,%%xmm1 \n"
3220 "por %%xmm4,%%xmm0 \n"
3221 "pmullw %%xmm3,%%xmm1 \n"
3222 "psrlw $0x8,%%xmm2 \n"
3223 "paddusb %%xmm2,%%xmm0 \n"
3224 "pand %%xmm5,%%xmm1 \n"
3225 "paddusb %%xmm1,%%xmm0 \n"
3226 "sub $0x1,%3 \n"
3227 "movd %%xmm0,(%2) \n"
3228 "lea 0x4(%2),%2 \n"
3229 "jge 91b \n"
3230 "99: \n"
3231 : "+r"(src_argb0), // %0
3232 "+r"(src_argb1), // %1
3233 "+r"(dst_argb), // %2
3234 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003235 :
3236 : "memory", "cc"
3237#if defined(__SSE2__)
3238 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3239#endif
3240 );
3241}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003242#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003243
fbarchard@google.com96af8702012-04-06 18:22:27 +00003244#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003245// Shuffle table for isolating alpha.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003246CONST uvec8 kShuffleAlpha = {
3247 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3248 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3249};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003250
// Blend 4 pixels at a time.
// Uses the kShuffleAlpha table to spread each pixel's alpha with one pshufb.
3253
3254// Same as SSE2, but replaces
3255// psrlw xmm3, 8 // alpha
3256// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3257// pshuflw xmm3, xmm3,0F5h
3258// with..
3259// pshufb xmm3, kShuffleAlpha // alpha
3260
3261void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3262 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003263 asm volatile (
3264 "pcmpeqb %%xmm7,%%xmm7 \n"
3265 "psrlw $0xf,%%xmm7 \n"
3266 "pcmpeqb %%xmm6,%%xmm6 \n"
3267 "psrlw $0x8,%%xmm6 \n"
3268 "pcmpeqb %%xmm5,%%xmm5 \n"
3269 "psllw $0x8,%%xmm5 \n"
3270 "pcmpeqb %%xmm4,%%xmm4 \n"
3271 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003272 "sub $0x1,%3 \n"
3273 "je 91f \n"
3274 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003275
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003276 // 1 pixel loop until destination pointer is aligned.
3277 "10: \n"
3278 "test $0xf,%2 \n"
3279 "je 19f \n"
3280 "movd (%0),%%xmm3 \n"
3281 "lea 0x4(%0),%0 \n"
3282 "movdqa %%xmm3,%%xmm0 \n"
3283 "pxor %%xmm4,%%xmm3 \n"
3284 "movd (%1),%%xmm2 \n"
3285 "pshufb %4,%%xmm3 \n"
3286 "pand %%xmm6,%%xmm2 \n"
3287 "paddw %%xmm7,%%xmm3 \n"
3288 "pmullw %%xmm3,%%xmm2 \n"
3289 "movd (%1),%%xmm1 \n"
3290 "lea 0x4(%1),%1 \n"
3291 "psrlw $0x8,%%xmm1 \n"
3292 "por %%xmm4,%%xmm0 \n"
3293 "pmullw %%xmm3,%%xmm1 \n"
3294 "psrlw $0x8,%%xmm2 \n"
3295 "paddusb %%xmm2,%%xmm0 \n"
3296 "pand %%xmm5,%%xmm1 \n"
3297 "paddusb %%xmm1,%%xmm0 \n"
3298 "sub $0x1,%3 \n"
3299 "movd %%xmm0,(%2) \n"
3300 "lea 0x4(%2),%2 \n"
3301 "jge 10b \n"
3302
3303 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00003304 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003305 "jl 49f \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003306 "test $0xf,%0 \n"
3307 "jne 41f \n"
3308 "test $0xf,%1 \n"
3309 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003310
fbarchard@google.com794fe122012-06-15 01:05:01 +00003311 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003312 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003313 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003314 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003315 "lea 0x10(%0),%0 \n"
3316 "movdqa %%xmm3,%%xmm0 \n"
3317 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003318 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003319 "pshufb %4,%%xmm3 \n"
3320 "pand %%xmm6,%%xmm2 \n"
3321 "paddw %%xmm7,%%xmm3 \n"
3322 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003323 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003324 "lea 0x10(%1),%1 \n"
3325 "psrlw $0x8,%%xmm1 \n"
3326 "por %%xmm4,%%xmm0 \n"
3327 "pmullw %%xmm3,%%xmm1 \n"
3328 "psrlw $0x8,%%xmm2 \n"
3329 "paddusb %%xmm2,%%xmm0 \n"
3330 "pand %%xmm5,%%xmm1 \n"
3331 "paddusb %%xmm1,%%xmm0 \n"
3332 "sub $0x4,%3 \n"
3333 "movdqa %%xmm0,(%2) \n"
3334 "lea 0x10(%2),%2 \n"
3335 "jge 40b \n"
3336 "jmp 49f \n"
3337
3338 // 4 pixel unaligned loop.
3339 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003340 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003341 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003342 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003343 "movdqa %%xmm3,%%xmm0 \n"
3344 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003345 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003346 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003347 "pand %%xmm6,%%xmm2 \n"
3348 "paddw %%xmm7,%%xmm3 \n"
3349 "pmullw %%xmm3,%%xmm2 \n"
3350 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003351 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003352 "psrlw $0x8,%%xmm1 \n"
3353 "por %%xmm4,%%xmm0 \n"
3354 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003355 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003356 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003357 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003358 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003359 "sub $0x4,%3 \n"
3360 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003361 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003362 "jge 41b \n"
3363
3364 "49: \n"
3365 "add $0x3,%3 \n"
3366 "jl 99f \n"
3367
fbarchard@google.com794fe122012-06-15 01:05:01 +00003368 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003369 "91: \n"
3370 "movd (%0),%%xmm3 \n"
3371 "lea 0x4(%0),%0 \n"
3372 "movdqa %%xmm3,%%xmm0 \n"
3373 "pxor %%xmm4,%%xmm3 \n"
3374 "movd (%1),%%xmm2 \n"
3375 "pshufb %4,%%xmm3 \n"
3376 "pand %%xmm6,%%xmm2 \n"
3377 "paddw %%xmm7,%%xmm3 \n"
3378 "pmullw %%xmm3,%%xmm2 \n"
3379 "movd (%1),%%xmm1 \n"
3380 "lea 0x4(%1),%1 \n"
3381 "psrlw $0x8,%%xmm1 \n"
3382 "por %%xmm4,%%xmm0 \n"
3383 "pmullw %%xmm3,%%xmm1 \n"
3384 "psrlw $0x8,%%xmm2 \n"
3385 "paddusb %%xmm2,%%xmm0 \n"
3386 "pand %%xmm5,%%xmm1 \n"
3387 "paddusb %%xmm1,%%xmm0 \n"
3388 "sub $0x1,%3 \n"
3389 "movd %%xmm0,(%2) \n"
3390 "lea 0x4(%2),%2 \n"
3391 "jge 91b \n"
3392 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003393 : "+r"(src_argb0), // %0
3394 "+r"(src_argb1), // %1
3395 "+r"(dst_argb), // %2
3396 "+r"(width) // %3
3397 : "m"(kShuffleAlpha) // %4
3398 : "memory", "cc"
3399#if defined(__SSE2__)
3400 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3401#endif
3402 );
3403}
3404#endif // HAS_ARGBBLENDROW_SSSE3
3405
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003406#ifdef HAS_ARGBATTENUATE_SSE2
3407// Attenuate 4 ARGB pixels (16 bytes) per loop iteration: the three low bytes
3408// of every pixel are scaled by that pixel's alpha byte (byte 3 of the pixel).
// The alpha byte itself is copied through unchanged.
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte
// aligned. The loop body runs before width is tested, so at least one group
// of 4 pixels is always processed and width is effectively rounded up to a
// multiple of 4.
3409void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3410 asm volatile (
// %1 becomes (dst - src) so that (%0,%1,1) addresses the destination.
3411 "sub %0,%1 \n"
// xmm4 = 0xff000000 in every dword (alpha mask).
3412 "pcmpeqb %%xmm4,%%xmm4 \n"
3413 "pslld $0x18,%%xmm4 \n"
// xmm5 = 0x00ffffff in every dword (mask for the three low bytes).
3414 "pcmpeqb %%xmm5,%%xmm5 \n"
3415 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003416
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003417 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003418 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003419 "1: \n"
// Low 2 pixels: duplicate each byte into a word, broadcast the alpha word
// over each pixel and keep the high 16 bits of the product.
3420 "movdqa (%0),%%xmm0 \n"
3421 "punpcklbw %%xmm0,%%xmm0 \n"
3422 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3423 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3424 "pmulhuw %%xmm2,%%xmm0 \n"
// High 2 pixels, same computation.
3425 "movdqa (%0),%%xmm1 \n"
3426 "punpckhbw %%xmm1,%%xmm1 \n"
3427 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3428 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3429 "pmulhuw %%xmm2,%%xmm1 \n"
// xmm2 = original alpha bytes only.
fbarchard@google.com810cd912012-04-20 20:15:27 +00003430 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003431 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003432 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003433 "psrlw $0x8,%%xmm1 \n"
// Pack back to bytes, drop the computed alpha lane and merge the original.
3434 "packuswb %%xmm1,%%xmm0 \n"
3435 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003436 "por %%xmm2,%%xmm0 \n"
// The jg below tests the flags of this sub; movdqa and lea leave them alone.
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003437 "sub $0x4,%2 \n"
3438 "movdqa %%xmm0,(%0,%1,1) \n"
3439 "lea 0x10(%0),%0 \n"
3440 "jg 1b \n"
3441 : "+r"(src_argb), // %0
3442 "+r"(dst_argb), // %1
3443 "+r"(width) // %2
3444 :
3445 : "memory", "cc"
3446#if defined(__SSE2__)
3447 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3448#endif
3449 );
3450}
3451#endif // HAS_ARGBATTENUATE_SSE2
3452
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003453#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003454// pshufb tables that duplicate a pixel's alpha byte into the three low words
// of that pixel. Index 128 has the high bit set, which makes pshufb write
// zero, so the fourth (alpha) word of each pixel becomes 0.
// kShuffleAlpha0 covers pixels 0 and 1 (alpha at bytes 3 and 7).
3455CONST uvec8 kShuffleAlpha0 = {
3456 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3457};
// kShuffleAlpha1 covers pixels 2 and 3 (alpha at bytes 11 and 15).
3458CONST uvec8 kShuffleAlpha1 = {
3459 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3460 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3461};
3462// Attenuate 4 ARGB pixels (16 bytes) per loop iteration: the three low bytes
3463// of every pixel are scaled by that pixel's alpha byte; alpha is preserved.
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte
// aligned. The loop body runs before width is tested, so width is effectively
// rounded up to a multiple of 4 (minimum one group of 4).
3464void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3465 asm volatile (
// %1 becomes (dst - src) so that (%0,%1,1) addresses the destination.
3466 "sub %0,%1 \n"
// xmm3 = 0xff000000 in every dword (alpha mask).
3467 "pcmpeqb %%xmm3,%%xmm3 \n"
3468 "pslld $0x18,%%xmm3 \n"
3469 "movdqa %3,%%xmm4 \n"
3470 "movdqa %4,%%xmm5 \n"
3471
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003472 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003473 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003474 "1: \n"
// Low 2 pixels: xmm0 = alpha words, xmm1 = each byte duplicated into a word;
// keep the high 16 bits of the product.
3475 "movdqa (%0),%%xmm0 \n"
3476 "pshufb %%xmm4,%%xmm0 \n"
3477 "movdqa (%0),%%xmm1 \n"
3478 "punpcklbw %%xmm1,%%xmm1 \n"
3479 "pmulhuw %%xmm1,%%xmm0 \n"
// High 2 pixels, same computation with the second shuffle table.
3480 "movdqa (%0),%%xmm1 \n"
3481 "pshufb %%xmm5,%%xmm1 \n"
3482 "movdqa (%0),%%xmm2 \n"
3483 "punpckhbw %%xmm2,%%xmm2 \n"
3484 "pmulhuw %%xmm2,%%xmm1 \n"
// xmm2 = original alpha bytes only.
3485 "movdqa (%0),%%xmm2 \n"
3486 "pand %%xmm3,%%xmm2 \n"
3487 "psrlw $0x8,%%xmm0 \n"
3488 "psrlw $0x8,%%xmm1 \n"
// The alpha lanes of the products are already 0 (zero multiplier from the
// shuffle tables), so the original alpha can be merged with a plain por.
3489 "packuswb %%xmm1,%%xmm0 \n"
3490 "por %%xmm2,%%xmm0 \n"
// The jg below tests the flags of this sub; movdqa and lea leave them alone.
3491 "sub $0x4,%2 \n"
3492 "movdqa %%xmm0,(%0,%1,1) \n"
3493 "lea 0x10(%0),%0 \n"
3494 "jg 1b \n"
3495 : "+r"(src_argb), // %0
3496 "+r"(dst_argb), // %1
3497 "+r"(width) // %2
3498 : "m"(kShuffleAlpha0), // %3
3499 "m"(kShuffleAlpha1) // %4
3500 : "memory", "cc"
3501#if defined(__SSE2__)
3502 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3503#endif
3504 );
3505}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003506#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003507
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003508#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003509// Unattenuate 4 ARGB pixels (16 bytes) per loop iteration.
3510// For every pixel the alpha byte indexes fixed_invtbl8 (4 byte entries) and
// the three low bytes are multiplied by the low 16 bits of that entry; the
// alpha byte is copied through unchanged.
// NOTE(review): fixed_invtbl8 is defined outside this block; presumably it
// holds fixed point reciprocals of alpha - confirm against its definition.
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte
// aligned. The loop body runs before width is tested, so width is effectively
// rounded up to a multiple of 4 (minimum one group of 4).
3511void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3512 int width) {
// Scratch register that receives each pixel's alpha for the table lookup.
3513 uintptr_t alpha = 0;
3514 asm volatile (
// %1 becomes (dst - src) so that (%0,%1,1) addresses the destination.
3515 "sub %0,%1 \n"
// xmm4 = 0xff000000 in every dword (alpha mask).
3516 "pcmpeqb %%xmm4,%%xmm4 \n"
3517 "pslld $0x18,%%xmm4 \n"
3518
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003519 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003520 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003521 "1: \n"
// Pixels 0 and 1: look up the table entry for each alpha (bytes 3 and 7).
3522 "movdqa (%0),%%xmm0 \n"
3523 "movzb 0x3(%0),%3 \n"
3524 "punpcklbw %%xmm0,%%xmm0 \n"
3525 "movd 0x0(%4,%3,4),%%xmm2 \n"
3526 "movzb 0x7(%0),%3 \n"
3527 "movd 0x0(%4,%3,4),%%xmm3 \n"
// pshuflw $0xc0 yields words (w0, w0, w0, w3); w3 is 0 because movd cleared
// everything above the low dword, so the alpha lane multiplier is 0.
3528 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3529 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3530 "movlhps %%xmm3,%%xmm2 \n"
3531 "pmulhuw %%xmm2,%%xmm0 \n"
// Pixels 2 and 3: same computation with alpha at bytes 11 and 15.
3532 "movdqa (%0),%%xmm1 \n"
3533 "movzb 0xb(%0),%3 \n"
3534 "punpckhbw %%xmm1,%%xmm1 \n"
3535 "movd 0x0(%4,%3,4),%%xmm2 \n"
3536 "movzb 0xf(%0),%3 \n"
3537 "movd 0x0(%4,%3,4),%%xmm3 \n"
3538 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3539 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3540 "movlhps %%xmm3,%%xmm2 \n"
3541 "pmulhuw %%xmm2,%%xmm1 \n"
// Pack with unsigned saturation and merge the original alpha bytes.
3542 "movdqa (%0),%%xmm2 \n"
3543 "pand %%xmm4,%%xmm2 \n"
3544 "packuswb %%xmm1,%%xmm0 \n"
3545 "por %%xmm2,%%xmm0 \n"
// The jg below tests the flags of this sub; movdqa and lea leave them alone.
3546 "sub $0x4,%2 \n"
3547 "movdqa %%xmm0,(%0,%1,1) \n"
3548 "lea 0x10(%0),%0 \n"
3549 "jg 1b \n"
3550 : "+r"(src_argb), // %0
3551 "+r"(dst_argb), // %1
3552 "+r"(width), // %2
3553 "+r"(alpha) // %3
3554 : "r"(fixed_invtbl8) // %4
3555 : "memory", "cc"
3556#if defined(__SSE2__)
3557 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3558#endif
3559 );
3560}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003561#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003562
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003563#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003564// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// The weights are scaled by 128 (14 + 76 + 38 = 128) and the sum is shifted
// right by 7 in the row function. The 4th weight is 0, so alpha adds nothing.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003565CONST vec8 kARGBToGray = {
3566 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3567};
3568
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003569// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Each output pixel is (gray, gray, gray, original alpha).
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte
// aligned. The loop body runs before width is tested, so width is effectively
// rounded up to a multiple of 8 (minimum one group of 8).
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003570void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003571 asm volatile (
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003572 "movdqa %3,%%xmm4 \n"
// %1 becomes (dst - src) so that (%0,%1,1) addresses the destination.
3573 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003574
3575 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003576 ".p2align 4 \n"
3577 "1: \n"
// Weighted sum per pixel: pmaddubsw forms pair sums, phaddw adds the two
// pairs of every pixel, psrlw divides by 128, packuswb leaves 8 gray bytes.
3578 "movdqa (%0),%%xmm0 \n"
3579 "movdqa 0x10(%0),%%xmm1 \n"
3580 "pmaddubsw %%xmm4,%%xmm0 \n"
3581 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003582 "phaddw %%xmm1,%%xmm0 \n"
3583 "psrlw $0x7,%%xmm0 \n"
3584 "packuswb %%xmm0,%%xmm0 \n"
// Extract the 8 original alpha bytes into xmm2.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003585 "movdqa (%0),%%xmm2 \n"
3586 "movdqa 0x10(%0),%%xmm3 \n"
3587 "psrld $0x18,%%xmm2 \n"
3588 "psrld $0x18,%%xmm3 \n"
3589 "packuswb %%xmm3,%%xmm2 \n"
3590 "packuswb %%xmm2,%%xmm2 \n"
// Interleave to (gray, gray) and (gray, alpha) words, then to whole pixels.
3591 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003592 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003593 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003594 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003595 "punpcklwd %%xmm3,%%xmm0 \n"
3596 "punpckhwd %%xmm3,%%xmm1 \n"
// The jg below tests the flags of this sub; movdqa and lea leave them alone.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003597 "sub $0x8,%2 \n"
3598 "movdqa %%xmm0,(%0,%1,1) \n"
3599 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003600 "lea 0x20(%0),%0 \n"
3601 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003602 : "+r"(src_argb), // %0
3603 "+r"(dst_argb), // %1
3604 "+r"(width) // %2
3605 : "m"(kARGBToGray) // %3
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003606 : "memory", "cc"
3607#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00003608 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003609#endif
3610 );
3611}
3612#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003613
3614#ifdef HAS_ARGBSEPIAROW_SSSE3
3615// b = (r * 35 + g * 68 + b * 17) >> 7
3616// g = (r * 45 + g * 88 + b * 22) >> 7
3617// r = (r * 50 + g * 98 + b * 24) >> 7
3618// Constants for ARGB color to sepia tone. Each table repeats the weights in
// memory order (byte 0, byte 1, byte 2, byte 3) with a 0 weight for alpha.
3619CONST vec8 kARGBToSepiaB = {
3620 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3621};
3622
3623CONST vec8 kARGBToSepiaG = {
3624 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3625};
3626
3627CONST vec8 kARGBToSepiaR = {
3628 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3629};
3630
fbarchard@google.come442dc42012-06-18 17:37:09 +00003631// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place: dst_argb is both read and written. The original alpha byte
// of each pixel is preserved; packuswb saturates the new channels to 255.
// dst_argb is accessed with movdqa, so it must be 16 byte aligned. The loop
// body runs before width is tested, so width is effectively rounded up to a
// multiple of 8 (minimum one group of 8).
fbarchard@google.com221e6022012-05-21 22:24:41 +00003632void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3633 asm volatile (
3634 "movdqa %2,%%xmm2 \n"
3635 "movdqa %3,%%xmm3 \n"
3636 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003637
3638 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003639 ".p2align 4 \n"
3640 "1: \n"
// New byte 0 of each pixel (weights kARGBToSepiaB) -> 8 bytes in xmm0.
3641 "movdqa (%0),%%xmm0 \n"
3642 "movdqa 0x10(%0),%%xmm6 \n"
3643 "pmaddubsw %%xmm2,%%xmm0 \n"
3644 "pmaddubsw %%xmm2,%%xmm6 \n"
3645 "phaddw %%xmm6,%%xmm0 \n"
3646 "psrlw $0x7,%%xmm0 \n"
3647 "packuswb %%xmm0,%%xmm0 \n"
// New byte 1 (weights kARGBToSepiaG), interleaved with byte 0 in xmm0.
3648 "movdqa (%0),%%xmm5 \n"
3649 "movdqa 0x10(%0),%%xmm1 \n"
3650 "pmaddubsw %%xmm3,%%xmm5 \n"
3651 "pmaddubsw %%xmm3,%%xmm1 \n"
3652 "phaddw %%xmm1,%%xmm5 \n"
3653 "psrlw $0x7,%%xmm5 \n"
3654 "packuswb %%xmm5,%%xmm5 \n"
3655 "punpcklbw %%xmm5,%%xmm0 \n"
// New byte 2 (weights kARGBToSepiaR) -> 8 bytes in xmm5.
3656 "movdqa (%0),%%xmm5 \n"
3657 "movdqa 0x10(%0),%%xmm1 \n"
3658 "pmaddubsw %%xmm4,%%xmm5 \n"
3659 "pmaddubsw %%xmm4,%%xmm1 \n"
3660 "phaddw %%xmm1,%%xmm5 \n"
3661 "psrlw $0x7,%%xmm5 \n"
3662 "packuswb %%xmm5,%%xmm5 \n"
// Original alpha bytes -> xmm6, interleaved with byte 2 in xmm5.
3663 "movdqa (%0),%%xmm6 \n"
3664 "movdqa 0x10(%0),%%xmm1 \n"
3665 "psrld $0x18,%%xmm6 \n"
3666 "psrld $0x18,%%xmm1 \n"
3667 "packuswb %%xmm1,%%xmm6 \n"
3668 "packuswb %%xmm6,%%xmm6 \n"
3669 "punpcklbw %%xmm6,%%xmm5 \n"
// Combine the (byte0, byte1) and (byte2, alpha) words into whole pixels.
3670 "movdqa %%xmm0,%%xmm1 \n"
3671 "punpcklwd %%xmm5,%%xmm0 \n"
3672 "punpckhwd %%xmm5,%%xmm1 \n"
// The jg below tests the flags of this sub; movdqa and lea leave them alone.
3673 "sub $0x8,%1 \n"
3674 "movdqa %%xmm0,(%0) \n"
3675 "movdqa %%xmm1,0x10(%0) \n"
3676 "lea 0x20(%0),%0 \n"
3677 "jg 1b \n"
3678 : "+r"(dst_argb), // %0
3679 "+r"(width) // %1
3680 : "m"(kARGBToSepiaB), // %2
3681 "m"(kARGBToSepiaG), // %3
3682 "m"(kARGBToSepiaR) // %4
3683 : "memory", "cc"
3684#if defined(__SSE2__)
3685 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3686#endif
3687 );
3688}
3689#endif // HAS_ARGBSEPIAROW_SSSE3
3690
fbarchard@google.come442dc42012-06-18 17:37:09 +00003691#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3692// Transform 8 ARGB pixels (32 bytes) with color matrix.
3693// Same as Sepia except matrix is provided.
// matrix_argb supplies 12 signed bytes: 4 weights for each of the three low
// output bytes, read as dwords at offsets 0, 4 and 8. Sums use saturating
// adds (phaddsw) and an arithmetic shift right by 7, then saturate to 0..255.
// Works in place on dst_argb; the original alpha byte is preserved.
// dst_argb is accessed with movdqa, so it must be 16 byte aligned. The loop
// body runs before width is tested, so width is effectively rounded up to a
// multiple of 8 (minimum one group of 8).
3694void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3695 int width) {
3696 asm volatile (
// Broadcast each 4 byte row of the matrix across a register.
3697 "movd (%2),%%xmm2 \n"
3698 "movd 0x4(%2),%%xmm3 \n"
3699 "movd 0x8(%2),%%xmm4 \n"
3700 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3701 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3702 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003703
3704 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003705 ".p2align 4 \n"
3706 "1: \n"
// New bytes 0 (xmm0) and 1 (xmm5) of each pixel, then interleaved in xmm0.
3707 "movdqa (%0),%%xmm0 \n"
3708 "movdqa 0x10(%0),%%xmm6 \n"
3709 "pmaddubsw %%xmm2,%%xmm0 \n"
3710 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003711 "movdqa (%0),%%xmm5 \n"
3712 "movdqa 0x10(%0),%%xmm1 \n"
3713 "pmaddubsw %%xmm3,%%xmm5 \n"
3714 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003715 "phaddsw %%xmm6,%%xmm0 \n"
3716 "phaddsw %%xmm1,%%xmm5 \n"
3717 "psraw $0x7,%%xmm0 \n"
3718 "psraw $0x7,%%xmm5 \n"
3719 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003720 "packuswb %%xmm5,%%xmm5 \n"
3721 "punpcklbw %%xmm5,%%xmm0 \n"
// New byte 2 of each pixel -> 8 bytes in xmm5.
3722 "movdqa (%0),%%xmm5 \n"
3723 "movdqa 0x10(%0),%%xmm1 \n"
3724 "pmaddubsw %%xmm4,%%xmm5 \n"
3725 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003726 "phaddsw %%xmm1,%%xmm5 \n"
3727 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003728 "packuswb %%xmm5,%%xmm5 \n"
// Original alpha bytes -> xmm6.
3729 "movdqa (%0),%%xmm6 \n"
3730 "movdqa 0x10(%0),%%xmm1 \n"
3731 "psrld $0x18,%%xmm6 \n"
3732 "psrld $0x18,%%xmm1 \n"
3733 "packuswb %%xmm1,%%xmm6 \n"
3734 "packuswb %%xmm6,%%xmm6 \n"
// Combine the (byte0, byte1) and (byte2, alpha) words into whole pixels.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003735 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003736 "punpcklbw %%xmm6,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003737 "punpcklwd %%xmm5,%%xmm0 \n"
3738 "punpckhwd %%xmm5,%%xmm1 \n"
// The jg below tests the flags of this sub; movdqa and lea leave them alone.
3739 "sub $0x8,%1 \n"
3740 "movdqa %%xmm0,(%0) \n"
3741 "movdqa %%xmm1,0x10(%0) \n"
3742 "lea 0x20(%0),%0 \n"
3743 "jg 1b \n"
3744 : "+r"(dst_argb), // %0
3745 "+r"(width) // %1
3746 : "r"(matrix_argb) // %2
3747 : "memory", "cc"
3748#if defined(__SSE2__)
3749 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3750#endif
3751 );
3752}
3753#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3754
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003755#ifdef HAS_ARGBQUANTIZEROW_SSE2
3756// Quantize 4 ARGB pixels (16 bytes) per loop iteration, in place.
3757// Each of the three low bytes c of a pixel becomes
//   ((c * scale) >> 16) * interval_size + interval_offset
// using 16 bit lanes, saturated to 0..255 on packing. The original alpha byte
// is OR-ed back in afterwards.
// NOTE(review): the alpha lane uses the upper 16 bits of each int parameter
// (pshuflw $0x40), so the OR only restores alpha exactly when scale,
// interval_size and interval_offset fit in 16 bits - confirm with callers.
// dst_argb is accessed with movdqa, so it must be 16 byte aligned. The loop
// body runs before width is tested, so width is effectively rounded up to a
// multiple of 4 (minimum one group of 4).
3758void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3759 int interval_offset, int width) {
3760 asm volatile (
3761 "movd %2,%%xmm2 \n"
3762 "movd %3,%%xmm3 \n"
3763 "movd %4,%%xmm4 \n"
// pshuflw $0x40 -> words (w0, w0, w0, w1); pshufd $0x44 repeats that for the
// second pixel held in the register.
3764 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3765 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3766 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3767 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3768 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3769 "pshufd $0x44,%%xmm4,%%xmm4 \n"
// xmm5 = 0 (for byte -> word zero extension), xmm6 = 0xff000000 alpha mask.
3770 "pxor %%xmm5,%%xmm5 \n"
3771 "pcmpeqb %%xmm6,%%xmm6 \n"
3772 "pslld $0x18,%%xmm6 \n"
3773
3774 // 4 pixel loop.
3775 ".p2align 2 \n"
3776 "1: \n"
// xmm0 / xmm1 = low / high 2 pixels as words, times scale (high 16 bits).
3777 "movdqa (%0),%%xmm0 \n"
3778 "punpcklbw %%xmm5,%%xmm0 \n"
3779 "pmulhuw %%xmm2,%%xmm0 \n"
3780 "movdqa (%0),%%xmm1 \n"
3781 "punpckhbw %%xmm5,%%xmm1 \n"
3782 "pmulhuw %%xmm2,%%xmm1 \n"
// Multiply by interval_size, add interval_offset; xmm7 keeps original alpha.
3783 "pmullw %%xmm3,%%xmm0 \n"
3784 "movdqa (%0),%%xmm7 \n"
3785 "pmullw %%xmm3,%%xmm1 \n"
3786 "pand %%xmm6,%%xmm7 \n"
3787 "paddw %%xmm4,%%xmm0 \n"
3788 "paddw %%xmm4,%%xmm1 \n"
3789 "packuswb %%xmm1,%%xmm0 \n"
3790 "por %%xmm7,%%xmm0 \n"
// The jg below tests the flags of this sub; movdqa and lea leave them alone.
3791 "sub $0x4,%1 \n"
3792 "movdqa %%xmm0,(%0) \n"
3793 "lea 0x10(%0),%0 \n"
3794 "jg 1b \n"
3795 : "+r"(dst_argb), // %0
3796 "+r"(width) // %1
3797 : "r"(scale), // %2
3798 "r"(interval_size), // %3
3799 "r"(interval_offset) // %4
3800 : "memory", "cc"
3801#if defined(__SSE2__)
3802 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3803#endif
3804 );
3805}
3806#endif // HAS_ARGBQUANTIZEROW_SSE2
3807
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003808#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
3809// Creates a table of cumulative sums where each value is a sum of all values
3810// above and to the left of the value, inclusive of the value.
// row holds 4 bytes per pixel; cumsum and previous_cumsum hold 4 int32 per
// pixel. For every pixel x the function keeps a running per-channel sum of
// row[0..x] and writes cumsum[x] = running_sum + previous_cumsum[x].
// width counts pixels. The 4 pixel loop is only used when cumsum is 16 byte
// aligned; otherwise every pixel goes through the unaligned 1 pixel loop.
// NOTE(review): the 4 pixel loop also reads previous_cumsum with movdqa, but
// only the alignment of cumsum is tested - callers presumably keep both
// rows equally aligned; confirm.
3811void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
fbarchard@google.com133adc42012-06-12 05:46:49 +00003812 const int32* previous_cumsum, int width) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003813 asm volatile (
// %2 becomes (previous_cumsum - cumsum) so (%1,%2,1) addresses previous_cumsum.
3814 "sub %1,%2 \n"
// xmm0 = running sum (4 x int32), xmm1 = 0 for zero extension.
3815 "pxor %%xmm0,%%xmm0 \n"
3816 "pxor %%xmm1,%%xmm1 \n"
3817 "sub $0x4,%3 \n"
3818 "jl 49f \n"
3819 "test $0xf,%1 \n"
3820 "jne 49f \n"
3821
3822 // 4 pixel loop.
3823 ".p2align 2 \n"
3824 "40: \n"
// Widen 4 pixels (16 bytes) to four registers of 4 x int32 each.
3825 "movdqu (%0),%%xmm2 \n"
3826 "lea 0x10(%0),%0 \n"
3827 "movdqa %%xmm2,%%xmm4 \n"
3828 "punpcklbw %%xmm1,%%xmm2 \n"
3829 "movdqa %%xmm2,%%xmm3 \n"
3830 "punpcklwd %%xmm1,%%xmm2 \n"
3831 "punpckhwd %%xmm1,%%xmm3 \n"
3832 "punpckhbw %%xmm1,%%xmm4 \n"
3833 "movdqa %%xmm4,%%xmm5 \n"
3834 "punpcklwd %%xmm1,%%xmm4 \n"
3835 "punpckhwd %%xmm1,%%xmm5 \n"
// For each pixel in turn: add it to the running sum, then add the matching
// previous_cumsum entry to form the output.
3836 "paddd %%xmm2,%%xmm0 \n"
3837 "movdqa (%1,%2,1),%%xmm2 \n"
3838 "paddd %%xmm0,%%xmm2 \n"
3839 "paddd %%xmm3,%%xmm0 \n"
3840 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
3841 "paddd %%xmm0,%%xmm3 \n"
3842 "paddd %%xmm4,%%xmm0 \n"
3843 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
3844 "paddd %%xmm0,%%xmm4 \n"
3845 "paddd %%xmm5,%%xmm0 \n"
3846 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
3847 "paddd %%xmm0,%%xmm5 \n"
3848 "movdqa %%xmm2,(%1) \n"
3849 "movdqa %%xmm3,0x10(%1) \n"
3850 "movdqa %%xmm4,0x20(%1) \n"
3851 "movdqa %%xmm5,0x30(%1) \n"
3852 "lea 0x40(%1),%1 \n"
3853 "sub $0x4,%3 \n"
3854 "jge 40b \n"
3855
3856 "49: \n"
// %3 is (remaining - 4) here; adding 3 gives (remaining - 1), which is
// negative exactly when no pixels are left.
3857 "add $0x3,%3 \n"
3858 "jl 19f \n"
3859
3860 // 1 pixel loop.
3861 ".p2align 2 \n"
3862 "10: \n"
3863 "movd (%0),%%xmm2 \n"
3864 "lea 0x4(%0),%0 \n"
fbarchard@google.comf38aefe2012-06-12 00:29:29 +00003865 "punpcklbw %%xmm1,%%xmm2 \n"
3866 "punpcklwd %%xmm1,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003867 "paddd %%xmm2,%%xmm0 \n"
3868 "movdqu (%1,%2,1),%%xmm2 \n"
3869 "paddd %%xmm0,%%xmm2 \n"
3870 "movdqu %%xmm2,(%1) \n"
3871 "lea 0x10(%1),%1 \n"
3872 "sub $0x1,%3 \n"
3873 "jge 10b \n"
3874
3875 "19: \n"
3876 : "+r"(row), // %0
3877 "+r"(cumsum), // %1
3878 "+r"(previous_cumsum), // %2
3879 "+r"(width) // %3
3880 :
3881 : "memory", "cc"
3882#if defined(__SSE2__)
3883 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3884#endif
3885 );
3886}
3887#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
3888
3889#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts cumulative sums into averaged pixels. For each of count pixels i
// (4 int32 channels per pixel in topleft/botleft, 4 bytes per pixel in dst):
//   sum = topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
//   dst = sum * (1 / area), converted to integer and saturated to 0..255.
// width is an offset measured in int32 elements (it is scaled by 4 bytes).
// 1 / area is computed with rcpss, which is an approximate reciprocal.
// topleft and botleft are read with movdqa and with SSE memory operands, so
// they (and the addresses offset by width) must be 16 byte aligned; dst is
// written with unaligned stores.
3890void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3891 int width, int area, uint8* dst, int count) {
3892 asm volatile (
// xmm4 = approximate 1.0f / area broadcast to all 4 lanes.
3893 "movd %5,%%xmm4 \n"
3894 "cvtdq2ps %%xmm4,%%xmm4 \n"
3895 "rcpss %%xmm4,%%xmm4 \n"
3896 "pshufd $0x0,%%xmm4,%%xmm4 \n"
3897 "sub $0x4,%3 \n"
3898 "jl 49f \n"
3899
3900 // 4 pixel loop.
3901 ".p2align 2 \n"
3902 "40: \n"
3903 "movdqa (%0),%%xmm0 \n"
3904 "movdqa 0x10(%0),%%xmm1 \n"
3905 "movdqa 0x20(%0),%%xmm2 \n"
3906 "movdqa 0x30(%0),%%xmm3 \n"
3907 "psubd (%0,%4,4),%%xmm0 \n"
3908 "psubd 0x10(%0,%4,4),%%xmm1 \n"
3909 "psubd 0x20(%0,%4,4),%%xmm2 \n"
3910 "psubd 0x30(%0,%4,4),%%xmm3 \n"
3911 "lea 0x40(%0),%0 \n"
3912 "psubd (%1),%%xmm0 \n"
3913 "psubd 0x10(%1),%%xmm1 \n"
3914 "psubd 0x20(%1),%%xmm2 \n"
3915 "psubd 0x30(%1),%%xmm3 \n"
3916 "paddd (%1,%4,4),%%xmm0 \n"
3917 "paddd 0x10(%1,%4,4),%%xmm1 \n"
3918 "paddd 0x20(%1,%4,4),%%xmm2 \n"
3919 "paddd 0x30(%1,%4,4),%%xmm3 \n"
3920 "lea 0x40(%1),%1 \n"
// int32 -> float, scale by 1 / area, float -> int32 (rounded).
3921 "cvtdq2ps %%xmm0,%%xmm0 \n"
3922 "cvtdq2ps %%xmm1,%%xmm1 \n"
3923 "mulps %%xmm4,%%xmm0 \n"
3924 "mulps %%xmm4,%%xmm1 \n"
3925 "cvtdq2ps %%xmm2,%%xmm2 \n"
3926 "cvtdq2ps %%xmm3,%%xmm3 \n"
3927 "mulps %%xmm4,%%xmm2 \n"
3928 "mulps %%xmm4,%%xmm3 \n"
3929 "cvtps2dq %%xmm0,%%xmm0 \n"
3930 "cvtps2dq %%xmm1,%%xmm1 \n"
3931 "cvtps2dq %%xmm2,%%xmm2 \n"
3932 "cvtps2dq %%xmm3,%%xmm3 \n"
// Narrow 16 int32 values to 16 bytes with saturation.
3933 "packssdw %%xmm1,%%xmm0 \n"
3934 "packssdw %%xmm3,%%xmm2 \n"
3935 "packuswb %%xmm2,%%xmm0 \n"
3936 "movdqu %%xmm0,(%2) \n"
3937 "lea 0x10(%2),%2 \n"
3938 "sub $0x4,%3 \n"
3939 "jge 40b \n"
3940
3941 "49: \n"
// %3 is (remaining - 4) here; adding 3 gives (remaining - 1), which is
// negative exactly when no pixels are left.
3942 "add $0x3,%3 \n"
3943 "jl 19f \n"
3944
3945 // 1 pixel loop.
3946 ".p2align 2 \n"
3947 "10: \n"
3948 "movdqa (%0),%%xmm0 \n"
3949 "psubd (%0,%4,4),%%xmm0 \n"
3950 "lea 0x10(%0),%0 \n"
3951 "psubd (%1),%%xmm0 \n"
3952 "paddd (%1,%4,4),%%xmm0 \n"
3953 "lea 0x10(%1),%1 \n"
3954 "cvtdq2ps %%xmm0,%%xmm0 \n"
3955 "mulps %%xmm4,%%xmm0 \n"
3956 "cvtps2dq %%xmm0,%%xmm0 \n"
3957 "packssdw %%xmm0,%%xmm0 \n"
3958 "packuswb %%xmm0,%%xmm0 \n"
3959 "movd %%xmm0,(%2) \n"
3960 "lea 0x4(%2),%2 \n"
3961 "sub $0x1,%3 \n"
3962 "jge 10b \n"
3963 "19: \n"
3964 : "+r"(topleft), // %0
3965 "+r"(botleft), // %1
3966 "+r"(dst), // %2
3967 "+rm"(count) // %3
3968 : "r"(static_cast<intptr_t>(width)), // %4
3969 "rm"(area) // %5
3970 : "memory", "cc"
3971#if defined(__SSE2__)
3972 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3973#endif
3974 );
3975}
3976#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
fbarchard@google.com6398e1d2012-07-11 19:12:32 +00003977#ifdef HAS_ARGBSHADE_SSE2
3978// Shade 4 pixels at a time by specified value.
3979// Aligned to 16 bytes.
// Every byte of a pixel (including byte 3) is multiplied by the byte at the
// same position in value: both are widened by duplicating the byte into a
// word, multiplied keeping the high 16 bits, then shifted right by 8.
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte
// aligned. The loop body runs before width is tested, so width is effectively
// rounded up to a multiple of 4 (minimum one group of 4).
3980void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3981 uint32 value) {
3982 asm volatile (
3983 "movd %3,%%xmm2 \n"
// %1 becomes (dst - src) so that (%0,%1,1) addresses the destination.
3984 "sub %0,%1 \n"
// xmm2 = the 4 bytes of value duplicated into words, repeated for 2 pixels.
3985 "punpcklbw %%xmm2,%%xmm2 \n"
3986 "punpcklqdq %%xmm2,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003987
fbarchard@google.com6398e1d2012-07-11 19:12:32 +00003988 // 4 pixel loop.
3989 ".p2align 2 \n"
3990 "1: \n"
3991 "movdqa (%0),%%xmm0 \n"
3992 "movdqa %%xmm0,%%xmm1 \n"
3993 "punpcklbw %%xmm0,%%xmm0 \n"
3994 "punpckhbw %%xmm1,%%xmm1 \n"
3995 "pmulhuw %%xmm2,%%xmm0 \n"
3996 "pmulhuw %%xmm2,%%xmm1 \n"
3997 "psrlw $0x8,%%xmm0 \n"
3998 "psrlw $0x8,%%xmm1 \n"
3999 "packuswb %%xmm1,%%xmm0 \n"
// The jg below tests the flags of this sub; movdqa and lea leave them alone.
4000 "sub $0x4,%2 \n"
4001 "movdqa %%xmm0,(%0,%1,1) \n"
4002 "lea 0x10(%0),%0 \n"
4003 "jg 1b \n"
4004 : "+r"(src_argb), // %0
4005 "+r"(dst_argb), // %1
4006 "+r"(width) // %2
4007 : "r"(value) // %3
4008 : "memory", "cc"
4009#if defined(__SSE2__)
4010 , "xmm0", "xmm1", "xmm2"
4011#endif
4012 );
4013}
4014#endif // HAS_ARGBSHADE_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004015
fbarchard@google.com73444402012-08-09 17:33:29 +00004016#ifdef HAS_ARGBAFFINEROW_SSE2
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004017// TODO(fbarchard): Find 64 bit way to avoid masking.
4018// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
fbarchard@google.com73444402012-08-09 17:33:29 +00004019// Copy ARGB pixels from source image with slope to a row of destination.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004020// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00004021// an error if movq is used. movd %%xmm0,%1
//
// uv_dudv points at 4 floats: a start coordinate pair at [0..1] and a per
// pixel step pair at [2..3]. For every destination pixel the current pair
// is truncated to integers (x, y), one 4 byte pixel is copied from
// src_argb + x * 4 + y * src_argb_stride, and the step is added.
// The offset is formed with 16 bit multiplies (packssdw + pmaddwd), so the
// coordinates and src_argb_stride are presumably expected to fit in a signed
// 16 bit value - confirm with callers. width counts destination pixels.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004022
fbarchard@google.comfc7314e2012-09-27 02:17:51 +00004023LIBYUV_API
fbarchard@google.com73444402012-08-09 17:33:29 +00004024void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4025 uint8* dst_argb, const float* uv_dudv, int width) {
// Pointer sized copy of the stride; also reused as a scratch offset register.
4026 intptr_t src_argb_stride_temp = src_argb_stride;
// Second scratch offset register.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004027 intptr_t temp = 0;
fbarchard@google.com73444402012-08-09 17:33:29 +00004028 asm volatile (
// xmm2 = start pair, xmm7 = step pair.
4029 "movq (%3),%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004030 "movq 0x8(%3),%%xmm7 \n"
// xmm5 low dword = (stride << 16) + 4, i.e. the word pair (4, stride) that
// pmaddwd multiplies with (x, y) to get the byte offset.
fbarchard@google.com73444402012-08-09 17:33:29 +00004031 "shl $0x10,%1 \n"
4032 "add $0x4,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004033 "movd %1,%%xmm5 \n"
4034 "sub $0x4,%4 \n"
4035 "jl 49f \n"
4036
// Set up for 4 pixels per iteration: xmm2 = coordinates of pixels 0 and 1,
// xmm3 = coordinates of pixels 2 and 3, xmm4 = 4 * step.
4037 "pshufd $0x44,%%xmm7,%%xmm7 \n"
4038 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004039 "movdqa %%xmm2,%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004040 "addps %%xmm7,%%xmm0 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004041 "movlhps %%xmm0,%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004042 "movdqa %%xmm7,%%xmm4 \n"
4043 "addps %%xmm4,%%xmm4 \n"
4044 "movdqa %%xmm2,%%xmm3 \n"
4045 "addps %%xmm4,%%xmm3 \n"
4046 "addps %%xmm4,%%xmm4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004047
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004048 // 4 pixel loop.
4049 ".p2align 4 \n"
4050 "40: \n"
// xmm0 = 4 byte offsets, one per pixel.
4051 "cvttps2dq %%xmm2,%%xmm0 \n"
4052 "cvttps2dq %%xmm3,%%xmm1 \n"
4053 "packssdw %%xmm1,%%xmm0 \n"
4054 "pmaddwd %%xmm5,%%xmm0 \n"
// Extract offsets 0 and 1 into %1 and %5. On x86_64 both come out of one
// 64 bit move; the low half is masked and the high half shifted down.
4055#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004056 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004057 "mov %1,%5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004058 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004059 "shr $32,%5 \n"
4060 "pshufd $0xEE,%%xmm0,%%xmm0 \n"
4061#else
4062 "movd %%xmm0,%1 \n"
4063 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4064 "movd %%xmm0,%5 \n"
4065 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4066#endif
// Fetch pixels 0 and 1 and store them as one 8 byte write.
4067 "movd (%0,%1,1),%%xmm1 \n"
4068 "movd (%0,%5,1),%%xmm6 \n"
4069 "punpckldq %%xmm6,%%xmm1 \n"
4070 "addps %%xmm4,%%xmm2 \n"
4071 "movq %%xmm1,(%2) \n"
// Extract offsets 2 and 3 the same way.
4072#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004073 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004074 "mov %1,%5 \n"
4075 "and $0x0fffffff,%1 \n"
4076 "shr $32,%5 \n"
4077#else
4078 "movd %%xmm0,%1 \n"
4079 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4080 "movd %%xmm0,%5 \n"
4081#endif
// Fetch pixels 2 and 3 and store them in the second 8 bytes.
fbarchard@google.com73444402012-08-09 17:33:29 +00004082 "movd (%0,%1,1),%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004083 "movd (%0,%5,1),%%xmm6 \n"
4084 "punpckldq %%xmm6,%%xmm0 \n"
4085 "addps %%xmm4,%%xmm3 \n"
// The jge below tests the flags of this sub; movq and lea leave them alone.
4086 "sub $0x4,%4 \n"
4087 "movq %%xmm0,0x08(%2) \n"
4088 "lea 0x10(%2),%2 \n"
4089 "jge 40b \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004090
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004091 "49: \n"
// %4 is (remaining - 4) here; adding 3 gives (remaining - 1), which is
// negative exactly when no pixels are left.
4092 "add $0x3,%4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004093 "jl 19f \n"
4094
4095 // 1 pixel loop.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004096 ".p2align 4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004097 "10: \n"
// Only the low dword of the pmaddwd result (pixel 0's offset) is used here.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004098 "cvttps2dq %%xmm2,%%xmm0 \n"
4099 "packssdw %%xmm0,%%xmm0 \n"
4100 "pmaddwd %%xmm5,%%xmm0 \n"
4101 "addps %%xmm7,%%xmm2 \n"
4102 "movd %%xmm0,%1 \n"
4103#if defined(__x86_64__)
fbarchard@google.com73444402012-08-09 17:33:29 +00004104 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004105#endif
fbarchard@google.com73444402012-08-09 17:33:29 +00004106 "movd (%0,%1,1),%%xmm0 \n"
4107 "sub $0x1,%4 \n"
4108 "movd %%xmm0,(%2) \n"
4109 "lea 0x4(%2),%2 \n"
4110 "jge 10b \n"
4111 "19: \n"
4112 : "+r"(src_argb), // %0
4113 "+r"(src_argb_stride_temp), // %1
4114 "+r"(dst_argb), // %2
4115 "+r"(uv_dudv), // %3
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004116 "+rm"(width), // %4
4117 "+r"(temp) // %5
fbarchard@google.com73444402012-08-09 17:33:29 +00004118 :
4119 : "memory", "cc"
4120#if defined(__SSE2__)
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004121 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
fbarchard@google.com73444402012-08-09 17:33:29 +00004122#endif
4123 );
4124}
4125#endif // HAS_ARGBAFFINEROW_SSE2
4126
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004127// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
// Blends the row at src_ptr with the row at src_ptr + src_stride into
// dst_ptr, 16 bytes (4 pixels) per iteration. With f = source_y_fraction >> 1:
//   f == 0    : the first row is copied unchanged,
//   f == 0x40 : the two rows are averaged with pavgb,
//   otherwise : dst = (row0 * (128 - f) + row1 * f) >> 7 per byte.
// All loads and stores use movdqa, so src_ptr, src_ptr + src_stride and
// dst_ptr must be 16 byte aligned. Each loop body runs before dst_width is
// tested, so dst_width is effectively rounded up to a multiple of 4.
4128void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4129 ptrdiff_t src_stride, int dst_width,
4130 int source_y_fraction) {
4131 asm volatile (
// %0 becomes (dst - src) so that (%1,%0,1) addresses the destination.
4132 "sub %1,%0 \n"
// Halve the fraction and pick one of the three paths.
4133 "shr %3 \n"
4134 "cmp $0x0,%3 \n"
4135 "je 2f \n"
4136 "cmp $0x40,%3 \n"
4137 "je 3f \n"
// xmm5 = byte pair (128 - f, f) broadcast to every word.
4138 "movd %3,%%xmm0 \n"
4139 "neg %3 \n"
4140 "add $0x80,%3 \n"
4141 "movd %3,%%xmm5 \n"
4142 "punpcklbw %%xmm0,%%xmm5 \n"
4143 "punpcklwd %%xmm5,%%xmm5 \n"
4144 "pshufd $0x0,%%xmm5,%%xmm5 \n"
// General blend loop: interleave the two rows byte by byte and apply the
// weight pair with pmaddubsw.
4145 ".p2align 4 \n"
4146 "1: \n"
4147 "movdqa (%1),%%xmm0 \n"
4148 "movdqa (%1,%4,1),%%xmm2 \n"
4149 "movdqa %%xmm0,%%xmm1 \n"
4150 "punpcklbw %%xmm2,%%xmm0 \n"
4151 "punpckhbw %%xmm2,%%xmm1 \n"
4152 "pmaddubsw %%xmm5,%%xmm0 \n"
4153 "pmaddubsw %%xmm5,%%xmm1 \n"
4154 "psrlw $0x7,%%xmm0 \n"
4155 "psrlw $0x7,%%xmm1 \n"
4156 "packuswb %%xmm1,%%xmm0 \n"
4157 "sub $0x4,%2 \n"
4158 "movdqa %%xmm0,(%1,%0,1) \n"
4159 "lea 0x10(%1),%1 \n"
4160 "jg 1b \n"
4161 "jmp 4f \n"
// Copy loop (f == 0).
4162 ".p2align 4 \n"
4163 "2: \n"
4164 "movdqa (%1),%%xmm0 \n"
4165 "sub $0x4,%2 \n"
4166 "movdqa %%xmm0,(%1,%0,1) \n"
4167 "lea 0x10(%1),%1 \n"
4168 "jg 2b \n"
4169 "jmp 4f \n"
// Average loop (f == 0x40).
4170 ".p2align 4 \n"
4171 "3: \n"
4172 "movdqa (%1),%%xmm0 \n"
4173 "pavgb (%1,%4,1),%%xmm0 \n"
4174 "sub $0x4,%2 \n"
4175 "movdqa %%xmm0,(%1,%0,1) \n"
4176 "lea 0x10(%1),%1 \n"
4177 "jg 3b \n"
4178 "4: \n"
4179 ".p2align 4 \n"
4180 : "+r"(dst_ptr), // %0
4181 "+r"(src_ptr), // %1
4182 "+r"(dst_width), // %2
4183 "+r"(source_y_fraction) // %3
4184 : "r"(static_cast<intptr_t>(src_stride)) // %4
4185 : "memory", "cc"
4186#if defined(__SSE2__)
4187 , "xmm0", "xmm1", "xmm2", "xmm5"
4188#endif
4189 );
4190}
4191
// Averages two rows byte by byte with pavgb: the row at src_uv and the row
// at src_uv + src_uv_stride, writing 16 bytes per iteration to dst_uv.
// pix counts bytes. All accesses are 16 byte aligned (movdqa and an SSE
// memory operand), and the loop body runs before pix is tested, so pix is
// effectively rounded up to a multiple of 16 (minimum 16).
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004192void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4193 uint8* dst_uv, int pix) {
4194 asm volatile (
// %1 becomes (dst - src) so that (%0,%1) addresses the destination.
4195 "sub %0,%1 \n"
4196 ".p2align 4 \n"
4197 "1: \n"
4198 "movdqa (%0),%%xmm0 \n"
4199 "pavgb (%0,%3),%%xmm0 \n"
// The jg below tests the flags of this sub; movdqa and lea leave them alone.
4200 "sub $0x10,%2 \n"
4201 "movdqa %%xmm0,(%0,%1) \n"
4202 "lea 0x10(%0),%0 \n"
4203 "jg 1b \n"
4204 : "+r"(src_uv), // %0
4205 "+r"(dst_uv), // %1
4206 "+r"(pix) // %2
4207 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4208 : "memory", "cc"
4209#if defined(__SSE2__)
4210 , "xmm0"
4211#endif
4212 );
4213}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004214
// Produces a row of output bytes by picking bytes out of the source row with
// pshufb. selector is copied into all four 32-bit lanes of xmm5 and used as the
// shuffle control, but only the low 4 bytes of each shuffled vector are
// stored. So each of the 4 selector bytes (lowest byte first) chooses which of
// the 16 loaded source bytes becomes the corresponding output byte.
//   src_argb  - input; 16 bytes are loaded per pass with movdqa, which
//               requires a 16-byte aligned address. Presumably 4 ARGB pixels
//               per pass, going by the name - confirm against callers.
//   dst_bayer - output; 4 bytes are stored per pass with movd.
//   selector  - four pshufb control bytes packed into one uint32.
//   pix       - number of output bytes. It is reduced by 4 per pass and tested
//               only after a pass, so the loop always runs at least once.
4215void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4216 uint32 selector, int pix) {
4217 asm volatile (
4218 "movd %3,%%xmm5 \n"  // selector into the low 32 bits of xmm5.
4219 "pshufd $0x0,%%xmm5,%%xmm5 \n"  // Broadcast it to all four lanes.
4220 ".p2align 4 \n"
4221 "1: \n"
4222 "movdqa (%0),%%xmm0 \n"  // Load 16 source bytes.
4223 "lea 0x10(%0),%0 \n"  // Advance the source pointer by 16.
4224 "pshufb %%xmm5,%%xmm0 \n"  // Gather the selected bytes.
4225 "sub $0x4,%2 \n"  // Count down; sets the flags that jg tests below.
4226 "movd %%xmm0,(%1) \n"  // Store the low 4 bytes (flags untouched).
4227 "lea 0x4(%1),%1 \n"  // Advance the destination pointer by 4.
4228 "jg 1b \n"  // Repeat while the remaining count is greater than zero.
4229 : "+r"(src_argb), // %0
4230 "+r"(dst_bayer), // %1
4231 "+r"(pix) // %2
4232 : "g"(selector) // %3
4233 : "memory", "cc"
4234#if defined(__SSE2__)
4235 , "xmm0", "xmm5"  // Declared only when the compiler targets SSE2.
4236#endif
4237 );
4238}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004239
// Packs separate Y, U and V rows into one interleaved row whose byte order is
// Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... Each pass consumes 16 Y bytes, 8 U bytes and
// 8 V bytes and writes 32 output bytes.
//   src_y     - Y row; loaded with movdqa, which requires a 16-byte aligned
//               address.
//   src_u     - U row; read 8 bytes at a time with movq.
//   src_v     - V row; read 8 bytes at a time with movq.
//   dst_frame - output row; stored with movdqa (16-byte aligned).
//   width     - number of Y bytes to convert. It is reduced by 16 per pass and
//               tested only after a pass, so the loop always runs at least
//               once.
4240void I422ToYUY2Row_SSE2(const uint8* src_y,
4241 const uint8* src_u,
4242 const uint8* src_v,
4243 uint8* dst_frame, int width) {
4244 asm volatile (
4245 "sub %1,%2 \n"  // %2 = src_v - src_u, so (%1,%2,1) addresses V.
4246 ".p2align 4 \n"
4247 "1: \n"
4248 "movq (%1),%%xmm2 \n"  // 8 U bytes.
4249 "movq (%1,%2,1),%%xmm3 \n"  // 8 V bytes.
4250 "lea 0x8(%1),%1 \n"  // Advance U; V follows via the offset in %2.
4251 "punpcklbw %%xmm3,%%xmm2 \n"  // xmm2 = U0 V0 U1 V1 ... U7 V7.
4252 "movdqa (%0),%%xmm0 \n"  // 16 Y bytes.
4253 "lea 0x10(%0),%0 \n"  // Advance the Y pointer by 16.
4254 "movdqa %%xmm0,%%xmm1 \n"  // Keep a copy of Y for the high half.
4255 "punpcklbw %%xmm2,%%xmm0 \n"  // Y0 U0 Y1 V0 ... from the low halves.
4256 "punpckhbw %%xmm2,%%xmm1 \n"  // Y8 U4 Y9 V4 ... from the high halves.
4257 "movdqa %%xmm0,(%3) \n"  // First 16 output bytes.
4258 "movdqa %%xmm1,0x10(%3) \n"  // Second 16 output bytes.
4259 "lea 0x20(%3),%3 \n"  // Advance the output pointer by 32.
4260 "sub $0x10,%4 \n"  // Count down; sets the flags that jg tests.
4261 "jg 1b \n"  // Repeat while the remaining count is greater than zero.
4262 : "+r"(src_y), // %0
4263 "+r"(src_u), // %1
4264 "+r"(src_v), // %2
4265 "+r"(dst_frame), // %3
4266 "+rm"(width) // %4
4267 :
4268 : "memory", "cc"
4269#if defined(__SSE2__)
4270 , "xmm0", "xmm1", "xmm2", "xmm3"  // Only when targeting SSE2.
4271#endif
4272 );
4273}
4274
// Packs separate Y, U and V rows into one interleaved row whose byte order is
// U0 Y0 V0 Y1 U1 Y2 V1 Y3 ... Each pass consumes 16 Y bytes, 8 U bytes and
// 8 V bytes and writes 32 output bytes.
//   src_y     - Y row; loaded with movdqa, which requires a 16-byte aligned
//               address.
//   src_u     - U row; read 8 bytes at a time with movq.
//   src_v     - V row; read 8 bytes at a time with movq.
//   dst_frame - output row; stored with movdqa (16-byte aligned).
//   width     - number of Y bytes to convert. It is reduced by 16 per pass and
//               tested only after a pass, so the loop always runs at least
//               once.
4275void I422ToUYVYRow_SSE2(const uint8* src_y,
4276 const uint8* src_u,
4277 const uint8* src_v,
4278 uint8* dst_frame, int width) {
4279 asm volatile (
4280 "sub %1,%2 \n"  // %2 = src_v - src_u, so (%1,%2,1) addresses V.
4281 ".p2align 4 \n"
4282 "1: \n"
4283 "movq (%1),%%xmm2 \n"  // 8 U bytes.
4284 "movq (%1,%2,1),%%xmm3 \n"  // 8 V bytes.
4285 "lea 0x8(%1),%1 \n"  // Advance U; V follows via the offset in %2.
4286 "punpcklbw %%xmm3,%%xmm2 \n"  // xmm2 = U0 V0 U1 V1 ... U7 V7.
4287 "movdqa (%0),%%xmm0 \n"  // 16 Y bytes.
4288 "movdqa %%xmm2,%%xmm1 \n"  // Copy of UV; chroma leads in the output.
4289 "lea 0x10(%0),%0 \n"  // Advance the Y pointer by 16.
4290 "punpcklbw %%xmm0,%%xmm1 \n"  // U0 Y0 V0 Y1 ... from the low halves.
4291 "punpckhbw %%xmm0,%%xmm2 \n"  // U4 Y8 V4 Y9 ... from the high halves.
4292 "movdqa %%xmm1,(%3) \n"  // First 16 output bytes.
4293 "movdqa %%xmm2,0x10(%3) \n"  // Second 16 output bytes.
4294 "lea 0x20(%3),%3 \n"  // Advance the output pointer by 32.
4295 "sub $0x10,%4 \n"  // Count down; sets the flags that jg tests.
4296 "jg 1b \n"  // Repeat while the remaining count is greater than zero.
4297 : "+r"(src_y), // %0
4298 "+r"(src_u), // %1
4299 "+r"(src_v), // %2
4300 "+r"(dst_frame), // %3
4301 "+rm"(width) // %4
4302 :
4303 : "memory", "cc"
4304#if defined(__SSE2__)
4305 , "xmm0", "xmm1", "xmm2", "xmm3"  // Only when targeting SSE2.
4306#endif
4307 );
4308}
4309
fbarchard@google.com2d11d432012-02-16 02:50:39 +00004310#endif // defined(__x86_64__) || defined(__i386__)
4311
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004312#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00004313} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004314} // namespace libyuv
4315#endif