blob: 920a8c404a699cfc34a1f034180497f5c86b5eb5 [file] [log] [blame]
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
// CONST is the storage/qualifier prefix used for every lookup table in this
// file: empty on Apple targets, "static const" everywhere else.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
fbarchard@google.com4de0c432012-10-11 01:25:46 +000072// Constants for RGBA.
73CONST vec8 kRGBAToY = {
74 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
75};
76
77CONST vec8 kRGBAToU = {
78 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
79};
80
81CONST vec8 kRGBAToV = {
82 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
83};
84
fbarchard@google.com714050a2012-02-17 22:59:56 +000085CONST uvec8 kAddY16 = {
86 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000087};
fbarchard@google.com2430e042011-11-11 21:57:06 +000088
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000089CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
92};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000093
fbarchard@google.comba1f5262012-01-12 19:22:41 +000094// Shuffle table for converting RGB24 to ARGB.
95CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000096 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
97};
98
99// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000100CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000101 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
102};
103
fbarchard@google.comb6149762011-11-07 21:58:52 +0000104// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000105CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000106 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
107};
108
109// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000110CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
112};
113
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000114// Shuffle table for converting RGBA to ARGB.
115CONST uvec8 kShuffleMaskRGBAToARGB = {
116 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
117};
118
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000119// Shuffle table for converting ARGB to RGBA.
120CONST uvec8 kShuffleMaskARGBToRGBA = {
121 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
122};
123
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000124// Shuffle table for converting ARGB to RGB24.
125CONST uvec8 kShuffleMaskARGBToRGB24 = {
126 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
127};
128
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000129// Shuffle table for converting ARGB to RAW.
130CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000131 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000132};
133
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000134// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000135CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
137};
138
139// Shuffle table for converting ARGB to RAW.
140CONST uvec8 kShuffleMaskARGBToRAW_0 = {
141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142};
143
fbarchard@google.comb6149762011-11-07 21:58:52 +0000144void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000145 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pcmpeqb %%xmm5,%%xmm5 \n"
147 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000148 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "1: \n"
150 "movq (%0),%%xmm0 \n"
151 "lea 0x8(%0),%0 \n"
152 "punpcklbw %%xmm0,%%xmm0 \n"
153 "movdqa %%xmm0,%%xmm1 \n"
154 "punpcklwd %%xmm0,%%xmm0 \n"
155 "punpckhwd %%xmm1,%%xmm1 \n"
156 "por %%xmm5,%%xmm0 \n"
157 "por %%xmm5,%%xmm1 \n"
158 "movdqa %%xmm0,(%1) \n"
159 "movdqa %%xmm1,0x10(%1) \n"
160 "lea 0x20(%1),%1 \n"
161 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000162 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163 : "+r"(src_y), // %0
164 "+r"(dst_argb), // %1
165 "+r"(pix) // %2
166 :
167 : "memory", "cc"
168#if defined(__SSE2__)
169 , "xmm0", "xmm1", "xmm5"
170#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000171 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000172}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173
fbarchard@google.com00b69a22012-11-02 06:03:28 +0000174void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
175 int pix) {
176 asm volatile (
177 "pcmpeqb %%xmm5,%%xmm5 \n"
178 "pslld $0x18,%%xmm5 \n"
179 ".p2align 4 \n"
180 "1: \n"
181 "movq (%0),%%xmm0 \n"
182 "lea 0x8(%0),%0 \n"
183 "punpcklbw %%xmm0,%%xmm0 \n"
184 "movdqa %%xmm0,%%xmm1 \n"
185 "punpcklwd %%xmm0,%%xmm0 \n"
186 "punpckhwd %%xmm1,%%xmm1 \n"
187 "por %%xmm5,%%xmm0 \n"
188 "por %%xmm5,%%xmm1 \n"
189 "movdqu %%xmm0,(%1) \n"
190 "movdqu %%xmm1,0x10(%1) \n"
191 "lea 0x20(%1),%1 \n"
192 "sub $0x8,%2 \n"
193 "jg 1b \n"
194 : "+r"(src_y), // %0
195 "+r"(dst_argb), // %1
196 "+r"(pix) // %2
197 :
198 : "memory", "cc"
199#if defined(__SSE2__)
200 , "xmm0", "xmm1", "xmm5"
201#endif
202 );
203}
204
fbarchard@google.comb6149762011-11-07 21:58:52 +0000205void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000206 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000207 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000208 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000209 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000210 "1: \n"
211 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000212 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000213 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000214 "movdqa %%xmm0,(%0,%1,1) \n"
215 "lea 0x10(%0),%0 \n"
216 "jg 1b \n"
217
fbarchard@google.comb6149762011-11-07 21:58:52 +0000218 : "+r"(src_abgr), // %0
219 "+r"(dst_argb), // %1
220 "+r"(pix) // %2
221 : "m"(kShuffleMaskABGRToARGB) // %3
222 : "memory", "cc"
223#if defined(__SSE2__)
224 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000225#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000226 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000227}
228
229void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000230 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000231 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000232 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000233 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000234 "1: \n"
235 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000236 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000237 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000238 "movdqa %%xmm0,(%0,%1,1) \n"
239 "lea 0x10(%0),%0 \n"
240 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000241 : "+r"(src_bgra), // %0
242 "+r"(dst_argb), // %1
243 "+r"(pix) // %2
244 : "m"(kShuffleMaskBGRAToARGB) // %3
245 : "memory", "cc"
246#if defined(__SSE2__)
247 , "xmm0", "xmm5"
248#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000249 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000250}
251
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000252void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
253 asm volatile (
254 "movdqa %3,%%xmm5 \n"
255 "sub %0,%1 \n"
256 ".p2align 4 \n"
257 "1: \n"
258 "movdqa (%0),%%xmm0 \n"
259 "pshufb %%xmm5,%%xmm0 \n"
260 "sub $0x4,%2 \n"
261 "movdqa %%xmm0,(%0,%1,1) \n"
262 "lea 0x10(%0),%0 \n"
263 "jg 1b \n"
264
265 : "+r"(src_rgba), // %0
266 "+r"(dst_argb), // %1
267 "+r"(pix) // %2
268 : "m"(kShuffleMaskRGBAToARGB) // %3
269 : "memory", "cc"
270#if defined(__SSE2__)
271 , "xmm0", "xmm5"
272#endif
273 );
274}
275
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000276void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
277 asm volatile (
278 "movdqa %3,%%xmm5 \n"
279 "sub %0,%1 \n"
280 ".p2align 4 \n"
281 "1: \n"
282 "movdqa (%0),%%xmm0 \n"
283 "pshufb %%xmm5,%%xmm0 \n"
284 "sub $0x4,%2 \n"
285 "movdqa %%xmm0,(%0,%1,1) \n"
286 "lea 0x10(%0),%0 \n"
287 "jg 1b \n"
288
289 : "+r"(src_argb), // %0
290 "+r"(dst_rgba), // %1
291 "+r"(pix) // %2
292 : "m"(kShuffleMaskARGBToRGBA) // %3
293 : "memory", "cc"
294#if defined(__SSE2__)
295 , "xmm0", "xmm5"
296#endif
297 );
298}
299
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000300void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000301 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000302 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
303 "pslld $0x18,%%xmm5 \n"
304 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000305 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000306 "1: \n"
307 "movdqu (%0),%%xmm0 \n"
308 "movdqu 0x10(%0),%%xmm1 \n"
309 "movdqu 0x20(%0),%%xmm3 \n"
310 "lea 0x30(%0),%0 \n"
311 "movdqa %%xmm3,%%xmm2 \n"
312 "palignr $0x8,%%xmm1,%%xmm2 \n"
313 "pshufb %%xmm4,%%xmm2 \n"
314 "por %%xmm5,%%xmm2 \n"
315 "palignr $0xc,%%xmm0,%%xmm1 \n"
316 "pshufb %%xmm4,%%xmm0 \n"
317 "movdqa %%xmm2,0x20(%1) \n"
318 "por %%xmm5,%%xmm0 \n"
319 "pshufb %%xmm4,%%xmm1 \n"
320 "movdqa %%xmm0,(%1) \n"
321 "por %%xmm5,%%xmm1 \n"
322 "palignr $0x4,%%xmm3,%%xmm3 \n"
323 "pshufb %%xmm4,%%xmm3 \n"
324 "movdqa %%xmm1,0x10(%1) \n"
325 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000326 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000327 "movdqa %%xmm3,0x30(%1) \n"
328 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000329 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000330 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000331 "+r"(dst_argb), // %1
332 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000333 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000334 : "memory", "cc"
335#if defined(__SSE2__)
336 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
337#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000338 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000339}
340
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000341void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000342 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000343 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
344 "pslld $0x18,%%xmm5 \n"
345 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000346 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000347 "1: \n"
348 "movdqu (%0),%%xmm0 \n"
349 "movdqu 0x10(%0),%%xmm1 \n"
350 "movdqu 0x20(%0),%%xmm3 \n"
351 "lea 0x30(%0),%0 \n"
352 "movdqa %%xmm3,%%xmm2 \n"
353 "palignr $0x8,%%xmm1,%%xmm2 \n"
354 "pshufb %%xmm4,%%xmm2 \n"
355 "por %%xmm5,%%xmm2 \n"
356 "palignr $0xc,%%xmm0,%%xmm1 \n"
357 "pshufb %%xmm4,%%xmm0 \n"
358 "movdqa %%xmm2,0x20(%1) \n"
359 "por %%xmm5,%%xmm0 \n"
360 "pshufb %%xmm4,%%xmm1 \n"
361 "movdqa %%xmm0,(%1) \n"
362 "por %%xmm5,%%xmm1 \n"
363 "palignr $0x4,%%xmm3,%%xmm3 \n"
364 "pshufb %%xmm4,%%xmm3 \n"
365 "movdqa %%xmm1,0x10(%1) \n"
366 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000367 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000368 "movdqa %%xmm3,0x30(%1) \n"
369 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000370 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000371 : "+r"(src_raw), // %0
372 "+r"(dst_argb), // %1
373 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000374 : "m"(kShuffleMaskRAWToARGB) // %3
375 : "memory", "cc"
376#if defined(__SSE2__)
377 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
378#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000379 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000380}
381
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000382void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000383 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000384 "mov $0x1080108,%%eax \n"
385 "movd %%eax,%%xmm5 \n"
386 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000387 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000388 "movd %%eax,%%xmm6 \n"
389 "pshufd $0x0,%%xmm6,%%xmm6 \n"
390 "pcmpeqb %%xmm3,%%xmm3 \n"
391 "psllw $0xb,%%xmm3 \n"
392 "pcmpeqb %%xmm4,%%xmm4 \n"
393 "psllw $0xa,%%xmm4 \n"
394 "psrlw $0x5,%%xmm4 \n"
395 "pcmpeqb %%xmm7,%%xmm7 \n"
396 "psllw $0x8,%%xmm7 \n"
397 "sub %0,%1 \n"
398 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000399 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000400 "1: \n"
401 "movdqu (%0),%%xmm0 \n"
402 "movdqa %%xmm0,%%xmm1 \n"
403 "movdqa %%xmm0,%%xmm2 \n"
404 "pand %%xmm3,%%xmm1 \n"
405 "psllw $0xb,%%xmm2 \n"
406 "pmulhuw %%xmm5,%%xmm1 \n"
407 "pmulhuw %%xmm5,%%xmm2 \n"
408 "psllw $0x8,%%xmm1 \n"
409 "por %%xmm2,%%xmm1 \n"
410 "pand %%xmm4,%%xmm0 \n"
411 "pmulhuw %%xmm6,%%xmm0 \n"
412 "por %%xmm7,%%xmm0 \n"
413 "movdqa %%xmm1,%%xmm2 \n"
414 "punpcklbw %%xmm0,%%xmm1 \n"
415 "punpckhbw %%xmm0,%%xmm2 \n"
416 "movdqa %%xmm1,(%1,%0,2) \n"
417 "movdqa %%xmm2,0x10(%1,%0,2) \n"
418 "lea 0x10(%0),%0 \n"
419 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000420 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000421 : "+r"(src), // %0
422 "+r"(dst), // %1
423 "+r"(pix) // %2
424 :
425 : "memory", "cc", "eax"
426#if defined(__SSE2__)
427 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
428#endif
429 );
430}
431
432void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000433 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000434 "mov $0x1080108,%%eax \n"
435 "movd %%eax,%%xmm5 \n"
436 "pshufd $0x0,%%xmm5,%%xmm5 \n"
437 "mov $0x42004200,%%eax \n"
438 "movd %%eax,%%xmm6 \n"
439 "pshufd $0x0,%%xmm6,%%xmm6 \n"
440 "pcmpeqb %%xmm3,%%xmm3 \n"
441 "psllw $0xb,%%xmm3 \n"
442 "movdqa %%xmm3,%%xmm4 \n"
443 "psrlw $0x6,%%xmm4 \n"
444 "pcmpeqb %%xmm7,%%xmm7 \n"
445 "psllw $0x8,%%xmm7 \n"
446 "sub %0,%1 \n"
447 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000448 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000449 "1: \n"
450 "movdqu (%0),%%xmm0 \n"
451 "movdqa %%xmm0,%%xmm1 \n"
452 "movdqa %%xmm0,%%xmm2 \n"
453 "psllw $0x1,%%xmm1 \n"
454 "psllw $0xb,%%xmm2 \n"
455 "pand %%xmm3,%%xmm1 \n"
456 "pmulhuw %%xmm5,%%xmm2 \n"
457 "pmulhuw %%xmm5,%%xmm1 \n"
458 "psllw $0x8,%%xmm1 \n"
459 "por %%xmm2,%%xmm1 \n"
460 "movdqa %%xmm0,%%xmm2 \n"
461 "pand %%xmm4,%%xmm0 \n"
462 "psraw $0x8,%%xmm2 \n"
463 "pmulhuw %%xmm6,%%xmm0 \n"
464 "pand %%xmm7,%%xmm2 \n"
465 "por %%xmm2,%%xmm0 \n"
466 "movdqa %%xmm1,%%xmm2 \n"
467 "punpcklbw %%xmm0,%%xmm1 \n"
468 "punpckhbw %%xmm0,%%xmm2 \n"
469 "movdqa %%xmm1,(%1,%0,2) \n"
470 "movdqa %%xmm2,0x10(%1,%0,2) \n"
471 "lea 0x10(%0),%0 \n"
472 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000473 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000474 : "+r"(src), // %0
475 "+r"(dst), // %1
476 "+r"(pix) // %2
477 :
478 : "memory", "cc", "eax"
479#if defined(__SSE2__)
480 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
481#endif
482 );
483}
484
485void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000486 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000487 "mov $0xf0f0f0f,%%eax \n"
488 "movd %%eax,%%xmm4 \n"
489 "pshufd $0x0,%%xmm4,%%xmm4 \n"
490 "movdqa %%xmm4,%%xmm5 \n"
491 "pslld $0x4,%%xmm5 \n"
492 "sub %0,%1 \n"
493 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000494 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000495 "1: \n"
496 "movdqu (%0),%%xmm0 \n"
497 "movdqa %%xmm0,%%xmm2 \n"
498 "pand %%xmm4,%%xmm0 \n"
499 "pand %%xmm5,%%xmm2 \n"
500 "movdqa %%xmm0,%%xmm1 \n"
501 "movdqa %%xmm2,%%xmm3 \n"
502 "psllw $0x4,%%xmm1 \n"
503 "psrlw $0x4,%%xmm3 \n"
504 "por %%xmm1,%%xmm0 \n"
505 "por %%xmm3,%%xmm2 \n"
506 "movdqa %%xmm0,%%xmm1 \n"
507 "punpcklbw %%xmm2,%%xmm0 \n"
508 "punpckhbw %%xmm2,%%xmm1 \n"
509 "movdqa %%xmm0,(%1,%0,2) \n"
510 "movdqa %%xmm1,0x10(%1,%0,2) \n"
511 "lea 0x10(%0),%0 \n"
512 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000513 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000514 : "+r"(src), // %0
515 "+r"(dst), // %1
516 "+r"(pix) // %2
517 :
518 : "memory", "cc", "eax"
519#if defined(__SSE2__)
520 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
521#endif
522 );
523}
524
525void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000526 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000527 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000528 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000529 "1: \n"
530 "movdqa (%0),%%xmm0 \n"
531 "movdqa 0x10(%0),%%xmm1 \n"
532 "movdqa 0x20(%0),%%xmm2 \n"
533 "movdqa 0x30(%0),%%xmm3 \n"
534 "lea 0x40(%0),%0 \n"
535 "pshufb %%xmm6,%%xmm0 \n"
536 "pshufb %%xmm6,%%xmm1 \n"
537 "pshufb %%xmm6,%%xmm2 \n"
538 "pshufb %%xmm6,%%xmm3 \n"
539 "movdqa %%xmm1,%%xmm4 \n"
540 "psrldq $0x4,%%xmm1 \n"
541 "pslldq $0xc,%%xmm4 \n"
542 "movdqa %%xmm2,%%xmm5 \n"
543 "por %%xmm4,%%xmm0 \n"
544 "pslldq $0x8,%%xmm5 \n"
545 "movdqa %%xmm0,(%1) \n"
546 "por %%xmm5,%%xmm1 \n"
547 "psrldq $0x8,%%xmm2 \n"
548 "pslldq $0x4,%%xmm3 \n"
549 "por %%xmm3,%%xmm2 \n"
550 "movdqa %%xmm1,0x10(%1) \n"
551 "movdqa %%xmm2,0x20(%1) \n"
552 "lea 0x30(%1),%1 \n"
553 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000554 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000555 : "+r"(src), // %0
556 "+r"(dst), // %1
557 "+r"(pix) // %2
558 : "m"(kShuffleMaskARGBToRGB24) // %3
559 : "memory", "cc"
560#if defined(__SSE2__)
561 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
562#endif
563 );
564}
565
566void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000567 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000568 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000569 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000570 "1: \n"
571 "movdqa (%0),%%xmm0 \n"
572 "movdqa 0x10(%0),%%xmm1 \n"
573 "movdqa 0x20(%0),%%xmm2 \n"
574 "movdqa 0x30(%0),%%xmm3 \n"
575 "lea 0x40(%0),%0 \n"
576 "pshufb %%xmm6,%%xmm0 \n"
577 "pshufb %%xmm6,%%xmm1 \n"
578 "pshufb %%xmm6,%%xmm2 \n"
579 "pshufb %%xmm6,%%xmm3 \n"
580 "movdqa %%xmm1,%%xmm4 \n"
581 "psrldq $0x4,%%xmm1 \n"
582 "pslldq $0xc,%%xmm4 \n"
583 "movdqa %%xmm2,%%xmm5 \n"
584 "por %%xmm4,%%xmm0 \n"
585 "pslldq $0x8,%%xmm5 \n"
586 "movdqa %%xmm0,(%1) \n"
587 "por %%xmm5,%%xmm1 \n"
588 "psrldq $0x8,%%xmm2 \n"
589 "pslldq $0x4,%%xmm3 \n"
590 "por %%xmm3,%%xmm2 \n"
591 "movdqa %%xmm1,0x10(%1) \n"
592 "movdqa %%xmm2,0x20(%1) \n"
593 "lea 0x30(%1),%1 \n"
594 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000595 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000596 : "+r"(src), // %0
597 "+r"(dst), // %1
598 "+r"(pix) // %2
599 : "m"(kShuffleMaskARGBToRAW) // %3
600 : "memory", "cc"
601#if defined(__SSE2__)
602 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
603#endif
604 );
605}
606
fbarchard@google.comdbcabea2012-10-29 21:20:25 +0000607void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000608 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000609 "pcmpeqb %%xmm3,%%xmm3 \n"
610 "psrld $0x1b,%%xmm3 \n"
611 "pcmpeqb %%xmm4,%%xmm4 \n"
612 "psrld $0x1a,%%xmm4 \n"
613 "pslld $0x5,%%xmm4 \n"
614 "pcmpeqb %%xmm5,%%xmm5 \n"
615 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000616 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000617 "1: \n"
618 "movdqa (%0),%%xmm0 \n"
619 "movdqa %%xmm0,%%xmm1 \n"
620 "movdqa %%xmm0,%%xmm2 \n"
621 "pslld $0x8,%%xmm0 \n"
622 "psrld $0x3,%%xmm1 \n"
623 "psrld $0x5,%%xmm2 \n"
624 "psrad $0x10,%%xmm0 \n"
625 "pand %%xmm3,%%xmm1 \n"
626 "pand %%xmm4,%%xmm2 \n"
627 "pand %%xmm5,%%xmm0 \n"
628 "por %%xmm2,%%xmm1 \n"
629 "por %%xmm1,%%xmm0 \n"
630 "packssdw %%xmm0,%%xmm0 \n"
631 "lea 0x10(%0),%0 \n"
632 "movq %%xmm0,(%1) \n"
633 "lea 0x8(%1),%1 \n"
634 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000635 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000636 : "+r"(src), // %0
637 "+r"(dst), // %1
638 "+r"(pix) // %2
639 :
640 : "memory", "cc"
641#if defined(__SSE2__)
642 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
643#endif
644 );
645}
646
647void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000648 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000649 "pcmpeqb %%xmm4,%%xmm4 \n"
650 "psrld $0x1b,%%xmm4 \n"
651 "movdqa %%xmm4,%%xmm5 \n"
652 "pslld $0x5,%%xmm5 \n"
653 "movdqa %%xmm4,%%xmm6 \n"
654 "pslld $0xa,%%xmm6 \n"
655 "pcmpeqb %%xmm7,%%xmm7 \n"
656 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000657 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000658 "1: \n"
659 "movdqa (%0),%%xmm0 \n"
660 "movdqa %%xmm0,%%xmm1 \n"
661 "movdqa %%xmm0,%%xmm2 \n"
662 "movdqa %%xmm0,%%xmm3 \n"
663 "psrad $0x10,%%xmm0 \n"
664 "psrld $0x3,%%xmm1 \n"
665 "psrld $0x6,%%xmm2 \n"
666 "psrld $0x9,%%xmm3 \n"
667 "pand %%xmm7,%%xmm0 \n"
668 "pand %%xmm4,%%xmm1 \n"
669 "pand %%xmm5,%%xmm2 \n"
670 "pand %%xmm6,%%xmm3 \n"
671 "por %%xmm1,%%xmm0 \n"
672 "por %%xmm3,%%xmm2 \n"
673 "por %%xmm2,%%xmm0 \n"
674 "packssdw %%xmm0,%%xmm0 \n"
675 "lea 0x10(%0),%0 \n"
676 "movq %%xmm0,(%1) \n"
677 "lea 0x8(%1),%1 \n"
678 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000679 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000680 : "+r"(src), // %0
681 "+r"(dst), // %1
682 "+r"(pix) // %2
683 :
684 : "memory", "cc"
685#if defined(__SSE2__)
686 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
687#endif
688 );
689}
690
691void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000692 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000693 "pcmpeqb %%xmm4,%%xmm4 \n"
694 "psllw $0xc,%%xmm4 \n"
695 "movdqa %%xmm4,%%xmm3 \n"
696 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000697 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000698 "1: \n"
699 "movdqa (%0),%%xmm0 \n"
700 "movdqa %%xmm0,%%xmm1 \n"
701 "pand %%xmm3,%%xmm0 \n"
702 "pand %%xmm4,%%xmm1 \n"
703 "psrlq $0x4,%%xmm0 \n"
704 "psrlq $0x8,%%xmm1 \n"
705 "por %%xmm1,%%xmm0 \n"
706 "packuswb %%xmm0,%%xmm0 \n"
707 "lea 0x10(%0),%0 \n"
708 "movq %%xmm0,(%1) \n"
709 "lea 0x8(%1),%1 \n"
710 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000711 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000712 : "+r"(src), // %0
713 "+r"(dst), // %1
714 "+r"(pix) // %2
715 :
716 : "memory", "cc"
717#if defined(__SSE2__)
718 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
719#endif
720 );
721}
722
fbarchard@google.comb6149762011-11-07 21:58:52 +0000723void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000724 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000725 "movdqa %4,%%xmm5 \n"
726 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000727 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000728 "1: \n"
729 "movdqa (%0),%%xmm0 \n"
730 "movdqa 0x10(%0),%%xmm1 \n"
731 "movdqa 0x20(%0),%%xmm2 \n"
732 "movdqa 0x30(%0),%%xmm3 \n"
733 "pmaddubsw %%xmm4,%%xmm0 \n"
734 "pmaddubsw %%xmm4,%%xmm1 \n"
735 "pmaddubsw %%xmm4,%%xmm2 \n"
736 "pmaddubsw %%xmm4,%%xmm3 \n"
737 "lea 0x40(%0),%0 \n"
738 "phaddw %%xmm1,%%xmm0 \n"
739 "phaddw %%xmm3,%%xmm2 \n"
740 "psrlw $0x7,%%xmm0 \n"
741 "psrlw $0x7,%%xmm2 \n"
742 "packuswb %%xmm2,%%xmm0 \n"
743 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000744 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000745 "movdqa %%xmm0,(%1) \n"
746 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000747 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000748 : "+r"(src_argb), // %0
749 "+r"(dst_y), // %1
750 "+r"(pix) // %2
751 : "m"(kARGBToY), // %3
752 "m"(kAddY16) // %4
753 : "memory", "cc"
754#if defined(__SSE2__)
755 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
756#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000757 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000758}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000759
760void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000761 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000762 "movdqa %4,%%xmm5 \n"
763 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000764 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000765 "1: \n"
766 "movdqu (%0),%%xmm0 \n"
767 "movdqu 0x10(%0),%%xmm1 \n"
768 "movdqu 0x20(%0),%%xmm2 \n"
769 "movdqu 0x30(%0),%%xmm3 \n"
770 "pmaddubsw %%xmm4,%%xmm0 \n"
771 "pmaddubsw %%xmm4,%%xmm1 \n"
772 "pmaddubsw %%xmm4,%%xmm2 \n"
773 "pmaddubsw %%xmm4,%%xmm3 \n"
774 "lea 0x40(%0),%0 \n"
775 "phaddw %%xmm1,%%xmm0 \n"
776 "phaddw %%xmm3,%%xmm2 \n"
777 "psrlw $0x7,%%xmm0 \n"
778 "psrlw $0x7,%%xmm2 \n"
779 "packuswb %%xmm2,%%xmm0 \n"
780 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000781 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000782 "movdqu %%xmm0,(%1) \n"
783 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000784 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000785 : "+r"(src_argb), // %0
786 "+r"(dst_y), // %1
787 "+r"(pix) // %2
788 : "m"(kARGBToY), // %3
789 "m"(kAddY16) // %4
790 : "memory", "cc"
791#if defined(__SSE2__)
792 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
793#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000794 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000795}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000796
fbarchard@google.com714050a2012-02-17 22:59:56 +0000797// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000798// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
799// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
800// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000801// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000802void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
803 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000804 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000805 "movdqa %0,%%xmm4 \n"
806 "movdqa %1,%%xmm3 \n"
807 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000808 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000809 : "m"(kARGBToU), // %0
810 "m"(kARGBToV), // %1
811 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000812 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000813 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000814 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000815 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000816 "1: \n"
817 "movdqa (%0),%%xmm0 \n"
818 "movdqa 0x10(%0),%%xmm1 \n"
819 "movdqa 0x20(%0),%%xmm2 \n"
820 "movdqa 0x30(%0),%%xmm6 \n"
821 "pavgb (%0,%4,1),%%xmm0 \n"
822 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
823 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
824 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
825 "lea 0x40(%0),%0 \n"
826 "movdqa %%xmm0,%%xmm7 \n"
827 "shufps $0x88,%%xmm1,%%xmm0 \n"
828 "shufps $0xdd,%%xmm1,%%xmm7 \n"
829 "pavgb %%xmm7,%%xmm0 \n"
830 "movdqa %%xmm2,%%xmm7 \n"
831 "shufps $0x88,%%xmm6,%%xmm2 \n"
832 "shufps $0xdd,%%xmm6,%%xmm7 \n"
833 "pavgb %%xmm7,%%xmm2 \n"
834 "movdqa %%xmm0,%%xmm1 \n"
835 "movdqa %%xmm2,%%xmm6 \n"
836 "pmaddubsw %%xmm4,%%xmm0 \n"
837 "pmaddubsw %%xmm4,%%xmm2 \n"
838 "pmaddubsw %%xmm3,%%xmm1 \n"
839 "pmaddubsw %%xmm3,%%xmm6 \n"
840 "phaddw %%xmm2,%%xmm0 \n"
841 "phaddw %%xmm6,%%xmm1 \n"
842 "psraw $0x8,%%xmm0 \n"
843 "psraw $0x8,%%xmm1 \n"
844 "packsswb %%xmm1,%%xmm0 \n"
845 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000846 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000847 "movlps %%xmm0,(%1) \n"
848 "movhps %%xmm0,(%1,%2,1) \n"
849 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000850 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000851 : "+r"(src_argb0), // %0
852 "+r"(dst_u), // %1
853 "+r"(dst_v), // %2
854 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000855 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000856 : "memory", "cc"
857#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000858 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000859#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000860 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000861}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000862
863void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
864 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000865 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000866 "movdqa %0,%%xmm4 \n"
867 "movdqa %1,%%xmm3 \n"
868 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000869 :
870 : "m"(kARGBToU), // %0
871 "m"(kARGBToV), // %1
872 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000873 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000874 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000875 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000876 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000877 "1: \n"
878 "movdqu (%0),%%xmm0 \n"
879 "movdqu 0x10(%0),%%xmm1 \n"
880 "movdqu 0x20(%0),%%xmm2 \n"
881 "movdqu 0x30(%0),%%xmm6 \n"
882 "movdqu (%0,%4,1),%%xmm7 \n"
883 "pavgb %%xmm7,%%xmm0 \n"
884 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
885 "pavgb %%xmm7,%%xmm1 \n"
886 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
887 "pavgb %%xmm7,%%xmm2 \n"
888 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
889 "pavgb %%xmm7,%%xmm6 \n"
890 "lea 0x40(%0),%0 \n"
891 "movdqa %%xmm0,%%xmm7 \n"
892 "shufps $0x88,%%xmm1,%%xmm0 \n"
893 "shufps $0xdd,%%xmm1,%%xmm7 \n"
894 "pavgb %%xmm7,%%xmm0 \n"
895 "movdqa %%xmm2,%%xmm7 \n"
896 "shufps $0x88,%%xmm6,%%xmm2 \n"
897 "shufps $0xdd,%%xmm6,%%xmm7 \n"
898 "pavgb %%xmm7,%%xmm2 \n"
899 "movdqa %%xmm0,%%xmm1 \n"
900 "movdqa %%xmm2,%%xmm6 \n"
901 "pmaddubsw %%xmm4,%%xmm0 \n"
902 "pmaddubsw %%xmm4,%%xmm2 \n"
903 "pmaddubsw %%xmm3,%%xmm1 \n"
904 "pmaddubsw %%xmm3,%%xmm6 \n"
905 "phaddw %%xmm2,%%xmm0 \n"
906 "phaddw %%xmm6,%%xmm1 \n"
907 "psraw $0x8,%%xmm0 \n"
908 "psraw $0x8,%%xmm1 \n"
909 "packsswb %%xmm1,%%xmm0 \n"
910 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000911 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000912 "movlps %%xmm0,(%1) \n"
913 "movhps %%xmm0,(%1,%2,1) \n"
914 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000915 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000916 : "+r"(src_argb0), // %0
917 "+r"(dst_u), // %1
918 "+r"(dst_v), // %2
919 "+rm"(width) // %3
920 : "r"(static_cast<intptr_t>(src_stride_argb))
921 : "memory", "cc"
922#if defined(__SSE2__)
923 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
924#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000925 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000926}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000927
fbarchard@google.combdf7cb52012-11-05 23:40:11 +0000928void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
929 uint8* dst_u, uint8* dst_v, int width) {
930 asm volatile (
931 "movdqa %0,%%xmm4 \n"
932 "movdqa %1,%%xmm3 \n"
933 "movdqa %2,%%xmm5 \n"
934 :
935 : "m"(kARGBToU), // %0
936 "m"(kARGBToV), // %1
937 "m"(kAddUV128) // %2
938 );
939 asm volatile (
940 "sub %1,%2 \n"
941 ".p2align 4 \n"
942 "1: \n"
943 "movdqa (%0),%%xmm0 \n"
944 "movdqa 0x10(%0),%%xmm1 \n"
945 "movdqa 0x20(%0),%%xmm2 \n"
946 "movdqa 0x30(%0),%%xmm6 \n"
947 "lea 0x40(%0),%0 \n"
948 "movdqa %%xmm0,%%xmm7 \n"
949 "shufps $0x88,%%xmm1,%%xmm0 \n"
950 "shufps $0xdd,%%xmm1,%%xmm7 \n"
951 "pavgb %%xmm7,%%xmm0 \n"
952 "movdqa %%xmm2,%%xmm7 \n"
953 "shufps $0x88,%%xmm6,%%xmm2 \n"
954 "shufps $0xdd,%%xmm6,%%xmm7 \n"
955 "pavgb %%xmm7,%%xmm2 \n"
956 "movdqa %%xmm0,%%xmm1 \n"
957 "movdqa %%xmm2,%%xmm6 \n"
958 "pmaddubsw %%xmm4,%%xmm0 \n"
959 "pmaddubsw %%xmm4,%%xmm2 \n"
960 "pmaddubsw %%xmm3,%%xmm1 \n"
961 "pmaddubsw %%xmm3,%%xmm6 \n"
962 "phaddw %%xmm2,%%xmm0 \n"
963 "phaddw %%xmm6,%%xmm1 \n"
964 "psraw $0x8,%%xmm0 \n"
965 "psraw $0x8,%%xmm1 \n"
966 "packsswb %%xmm1,%%xmm0 \n"
967 "paddb %%xmm5,%%xmm0 \n"
968 "sub $0x10,%3 \n"
969 "movlps %%xmm0,(%1) \n"
970 "movhps %%xmm0,(%1,%2,1) \n"
971 "lea 0x8(%1),%1 \n"
972 "jg 1b \n"
973 : "+r"(src_argb0), // %0
974 "+r"(dst_u), // %1
975 "+r"(dst_v), // %2
976 "+rm"(width) // %3
977 :
978 : "memory", "cc"
979#if defined(__SSE2__)
980 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
981#endif
982 );
983}
984
985void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
986 uint8* dst_u, uint8* dst_v, int width) {
987 asm volatile (
988 "movdqa %0,%%xmm4 \n"
989 "movdqa %1,%%xmm3 \n"
990 "movdqa %2,%%xmm5 \n"
991 :
992 : "m"(kARGBToU), // %0
993 "m"(kARGBToV), // %1
994 "m"(kAddUV128) // %2
995 );
996 asm volatile (
997 "sub %1,%2 \n"
998 ".p2align 4 \n"
999 "1: \n"
1000 "movdqu (%0),%%xmm0 \n"
1001 "movdqu 0x10(%0),%%xmm1 \n"
1002 "movdqu 0x20(%0),%%xmm2 \n"
1003 "movdqu 0x30(%0),%%xmm6 \n"
1004 "lea 0x40(%0),%0 \n"
1005 "movdqa %%xmm0,%%xmm7 \n"
1006 "shufps $0x88,%%xmm1,%%xmm0 \n"
1007 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1008 "pavgb %%xmm7,%%xmm0 \n"
1009 "movdqa %%xmm2,%%xmm7 \n"
1010 "shufps $0x88,%%xmm6,%%xmm2 \n"
1011 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1012 "pavgb %%xmm7,%%xmm2 \n"
1013 "movdqa %%xmm0,%%xmm1 \n"
1014 "movdqa %%xmm2,%%xmm6 \n"
1015 "pmaddubsw %%xmm4,%%xmm0 \n"
1016 "pmaddubsw %%xmm4,%%xmm2 \n"
1017 "pmaddubsw %%xmm3,%%xmm1 \n"
1018 "pmaddubsw %%xmm3,%%xmm6 \n"
1019 "phaddw %%xmm2,%%xmm0 \n"
1020 "phaddw %%xmm6,%%xmm1 \n"
1021 "psraw $0x8,%%xmm0 \n"
1022 "psraw $0x8,%%xmm1 \n"
1023 "packsswb %%xmm1,%%xmm0 \n"
1024 "paddb %%xmm5,%%xmm0 \n"
1025 "sub $0x10,%3 \n"
1026 "movlps %%xmm0,(%1) \n"
1027 "movhps %%xmm0,(%1,%2,1) \n"
1028 "lea 0x8(%1),%1 \n"
1029 "jg 1b \n"
1030 : "+r"(src_argb0), // %0
1031 "+r"(dst_u), // %1
1032 "+r"(dst_v), // %2
1033 "+rm"(width) // %3
1034 :
1035 : "memory", "cc"
1036#if defined(__SSE2__)
1037 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1038#endif
1039 );
1040}
1041
fbarchard@google.com714050a2012-02-17 22:59:56 +00001042void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001043 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001044 "movdqa %4,%%xmm5 \n"
1045 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001046 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001047 "1: \n"
1048 "movdqa (%0),%%xmm0 \n"
1049 "movdqa 0x10(%0),%%xmm1 \n"
1050 "movdqa 0x20(%0),%%xmm2 \n"
1051 "movdqa 0x30(%0),%%xmm3 \n"
1052 "pmaddubsw %%xmm4,%%xmm0 \n"
1053 "pmaddubsw %%xmm4,%%xmm1 \n"
1054 "pmaddubsw %%xmm4,%%xmm2 \n"
1055 "pmaddubsw %%xmm4,%%xmm3 \n"
1056 "lea 0x40(%0),%0 \n"
1057 "phaddw %%xmm1,%%xmm0 \n"
1058 "phaddw %%xmm3,%%xmm2 \n"
1059 "psrlw $0x7,%%xmm0 \n"
1060 "psrlw $0x7,%%xmm2 \n"
1061 "packuswb %%xmm2,%%xmm0 \n"
1062 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001063 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001064 "movdqa %%xmm0,(%1) \n"
1065 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001066 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001067 : "+r"(src_bgra), // %0
1068 "+r"(dst_y), // %1
1069 "+r"(pix) // %2
1070 : "m"(kBGRAToY), // %3
1071 "m"(kAddY16) // %4
1072 : "memory", "cc"
1073#if defined(__SSE2__)
1074 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001075#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +00001076 );
1077}
1078
1079void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001080 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001081 "movdqa %4,%%xmm5 \n"
1082 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001083 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001084 "1: \n"
1085 "movdqu (%0),%%xmm0 \n"
1086 "movdqu 0x10(%0),%%xmm1 \n"
1087 "movdqu 0x20(%0),%%xmm2 \n"
1088 "movdqu 0x30(%0),%%xmm3 \n"
1089 "pmaddubsw %%xmm4,%%xmm0 \n"
1090 "pmaddubsw %%xmm4,%%xmm1 \n"
1091 "pmaddubsw %%xmm4,%%xmm2 \n"
1092 "pmaddubsw %%xmm4,%%xmm3 \n"
1093 "lea 0x40(%0),%0 \n"
1094 "phaddw %%xmm1,%%xmm0 \n"
1095 "phaddw %%xmm3,%%xmm2 \n"
1096 "psrlw $0x7,%%xmm0 \n"
1097 "psrlw $0x7,%%xmm2 \n"
1098 "packuswb %%xmm2,%%xmm0 \n"
1099 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001100 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001101 "movdqu %%xmm0,(%1) \n"
1102 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001103 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001104 : "+r"(src_bgra), // %0
1105 "+r"(dst_y), // %1
1106 "+r"(pix) // %2
1107 : "m"(kBGRAToY), // %3
1108 "m"(kAddY16) // %4
1109 : "memory", "cc"
1110#if defined(__SSE2__)
1111 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1112#endif
1113 );
1114}
1115
1116void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1117 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001118 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001119 "movdqa %0,%%xmm4 \n"
1120 "movdqa %1,%%xmm3 \n"
1121 "movdqa %2,%%xmm5 \n"
1122 :
1123 : "m"(kBGRAToU), // %0
1124 "m"(kBGRAToV), // %1
1125 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001126 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001127 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001128 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001129 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001130 "1: \n"
1131 "movdqa (%0),%%xmm0 \n"
1132 "movdqa 0x10(%0),%%xmm1 \n"
1133 "movdqa 0x20(%0),%%xmm2 \n"
1134 "movdqa 0x30(%0),%%xmm6 \n"
1135 "pavgb (%0,%4,1),%%xmm0 \n"
1136 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1137 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1138 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1139 "lea 0x40(%0),%0 \n"
1140 "movdqa %%xmm0,%%xmm7 \n"
1141 "shufps $0x88,%%xmm1,%%xmm0 \n"
1142 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1143 "pavgb %%xmm7,%%xmm0 \n"
1144 "movdqa %%xmm2,%%xmm7 \n"
1145 "shufps $0x88,%%xmm6,%%xmm2 \n"
1146 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1147 "pavgb %%xmm7,%%xmm2 \n"
1148 "movdqa %%xmm0,%%xmm1 \n"
1149 "movdqa %%xmm2,%%xmm6 \n"
1150 "pmaddubsw %%xmm4,%%xmm0 \n"
1151 "pmaddubsw %%xmm4,%%xmm2 \n"
1152 "pmaddubsw %%xmm3,%%xmm1 \n"
1153 "pmaddubsw %%xmm3,%%xmm6 \n"
1154 "phaddw %%xmm2,%%xmm0 \n"
1155 "phaddw %%xmm6,%%xmm1 \n"
1156 "psraw $0x8,%%xmm0 \n"
1157 "psraw $0x8,%%xmm1 \n"
1158 "packsswb %%xmm1,%%xmm0 \n"
1159 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001160 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001161 "movlps %%xmm0,(%1) \n"
1162 "movhps %%xmm0,(%1,%2,1) \n"
1163 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001164 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001165 : "+r"(src_bgra0), // %0
1166 "+r"(dst_u), // %1
1167 "+r"(dst_v), // %2
1168 "+rm"(width) // %3
1169 : "r"(static_cast<intptr_t>(src_stride_bgra))
1170 : "memory", "cc"
1171#if defined(__SSE2__)
1172 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1173#endif
1174 );
1175}
1176
1177void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1178 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001179 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001180 "movdqa %0,%%xmm4 \n"
1181 "movdqa %1,%%xmm3 \n"
1182 "movdqa %2,%%xmm5 \n"
1183 :
1184 : "m"(kBGRAToU), // %0
1185 "m"(kBGRAToV), // %1
1186 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001187 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001188 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001189 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001190 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001191 "1: \n"
1192 "movdqu (%0),%%xmm0 \n"
1193 "movdqu 0x10(%0),%%xmm1 \n"
1194 "movdqu 0x20(%0),%%xmm2 \n"
1195 "movdqu 0x30(%0),%%xmm6 \n"
1196 "movdqu (%0,%4,1),%%xmm7 \n"
1197 "pavgb %%xmm7,%%xmm0 \n"
1198 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1199 "pavgb %%xmm7,%%xmm1 \n"
1200 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1201 "pavgb %%xmm7,%%xmm2 \n"
1202 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1203 "pavgb %%xmm7,%%xmm6 \n"
1204 "lea 0x40(%0),%0 \n"
1205 "movdqa %%xmm0,%%xmm7 \n"
1206 "shufps $0x88,%%xmm1,%%xmm0 \n"
1207 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1208 "pavgb %%xmm7,%%xmm0 \n"
1209 "movdqa %%xmm2,%%xmm7 \n"
1210 "shufps $0x88,%%xmm6,%%xmm2 \n"
1211 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1212 "pavgb %%xmm7,%%xmm2 \n"
1213 "movdqa %%xmm0,%%xmm1 \n"
1214 "movdqa %%xmm2,%%xmm6 \n"
1215 "pmaddubsw %%xmm4,%%xmm0 \n"
1216 "pmaddubsw %%xmm4,%%xmm2 \n"
1217 "pmaddubsw %%xmm3,%%xmm1 \n"
1218 "pmaddubsw %%xmm3,%%xmm6 \n"
1219 "phaddw %%xmm2,%%xmm0 \n"
1220 "phaddw %%xmm6,%%xmm1 \n"
1221 "psraw $0x8,%%xmm0 \n"
1222 "psraw $0x8,%%xmm1 \n"
1223 "packsswb %%xmm1,%%xmm0 \n"
1224 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001225 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001226 "movlps %%xmm0,(%1) \n"
1227 "movhps %%xmm0,(%1,%2,1) \n"
1228 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001229 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001230 : "+r"(src_bgra0), // %0
1231 "+r"(dst_u), // %1
1232 "+r"(dst_v), // %2
1233 "+rm"(width) // %3
1234 : "r"(static_cast<intptr_t>(src_stride_bgra))
1235 : "memory", "cc"
1236#if defined(__SSE2__)
1237 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1238#endif
1239 );
1240}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001241
1242void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001243 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001244 "movdqa %4,%%xmm5 \n"
1245 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001246 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001247 "1: \n"
1248 "movdqa (%0),%%xmm0 \n"
1249 "movdqa 0x10(%0),%%xmm1 \n"
1250 "movdqa 0x20(%0),%%xmm2 \n"
1251 "movdqa 0x30(%0),%%xmm3 \n"
1252 "pmaddubsw %%xmm4,%%xmm0 \n"
1253 "pmaddubsw %%xmm4,%%xmm1 \n"
1254 "pmaddubsw %%xmm4,%%xmm2 \n"
1255 "pmaddubsw %%xmm4,%%xmm3 \n"
1256 "lea 0x40(%0),%0 \n"
1257 "phaddw %%xmm1,%%xmm0 \n"
1258 "phaddw %%xmm3,%%xmm2 \n"
1259 "psrlw $0x7,%%xmm0 \n"
1260 "psrlw $0x7,%%xmm2 \n"
1261 "packuswb %%xmm2,%%xmm0 \n"
1262 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001263 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001264 "movdqa %%xmm0,(%1) \n"
1265 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001266 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001267 : "+r"(src_abgr), // %0
1268 "+r"(dst_y), // %1
1269 "+r"(pix) // %2
1270 : "m"(kABGRToY), // %3
1271 "m"(kAddY16) // %4
1272 : "memory", "cc"
1273#if defined(__SSE2__)
1274 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1275#endif
1276 );
1277}
1278
1279void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001280 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001281 "movdqa %4,%%xmm5 \n"
1282 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001283 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001284 "1: \n"
1285 "movdqu (%0),%%xmm0 \n"
1286 "movdqu 0x10(%0),%%xmm1 \n"
1287 "movdqu 0x20(%0),%%xmm2 \n"
1288 "movdqu 0x30(%0),%%xmm3 \n"
1289 "pmaddubsw %%xmm4,%%xmm0 \n"
1290 "pmaddubsw %%xmm4,%%xmm1 \n"
1291 "pmaddubsw %%xmm4,%%xmm2 \n"
1292 "pmaddubsw %%xmm4,%%xmm3 \n"
1293 "lea 0x40(%0),%0 \n"
1294 "phaddw %%xmm1,%%xmm0 \n"
1295 "phaddw %%xmm3,%%xmm2 \n"
1296 "psrlw $0x7,%%xmm0 \n"
1297 "psrlw $0x7,%%xmm2 \n"
1298 "packuswb %%xmm2,%%xmm0 \n"
1299 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001300 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001301 "movdqu %%xmm0,(%1) \n"
1302 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001303 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001304 : "+r"(src_abgr), // %0
1305 "+r"(dst_y), // %1
1306 "+r"(pix) // %2
1307 : "m"(kABGRToY), // %3
1308 "m"(kAddY16) // %4
1309 : "memory", "cc"
1310#if defined(__SSE2__)
1311 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1312#endif
1313 );
1314}
1315
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001316void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1317 asm volatile (
1318 "movdqa %4,%%xmm5 \n"
1319 "movdqa %3,%%xmm4 \n"
1320 ".p2align 4 \n"
1321 "1: \n"
1322 "movdqa (%0),%%xmm0 \n"
1323 "movdqa 0x10(%0),%%xmm1 \n"
1324 "movdqa 0x20(%0),%%xmm2 \n"
1325 "movdqa 0x30(%0),%%xmm3 \n"
1326 "pmaddubsw %%xmm4,%%xmm0 \n"
1327 "pmaddubsw %%xmm4,%%xmm1 \n"
1328 "pmaddubsw %%xmm4,%%xmm2 \n"
1329 "pmaddubsw %%xmm4,%%xmm3 \n"
1330 "lea 0x40(%0),%0 \n"
1331 "phaddw %%xmm1,%%xmm0 \n"
1332 "phaddw %%xmm3,%%xmm2 \n"
1333 "psrlw $0x7,%%xmm0 \n"
1334 "psrlw $0x7,%%xmm2 \n"
1335 "packuswb %%xmm2,%%xmm0 \n"
1336 "paddb %%xmm5,%%xmm0 \n"
1337 "sub $0x10,%2 \n"
1338 "movdqa %%xmm0,(%1) \n"
1339 "lea 0x10(%1),%1 \n"
1340 "jg 1b \n"
1341 : "+r"(src_rgba), // %0
1342 "+r"(dst_y), // %1
1343 "+r"(pix) // %2
1344 : "m"(kRGBAToY), // %3
1345 "m"(kAddY16) // %4
1346 : "memory", "cc"
1347#if defined(__SSE2__)
1348 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1349#endif
1350 );
1351}
1352
1353void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1354 asm volatile (
1355 "movdqa %4,%%xmm5 \n"
1356 "movdqa %3,%%xmm4 \n"
1357 ".p2align 4 \n"
1358 "1: \n"
1359 "movdqu (%0),%%xmm0 \n"
1360 "movdqu 0x10(%0),%%xmm1 \n"
1361 "movdqu 0x20(%0),%%xmm2 \n"
1362 "movdqu 0x30(%0),%%xmm3 \n"
1363 "pmaddubsw %%xmm4,%%xmm0 \n"
1364 "pmaddubsw %%xmm4,%%xmm1 \n"
1365 "pmaddubsw %%xmm4,%%xmm2 \n"
1366 "pmaddubsw %%xmm4,%%xmm3 \n"
1367 "lea 0x40(%0),%0 \n"
1368 "phaddw %%xmm1,%%xmm0 \n"
1369 "phaddw %%xmm3,%%xmm2 \n"
1370 "psrlw $0x7,%%xmm0 \n"
1371 "psrlw $0x7,%%xmm2 \n"
1372 "packuswb %%xmm2,%%xmm0 \n"
1373 "paddb %%xmm5,%%xmm0 \n"
1374 "sub $0x10,%2 \n"
1375 "movdqu %%xmm0,(%1) \n"
1376 "lea 0x10(%1),%1 \n"
1377 "jg 1b \n"
1378 : "+r"(src_rgba), // %0
1379 "+r"(dst_y), // %1
1380 "+r"(pix) // %2
1381 : "m"(kRGBAToY), // %3
1382 "m"(kAddY16) // %4
1383 : "memory", "cc"
1384#if defined(__SSE2__)
1385 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1386#endif
1387 );
1388}
1389
fbarchard@google.com714050a2012-02-17 22:59:56 +00001390void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1391 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001392 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001393 "movdqa %0,%%xmm4 \n"
1394 "movdqa %1,%%xmm3 \n"
1395 "movdqa %2,%%xmm5 \n"
1396 :
1397 : "m"(kABGRToU), // %0
1398 "m"(kABGRToV), // %1
1399 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001400 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001401 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001402 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001403 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001404 "1: \n"
1405 "movdqa (%0),%%xmm0 \n"
1406 "movdqa 0x10(%0),%%xmm1 \n"
1407 "movdqa 0x20(%0),%%xmm2 \n"
1408 "movdqa 0x30(%0),%%xmm6 \n"
1409 "pavgb (%0,%4,1),%%xmm0 \n"
1410 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1411 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1412 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1413 "lea 0x40(%0),%0 \n"
1414 "movdqa %%xmm0,%%xmm7 \n"
1415 "shufps $0x88,%%xmm1,%%xmm0 \n"
1416 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1417 "pavgb %%xmm7,%%xmm0 \n"
1418 "movdqa %%xmm2,%%xmm7 \n"
1419 "shufps $0x88,%%xmm6,%%xmm2 \n"
1420 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1421 "pavgb %%xmm7,%%xmm2 \n"
1422 "movdqa %%xmm0,%%xmm1 \n"
1423 "movdqa %%xmm2,%%xmm6 \n"
1424 "pmaddubsw %%xmm4,%%xmm0 \n"
1425 "pmaddubsw %%xmm4,%%xmm2 \n"
1426 "pmaddubsw %%xmm3,%%xmm1 \n"
1427 "pmaddubsw %%xmm3,%%xmm6 \n"
1428 "phaddw %%xmm2,%%xmm0 \n"
1429 "phaddw %%xmm6,%%xmm1 \n"
1430 "psraw $0x8,%%xmm0 \n"
1431 "psraw $0x8,%%xmm1 \n"
1432 "packsswb %%xmm1,%%xmm0 \n"
1433 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001434 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001435 "movlps %%xmm0,(%1) \n"
1436 "movhps %%xmm0,(%1,%2,1) \n"
1437 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001438 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001439 : "+r"(src_abgr0), // %0
1440 "+r"(dst_u), // %1
1441 "+r"(dst_v), // %2
1442 "+rm"(width) // %3
1443 : "r"(static_cast<intptr_t>(src_stride_abgr))
1444 : "memory", "cc"
1445#if defined(__SSE2__)
1446 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1447#endif
1448 );
1449}
1450
1451void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1452 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001453 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001454 "movdqa %0,%%xmm4 \n"
1455 "movdqa %1,%%xmm3 \n"
1456 "movdqa %2,%%xmm5 \n"
1457 :
1458 : "m"(kABGRToU), // %0
1459 "m"(kABGRToV), // %1
1460 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001461 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001462 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001463 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001464 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001465 "1: \n"
1466 "movdqu (%0),%%xmm0 \n"
1467 "movdqu 0x10(%0),%%xmm1 \n"
1468 "movdqu 0x20(%0),%%xmm2 \n"
1469 "movdqu 0x30(%0),%%xmm6 \n"
1470 "movdqu (%0,%4,1),%%xmm7 \n"
1471 "pavgb %%xmm7,%%xmm0 \n"
1472 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1473 "pavgb %%xmm7,%%xmm1 \n"
1474 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1475 "pavgb %%xmm7,%%xmm2 \n"
1476 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1477 "pavgb %%xmm7,%%xmm6 \n"
1478 "lea 0x40(%0),%0 \n"
1479 "movdqa %%xmm0,%%xmm7 \n"
1480 "shufps $0x88,%%xmm1,%%xmm0 \n"
1481 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1482 "pavgb %%xmm7,%%xmm0 \n"
1483 "movdqa %%xmm2,%%xmm7 \n"
1484 "shufps $0x88,%%xmm6,%%xmm2 \n"
1485 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1486 "pavgb %%xmm7,%%xmm2 \n"
1487 "movdqa %%xmm0,%%xmm1 \n"
1488 "movdqa %%xmm2,%%xmm6 \n"
1489 "pmaddubsw %%xmm4,%%xmm0 \n"
1490 "pmaddubsw %%xmm4,%%xmm2 \n"
1491 "pmaddubsw %%xmm3,%%xmm1 \n"
1492 "pmaddubsw %%xmm3,%%xmm6 \n"
1493 "phaddw %%xmm2,%%xmm0 \n"
1494 "phaddw %%xmm6,%%xmm1 \n"
1495 "psraw $0x8,%%xmm0 \n"
1496 "psraw $0x8,%%xmm1 \n"
1497 "packsswb %%xmm1,%%xmm0 \n"
1498 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001499 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001500 "movlps %%xmm0,(%1) \n"
1501 "movhps %%xmm0,(%1,%2,1) \n"
1502 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001503 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001504 : "+r"(src_abgr0), // %0
1505 "+r"(dst_u), // %1
1506 "+r"(dst_v), // %2
1507 "+rm"(width) // %3
1508 : "r"(static_cast<intptr_t>(src_stride_abgr))
1509 : "memory", "cc"
1510#if defined(__SSE2__)
1511 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1512#endif
1513 );
1514}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001515
1516void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1517 uint8* dst_u, uint8* dst_v, int width) {
1518 asm volatile (
1519 "movdqa %0,%%xmm4 \n"
1520 "movdqa %1,%%xmm3 \n"
1521 "movdqa %2,%%xmm5 \n"
1522 :
1523 : "m"(kRGBAToU), // %0
1524 "m"(kRGBAToV), // %1
1525 "m"(kAddUV128) // %2
1526 );
1527 asm volatile (
1528 "sub %1,%2 \n"
1529 ".p2align 4 \n"
1530 "1: \n"
1531 "movdqa (%0),%%xmm0 \n"
1532 "movdqa 0x10(%0),%%xmm1 \n"
1533 "movdqa 0x20(%0),%%xmm2 \n"
1534 "movdqa 0x30(%0),%%xmm6 \n"
1535 "pavgb (%0,%4,1),%%xmm0 \n"
1536 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1537 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1538 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1539 "lea 0x40(%0),%0 \n"
1540 "movdqa %%xmm0,%%xmm7 \n"
1541 "shufps $0x88,%%xmm1,%%xmm0 \n"
1542 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1543 "pavgb %%xmm7,%%xmm0 \n"
1544 "movdqa %%xmm2,%%xmm7 \n"
1545 "shufps $0x88,%%xmm6,%%xmm2 \n"
1546 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1547 "pavgb %%xmm7,%%xmm2 \n"
1548 "movdqa %%xmm0,%%xmm1 \n"
1549 "movdqa %%xmm2,%%xmm6 \n"
1550 "pmaddubsw %%xmm4,%%xmm0 \n"
1551 "pmaddubsw %%xmm4,%%xmm2 \n"
1552 "pmaddubsw %%xmm3,%%xmm1 \n"
1553 "pmaddubsw %%xmm3,%%xmm6 \n"
1554 "phaddw %%xmm2,%%xmm0 \n"
1555 "phaddw %%xmm6,%%xmm1 \n"
1556 "psraw $0x8,%%xmm0 \n"
1557 "psraw $0x8,%%xmm1 \n"
1558 "packsswb %%xmm1,%%xmm0 \n"
1559 "paddb %%xmm5,%%xmm0 \n"
1560 "sub $0x10,%3 \n"
1561 "movlps %%xmm0,(%1) \n"
1562 "movhps %%xmm0,(%1,%2,1) \n"
1563 "lea 0x8(%1),%1 \n"
1564 "jg 1b \n"
1565 : "+r"(src_rgba0), // %0
1566 "+r"(dst_u), // %1
1567 "+r"(dst_v), // %2
1568 "+rm"(width) // %3
1569 : "r"(static_cast<intptr_t>(src_stride_rgba))
1570 : "memory", "cc"
1571#if defined(__SSE2__)
1572 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1573#endif
1574 );
1575}
1576
1577void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1578 uint8* dst_u, uint8* dst_v, int width) {
1579 asm volatile (
1580 "movdqa %0,%%xmm4 \n"
1581 "movdqa %1,%%xmm3 \n"
1582 "movdqa %2,%%xmm5 \n"
1583 :
1584 : "m"(kRGBAToU), // %0
1585 "m"(kRGBAToV), // %1
1586 "m"(kAddUV128) // %2
1587 );
1588 asm volatile (
1589 "sub %1,%2 \n"
1590 ".p2align 4 \n"
1591 "1: \n"
1592 "movdqu (%0),%%xmm0 \n"
1593 "movdqu 0x10(%0),%%xmm1 \n"
1594 "movdqu 0x20(%0),%%xmm2 \n"
1595 "movdqu 0x30(%0),%%xmm6 \n"
1596 "movdqu (%0,%4,1),%%xmm7 \n"
1597 "pavgb %%xmm7,%%xmm0 \n"
1598 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1599 "pavgb %%xmm7,%%xmm1 \n"
1600 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1601 "pavgb %%xmm7,%%xmm2 \n"
1602 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1603 "pavgb %%xmm7,%%xmm6 \n"
1604 "lea 0x40(%0),%0 \n"
1605 "movdqa %%xmm0,%%xmm7 \n"
1606 "shufps $0x88,%%xmm1,%%xmm0 \n"
1607 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1608 "pavgb %%xmm7,%%xmm0 \n"
1609 "movdqa %%xmm2,%%xmm7 \n"
1610 "shufps $0x88,%%xmm6,%%xmm2 \n"
1611 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1612 "pavgb %%xmm7,%%xmm2 \n"
1613 "movdqa %%xmm0,%%xmm1 \n"
1614 "movdqa %%xmm2,%%xmm6 \n"
1615 "pmaddubsw %%xmm4,%%xmm0 \n"
1616 "pmaddubsw %%xmm4,%%xmm2 \n"
1617 "pmaddubsw %%xmm3,%%xmm1 \n"
1618 "pmaddubsw %%xmm3,%%xmm6 \n"
1619 "phaddw %%xmm2,%%xmm0 \n"
1620 "phaddw %%xmm6,%%xmm1 \n"
1621 "psraw $0x8,%%xmm0 \n"
1622 "psraw $0x8,%%xmm1 \n"
1623 "packsswb %%xmm1,%%xmm0 \n"
1624 "paddb %%xmm5,%%xmm0 \n"
1625 "sub $0x10,%3 \n"
1626 "movlps %%xmm0,(%1) \n"
1627 "movhps %%xmm0,(%1,%2,1) \n"
1628 "lea 0x8(%1),%1 \n"
1629 "jg 1b \n"
1630 : "+r"(src_rgba0), // %0
1631 "+r"(dst_u), // %1
1632 "+r"(dst_v), // %2
1633 "+rm"(width) // %3
1634 : "r"(static_cast<intptr_t>(src_stride_rgba))
1635 : "memory", "cc"
1636#if defined(__SSE2__)
1637 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1638#endif
1639 );
1640}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001641#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001642
fbarchard@google.come214fe32012-06-04 23:47:11 +00001643#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients in fixed point, scaled by 64 so they fit in int8.
// Negative values and compound expressions are parenthesized so the macros
// expand correctly inside larger expressions (e.g. -BG or 2 * BR).
#define UB 127 /* min(127, static_cast<int8>(2.018 * 64)): clamped to int8 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the value of each U/V weighted sum when U = V = 128.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001658
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001659struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001660 vec8 kUVToB; // 0
1661 vec8 kUVToG; // 16
1662 vec8 kUVToR; // 32
1663 vec16 kUVBiasB; // 48
1664 vec16 kUVBiasG; // 64
1665 vec16 kUVBiasR; // 80
1666 vec16 kYSub16; // 96
1667 vec16 kYToRgb; // 112
1668 vec8 kVUToB; // 128
1669 vec8 kVUToG; // 144
1670 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001671} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001672 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1673 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1674 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1675 { BB, BB, BB, BB, BB, BB, BB, BB },
1676 { BG, BG, BG, BG, BG, BG, BG, BG },
1677 { BR, BR, BR, BR, BR, BR, BR, BR },
1678 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001679 { YG, YG, YG, YG, YG, YG, YG, YG },
1680 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1681 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1682 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001683};
1684
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001685
// Read 8 UV from 444.
// Loads 8 bytes at u_buf and 8 bytes at u_buf + v_buf, advances u_buf by 8,
// and interleaves the two loads byte-wise into xmm0.  Overwrites xmm1.
// v_buf is used as an index added to u_buf; presumably callers pass it as the
// offset of the V plane from the U plane - confirm at the call sites.
#define READYUV444                                \
    "movq (%[u_buf]),%%xmm0 \n"                   \
    "movq (%[u_buf],%[v_buf],1),%%xmm1 \n"        \
    "lea 0x8(%[u_buf]),%[u_buf] \n"               \
    "punpcklbw %%xmm1,%%xmm0 \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001692
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 bytes at u_buf and 4 bytes at u_buf + v_buf, advances u_buf by 4,
// interleaves them byte-wise, then duplicates each 16-bit UV pair so xmm0
// holds 8 pairs.  Overwrites xmm1.
// v_buf is used as an index added to u_buf; presumably callers pass it as the
// offset of the V plane from the U plane - confirm at the call sites.
#define READYUV422                                \
    "movd (%[u_buf]),%%xmm0 \n"                   \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n"        \
    "lea 0x4(%[u_buf]),%[u_buf] \n"               \
    "punpcklbw %%xmm1,%%xmm0 \n"                  \
    "punpcklwd %%xmm0,%%xmm0 \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001700
// Read 2 UV from 411, upsample to 8 UV.
// Loads 2 bytes at u_buf and 2 bytes at u_buf + v_buf, advances u_buf by 2,
// interleaves them byte-wise, then duplicates each 16-bit UV pair four times
// so xmm0 holds 8 pairs.  Overwrites xmm1.
//
// The loads use pinsrw, which reads exactly 16 bits.  A 32-bit movd would read
// 2 bytes more than each step consumes and so run 2 bytes past the end of the
// U and V rows on the final step.  pinsrw leaves the upper bytes of xmm0/xmm1
// stale, which is harmless: the three unpack steps only propagate the low two
// bytes of each register.
// v_buf is used as an index added to u_buf; presumably callers pass it as the
// offset of the V plane from the U plane - confirm at the call sites.
#define READYUV411                                      \
    "pinsrw $0x0,(%[u_buf]),%%xmm0 \n"                  \
    "pinsrw $0x0,(%[u_buf],%[v_buf],1),%%xmm1 \n"       \
    "lea 0x2(%[u_buf]),%[u_buf] \n"                     \
    "punpcklbw %%xmm1,%%xmm0 \n"                        \
    "punpcklwd %%xmm0,%%xmm0 \n"                        \
    "punpckldq %%xmm0,%%xmm0 \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001709
// Read 4 UV from NV12, upsample to 8 UV.
// In:  %[uv_buf] points at interleaved chroma byte pairs.
// Out: xmm0 = P0 P0 P1 P1 P2 P2 P3 P3, where Pn is the n-th 2-byte chroma
//      pair exactly as stored (the macro does not care whether a pair is U,V
//      or V,U; the conversion macro that follows decides that).
// Touches only xmm0.  Advances uv_buf by 8.
// The last line deliberately has no continuation backslash, so the macro ends
// here no matter what the next source line is.
#define READNV12 \
  "movq (%[uv_buf]),%%xmm0 \n" \
  "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
  "punpcklwd %%xmm0,%%xmm0 \n"

// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 interleaved U,V byte pairs (from one of the READ* macros),
//      xmm4 = 0 (used to zero-extend Y bytes to words),
//      %[y_buf] points at 8 Y bytes,
//      %[kYuvConstants] is the address passed by the caller
//      (&kYuvConstants.kUVToB); the table is read at 16-byte strides:
//        +0 / +16 / +32   U,V multipliers for B / G / R
//        +48 / +64 / +80  per-channel bias subtracted after the multiply
//        +96              value subtracted from Y (saturating)
//        +112             Y multiplier
// Out: low 8 bytes of xmm0 / xmm1 / xmm2 = B / G / R, saturated to 0..255
//      after an arithmetic shift right by 6.
// Clobbers xmm3.  Advances y_buf by 8.  Does not touch xmm5.
// The last line deliberately has no continuation backslash, so the macro ends
// here no matter what the next source line is.
#define YUVTORGB \
  "movdqa %%xmm0,%%xmm1 \n" \
  "movdqa %%xmm0,%%xmm2 \n" \
  "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
  "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
  "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
  "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
  "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
  "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
  "movq (%[y_buf]),%%xmm3 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n" \
  "punpcklbw %%xmm4,%%xmm3 \n" \
  "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
  "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
  "paddsw %%xmm3,%%xmm0 \n" \
  "paddsw %%xmm3,%%xmm1 \n" \
  "paddsw %%xmm3,%%xmm2 \n" \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"

// Convert 8 pixels: 8 VU and 8 Y.
// Same arithmetic as the U,V-ordered conversion, but the chroma multipliers
// are taken from table offsets +128 / +144 / +160 (the rows whose entries are
// listed V-first), so xmm0 must hold V,U byte pairs.
// In:  xmm0 = 8 interleaved V,U byte pairs, xmm4 = 0,
//      %[y_buf] points at 8 Y bytes,
//      %[kYuvConstants] is the address passed by the caller
//      (&kYuvConstants.kUVToB); bias (+48/+64/+80), Y offset (+96) and
//      Y multiplier (+112) are shared with the U,V-ordered path.
// Out: low 8 bytes of xmm0 / xmm1 / xmm2 = B / G / R, saturated to 0..255
//      after an arithmetic shift right by 6.
// Clobbers xmm3.  Advances y_buf by 8.  Does not touch xmm5.
// The last line deliberately has no continuation backslash, so the macro ends
// here no matter what the next source line is.
#define YVUTORGB \
  "movdqa %%xmm0,%%xmm1 \n" \
  "movdqa %%xmm0,%%xmm2 \n" \
  "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
  "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
  "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
  "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
  "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
  "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
  "movq (%[y_buf]),%%xmm3 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n" \
  "punpcklbw %%xmm4,%%xmm3 \n" \
  "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
  "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
  "paddsw %%xmm3,%%xmm0 \n" \
  "paddsw %%xmm3,%%xmm1 \n" \
  "paddsw %%xmm3,%%xmm2 \n" \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"

1766void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001767 const uint8* u_buf,
1768 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001769 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001770 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001771 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001772 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001773 "pcmpeqb %%xmm5,%%xmm5 \n"
1774 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001775 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001776 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001777 READYUV444
1778 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001779 "punpcklbw %%xmm1,%%xmm0 \n"
1780 "punpcklbw %%xmm5,%%xmm2 \n"
1781 "movdqa %%xmm0,%%xmm1 \n"
1782 "punpcklwd %%xmm2,%%xmm0 \n"
1783 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001784 "movdqa %%xmm0,(%[dst_argb]) \n"
1785 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
1786 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001787 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001788 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001789 : [y_buf]"+r"(y_buf), // %[y_buf]
1790 [u_buf]"+r"(u_buf), // %[u_buf]
1791 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001792 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001793 [width]"+rm"(width) // %[width]
1794 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001795 : "memory", "cc"
1796#if defined(__SSE2__)
1797 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1798#endif
1799 );
1800}
1801
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001802void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1803 const uint8* u_buf,
1804 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001805 uint8* dst_rgb24,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001806 int width) {
1807// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
1808#ifdef __APPLE__
1809 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001810 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1811 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1812 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1813 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001814#endif
1815
1816 asm volatile (
1817#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001818 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1819 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001820#endif
1821 "sub %[u_buf],%[v_buf] \n"
1822 "pxor %%xmm4,%%xmm4 \n"
1823 ".p2align 4 \n"
1824 "1: \n"
1825 READYUV422
1826 YUVTORGB
1827 "punpcklbw %%xmm1,%%xmm0 \n"
1828 "punpcklbw %%xmm2,%%xmm2 \n"
1829 "movdqa %%xmm0,%%xmm1 \n"
1830 "punpcklwd %%xmm2,%%xmm0 \n"
1831 "punpckhwd %%xmm2,%%xmm1 \n"
1832 "pshufb %%xmm5,%%xmm0 \n"
1833 "pshufb %%xmm6,%%xmm1 \n"
1834 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001835 "movq %%xmm0,(%[dst_rgb24]) \n"
1836 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
1837 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001838 "sub $0x8,%[width] \n"
1839 "jg 1b \n"
1840 : [y_buf]"+r"(y_buf), // %[y_buf]
1841 [u_buf]"+r"(u_buf), // %[u_buf]
1842 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001843 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001844 [width]"+rm"(width) // %[width]
1845 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1846#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001847 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1848 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001849#endif
1850 : "memory", "cc"
1851#if defined(__SSE2__)
1852 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1853#endif
1854 );
1855}
1856
1857void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1858 const uint8* u_buf,
1859 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001860 uint8* dst_raw,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001861 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001862// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001863#ifdef __APPLE__
1864 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001865 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1866 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1867 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1868 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001869#endif
1870
1871 asm volatile (
1872#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001873 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1874 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001875#endif
1876 "sub %[u_buf],%[v_buf] \n"
1877 "pxor %%xmm4,%%xmm4 \n"
1878 ".p2align 4 \n"
1879 "1: \n"
1880 READYUV422
1881 YUVTORGB
1882 "punpcklbw %%xmm1,%%xmm0 \n"
1883 "punpcklbw %%xmm2,%%xmm2 \n"
1884 "movdqa %%xmm0,%%xmm1 \n"
1885 "punpcklwd %%xmm2,%%xmm0 \n"
1886 "punpckhwd %%xmm2,%%xmm1 \n"
1887 "pshufb %%xmm5,%%xmm0 \n"
1888 "pshufb %%xmm6,%%xmm1 \n"
1889 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001890 "movq %%xmm0,(%[dst_raw]) \n"
1891 "movdqu %%xmm1,0x8(%[dst_raw]) \n"
1892 "lea 0x18(%[dst_raw]),%[dst_raw] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001893 "sub $0x8,%[width] \n"
1894 "jg 1b \n"
1895 : [y_buf]"+r"(y_buf), // %[y_buf]
1896 [u_buf]"+r"(u_buf), // %[u_buf]
1897 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001898 [dst_raw]"+r"(dst_raw), // %[dst_raw]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001899 [width]"+rm"(width) // %[width]
1900 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1901#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001902 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1903 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001904#endif
1905 : "memory", "cc"
1906#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001907 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001908#endif
1909 );
1910}
1911
fbarchard@google.come214fe32012-06-04 23:47:11 +00001912void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001913 const uint8* u_buf,
1914 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001915 uint8* dst_argb,
fbarchard@google.comdbcabea2012-10-29 21:20:25 +00001916 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001917 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001918 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001919 "pcmpeqb %%xmm5,%%xmm5 \n"
1920 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001921 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001922 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001923 READYUV422
1924 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001925 "punpcklbw %%xmm1,%%xmm0 \n"
1926 "punpcklbw %%xmm5,%%xmm2 \n"
1927 "movdqa %%xmm0,%%xmm1 \n"
1928 "punpcklwd %%xmm2,%%xmm0 \n"
1929 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001930 "movdqa %%xmm0,(%[dst_argb]) \n"
1931 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
1932 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001933 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001934 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001935 : [y_buf]"+r"(y_buf), // %[y_buf]
1936 [u_buf]"+r"(u_buf), // %[u_buf]
1937 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001938 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001939 [width]"+rm"(width) // %[width]
1940 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001941 : "memory", "cc"
1942#if defined(__SSE2__)
1943 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1944#endif
1945 );
1946}
1947
1948void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1949 const uint8* u_buf,
1950 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001951 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001952 int width) {
1953 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001954 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001955 "pcmpeqb %%xmm5,%%xmm5 \n"
1956 "pxor %%xmm4,%%xmm4 \n"
1957 ".p2align 4 \n"
1958 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001959 READYUV411
1960 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001961 "punpcklbw %%xmm1,%%xmm0 \n"
1962 "punpcklbw %%xmm5,%%xmm2 \n"
1963 "movdqa %%xmm0,%%xmm1 \n"
1964 "punpcklwd %%xmm2,%%xmm0 \n"
1965 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001966 "movdqa %%xmm0,(%[dst_argb]) \n"
1967 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
1968 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001969 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001970 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001971 : [y_buf]"+r"(y_buf), // %[y_buf]
1972 [u_buf]"+r"(u_buf), // %[u_buf]
1973 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001974 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001975 [width]"+rm"(width) // %[width]
1976 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1977 : "memory", "cc"
1978#if defined(__SSE2__)
1979 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1980#endif
1981 );
1982}
1983
1984void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1985 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001986 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001987 int width) {
1988 asm volatile (
1989 "pcmpeqb %%xmm5,%%xmm5 \n"
1990 "pxor %%xmm4,%%xmm4 \n"
1991 ".p2align 4 \n"
1992 "1: \n"
1993 READNV12
1994 YUVTORGB
1995 "punpcklbw %%xmm1,%%xmm0 \n"
1996 "punpcklbw %%xmm5,%%xmm2 \n"
1997 "movdqa %%xmm0,%%xmm1 \n"
1998 "punpcklwd %%xmm2,%%xmm0 \n"
1999 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002000 "movdqa %%xmm0,(%[dst_argb]) \n"
2001 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2002 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002003 "sub $0x8,%[width] \n"
2004 "jg 1b \n"
2005 : [y_buf]"+r"(y_buf), // %[y_buf]
2006 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002007 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002008 [width]"+rm"(width) // %[width]
2009 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2010 : "memory", "cc"
2011#if defined(__SSE2__)
2012 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2013#endif
2014 );
2015}
2016
2017void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002018 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002019 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002020 int width) {
2021 asm volatile (
2022 "pcmpeqb %%xmm5,%%xmm5 \n"
2023 "pxor %%xmm4,%%xmm4 \n"
2024 ".p2align 4 \n"
2025 "1: \n"
2026 READNV12
2027 YVUTORGB
2028 "punpcklbw %%xmm1,%%xmm0 \n"
2029 "punpcklbw %%xmm5,%%xmm2 \n"
2030 "movdqa %%xmm0,%%xmm1 \n"
2031 "punpcklwd %%xmm2,%%xmm0 \n"
2032 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002033 "movdqa %%xmm0,(%[dst_argb]) \n"
2034 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2035 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002036 "sub $0x8,%[width] \n"
2037 "jg 1b \n"
2038 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002039 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2040 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002041 [width]"+rm"(width) // %[width]
2042 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002043 : "memory", "cc"
2044#if defined(__SSE2__)
2045 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2046#endif
2047 );
2048}
2049
2050void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2051 const uint8* u_buf,
2052 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002053 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002054 int width) {
2055 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002056 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002057 "pcmpeqb %%xmm5,%%xmm5 \n"
2058 "pxor %%xmm4,%%xmm4 \n"
2059 ".p2align 4 \n"
2060 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002061 READYUV444
2062 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002063 "punpcklbw %%xmm1,%%xmm0 \n"
2064 "punpcklbw %%xmm5,%%xmm2 \n"
2065 "movdqa %%xmm0,%%xmm1 \n"
2066 "punpcklwd %%xmm2,%%xmm0 \n"
2067 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002068 "movdqu %%xmm0,(%[dst_argb]) \n"
2069 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2070 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002071 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002072 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002073 : [y_buf]"+r"(y_buf), // %[y_buf]
2074 [u_buf]"+r"(u_buf), // %[u_buf]
2075 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002076 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002077 [width]"+rm"(width) // %[width]
2078 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002079 : "memory", "cc"
2080#if defined(__SSE2__)
2081 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2082#endif
2083 );
2084}
2085
2086void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2087 const uint8* u_buf,
2088 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002089 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002090 int width) {
2091 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002092 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002093 "pcmpeqb %%xmm5,%%xmm5 \n"
2094 "pxor %%xmm4,%%xmm4 \n"
2095 ".p2align 4 \n"
2096 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002097 READYUV422
2098 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002099 "punpcklbw %%xmm1,%%xmm0 \n"
2100 "punpcklbw %%xmm5,%%xmm2 \n"
2101 "movdqa %%xmm0,%%xmm1 \n"
2102 "punpcklwd %%xmm2,%%xmm0 \n"
2103 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002104 "movdqu %%xmm0,(%[dst_argb]) \n"
2105 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2106 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002107 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002108 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002109 : [y_buf]"+r"(y_buf), // %[y_buf]
2110 [u_buf]"+r"(u_buf), // %[u_buf]
2111 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002112 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002113 [width]"+rm"(width) // %[width]
2114 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002115 : "memory", "cc"
2116#if defined(__SSE2__)
2117 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2118#endif
2119 );
2120}
2121
2122void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2123 const uint8* u_buf,
2124 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002125 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002126 int width) {
2127 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002128 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002129 "pcmpeqb %%xmm5,%%xmm5 \n"
2130 "pxor %%xmm4,%%xmm4 \n"
2131 ".p2align 4 \n"
2132 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002133 READYUV411
2134 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002135 "punpcklbw %%xmm1,%%xmm0 \n"
2136 "punpcklbw %%xmm5,%%xmm2 \n"
2137 "movdqa %%xmm0,%%xmm1 \n"
2138 "punpcklwd %%xmm2,%%xmm0 \n"
2139 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002140 "movdqu %%xmm0,(%[dst_argb]) \n"
2141 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2142 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002143 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002144 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002145 : [y_buf]"+r"(y_buf), // %[y_buf]
2146 [u_buf]"+r"(u_buf), // %[u_buf]
2147 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002148 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002149 [width]"+rm"(width) // %[width]
2150 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2151 : "memory", "cc"
2152#if defined(__SSE2__)
2153 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2154#endif
2155 );
2156}
2157
2158void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2159 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002160 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002161 int width) {
2162 asm volatile (
2163 "pcmpeqb %%xmm5,%%xmm5 \n"
2164 "pxor %%xmm4,%%xmm4 \n"
2165 ".p2align 4 \n"
2166 "1: \n"
2167 READNV12
2168 YUVTORGB
2169 "punpcklbw %%xmm1,%%xmm0 \n"
2170 "punpcklbw %%xmm5,%%xmm2 \n"
2171 "movdqa %%xmm0,%%xmm1 \n"
2172 "punpcklwd %%xmm2,%%xmm0 \n"
2173 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002174 "movdqu %%xmm0,(%[dst_argb]) \n"
2175 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2176 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002177 "sub $0x8,%[width] \n"
2178 "jg 1b \n"
2179 : [y_buf]"+r"(y_buf), // %[y_buf]
2180 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002181 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002182 [width]"+rm"(width) // %[width]
2183 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2184 : "memory", "cc"
2185#if defined(__SSE2__)
2186 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2187#endif
2188 );
2189}
2190
2191void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002192 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002193 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002194 int width) {
2195 asm volatile (
2196 "pcmpeqb %%xmm5,%%xmm5 \n"
2197 "pxor %%xmm4,%%xmm4 \n"
2198 ".p2align 4 \n"
2199 "1: \n"
2200 READNV12
2201 YVUTORGB
2202 "punpcklbw %%xmm1,%%xmm0 \n"
2203 "punpcklbw %%xmm5,%%xmm2 \n"
2204 "movdqa %%xmm0,%%xmm1 \n"
2205 "punpcklwd %%xmm2,%%xmm0 \n"
2206 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002207 "movdqu %%xmm0,(%[dst_argb]) \n"
2208 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2209 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002210 "sub $0x8,%[width] \n"
2211 "jg 1b \n"
2212 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002213 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2214 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002215 [width]"+rm"(width) // %[width]
2216 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002217 : "memory", "cc"
2218#if defined(__SSE2__)
2219 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2220#endif
2221 );
2222}
2223
2224void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2225 const uint8* u_buf,
2226 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002227 uint8* dst_bgra,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002228 int width) {
2229 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002230 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002231 "pcmpeqb %%xmm5,%%xmm5 \n"
2232 "pxor %%xmm4,%%xmm4 \n"
2233 ".p2align 4 \n"
2234 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002235 READYUV422
2236 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002237 "pcmpeqb %%xmm5,%%xmm5 \n"
2238 "punpcklbw %%xmm0,%%xmm1 \n"
2239 "punpcklbw %%xmm2,%%xmm5 \n"
2240 "movdqa %%xmm5,%%xmm0 \n"
2241 "punpcklwd %%xmm1,%%xmm5 \n"
2242 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002243 "movdqa %%xmm5,(%[dst_bgra]) \n"
2244 "movdqa %%xmm0,0x10(%[dst_bgra]) \n"
2245 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002246 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002247 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002248 : [y_buf]"+r"(y_buf), // %[y_buf]
2249 [u_buf]"+r"(u_buf), // %[u_buf]
2250 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002251 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002252 [width]"+rm"(width) // %[width]
2253 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002254 : "memory", "cc"
2255#if defined(__SSE2__)
2256 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2257#endif
2258 );
2259}
2260
fbarchard@google.come214fe32012-06-04 23:47:11 +00002261void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002262 const uint8* u_buf,
2263 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002264 uint8* dst_abgr,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002265 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002266 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002267 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002268 "pcmpeqb %%xmm5,%%xmm5 \n"
2269 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002270 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002271 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002272 READYUV422
2273 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002274 "punpcklbw %%xmm1,%%xmm2 \n"
2275 "punpcklbw %%xmm5,%%xmm0 \n"
2276 "movdqa %%xmm2,%%xmm1 \n"
2277 "punpcklwd %%xmm0,%%xmm2 \n"
2278 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002279 "movdqa %%xmm2,(%[dst_abgr]) \n"
2280 "movdqa %%xmm1,0x10(%[dst_abgr]) \n"
2281 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002282 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002283 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002284 : [y_buf]"+r"(y_buf), // %[y_buf]
2285 [u_buf]"+r"(u_buf), // %[u_buf]
2286 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002287 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002288 [width]"+rm"(width) // %[width]
2289 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002290 : "memory", "cc"
2291#if defined(__SSE2__)
2292 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2293#endif
2294 );
2295}
2296
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002297void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2298 const uint8* u_buf,
2299 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002300 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002301 int width) {
2302 asm volatile (
2303 "sub %[u_buf],%[v_buf] \n"
2304 "pcmpeqb %%xmm5,%%xmm5 \n"
2305 "pxor %%xmm4,%%xmm4 \n"
2306 ".p2align 4 \n"
2307 "1: \n"
2308 READYUV422
2309 YUVTORGB
2310 "pcmpeqb %%xmm5,%%xmm5 \n"
2311 "punpcklbw %%xmm2,%%xmm1 \n"
2312 "punpcklbw %%xmm0,%%xmm5 \n"
2313 "movdqa %%xmm5,%%xmm0 \n"
2314 "punpcklwd %%xmm1,%%xmm5 \n"
2315 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002316 "movdqa %%xmm5,(%[dst_rgba]) \n"
2317 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2318 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002319 "sub $0x8,%[width] \n"
2320 "jg 1b \n"
2321 : [y_buf]"+r"(y_buf), // %[y_buf]
2322 [u_buf]"+r"(u_buf), // %[u_buf]
2323 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002324 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002325 [width]"+rm"(width) // %[width]
2326 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2327 : "memory", "cc"
2328#if defined(__SSE2__)
2329 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2330#endif
2331 );
2332}
2333
fbarchard@google.come214fe32012-06-04 23:47:11 +00002334void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002335 const uint8* u_buf,
2336 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002337 uint8* dst_bgra,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002338 int width) {
2339 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002340 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002341 "pcmpeqb %%xmm5,%%xmm5 \n"
2342 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002343 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002344 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002345 READYUV422
2346 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002347 "pcmpeqb %%xmm5,%%xmm5 \n"
2348 "punpcklbw %%xmm0,%%xmm1 \n"
2349 "punpcklbw %%xmm2,%%xmm5 \n"
2350 "movdqa %%xmm5,%%xmm0 \n"
2351 "punpcklwd %%xmm1,%%xmm5 \n"
2352 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002353 "movdqu %%xmm5,(%[dst_bgra]) \n"
2354 "movdqu %%xmm0,0x10(%[dst_bgra]) \n"
2355 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002356 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002357 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002358 : [y_buf]"+r"(y_buf), // %[y_buf]
2359 [u_buf]"+r"(u_buf), // %[u_buf]
2360 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002361 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002362 [width]"+rm"(width) // %[width]
2363 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002364 : "memory", "cc"
2365#if defined(__SSE2__)
2366 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2367#endif
2368 );
2369}
2370
fbarchard@google.come214fe32012-06-04 23:47:11 +00002371void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002372 const uint8* u_buf,
2373 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002374 uint8* dst_abgr,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002375 int width) {
2376 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002377 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002378 "pcmpeqb %%xmm5,%%xmm5 \n"
2379 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002380 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002381 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002382 READYUV422
2383 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002384 "punpcklbw %%xmm1,%%xmm2 \n"
2385 "punpcklbw %%xmm5,%%xmm0 \n"
2386 "movdqa %%xmm2,%%xmm1 \n"
2387 "punpcklwd %%xmm0,%%xmm2 \n"
2388 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002389 "movdqu %%xmm2,(%[dst_abgr]) \n"
2390 "movdqu %%xmm1,0x10(%[dst_abgr]) \n"
2391 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002392 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002393 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002394 : [y_buf]"+r"(y_buf), // %[y_buf]
2395 [u_buf]"+r"(u_buf), // %[u_buf]
2396 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002397 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002398 [width]"+rm"(width) // %[width]
2399 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002400 : "memory", "cc"
2401#if defined(__SSE2__)
2402 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2403#endif
2404 );
2405}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002406
2407void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2408 const uint8* u_buf,
2409 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002410 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002411 int width) {
2412 asm volatile (
2413 "sub %[u_buf],%[v_buf] \n"
2414 "pcmpeqb %%xmm5,%%xmm5 \n"
2415 "pxor %%xmm4,%%xmm4 \n"
2416 ".p2align 4 \n"
2417 "1: \n"
2418 READYUV422
2419 YUVTORGB
2420 "pcmpeqb %%xmm5,%%xmm5 \n"
2421 "punpcklbw %%xmm2,%%xmm1 \n"
2422 "punpcklbw %%xmm0,%%xmm5 \n"
2423 "movdqa %%xmm5,%%xmm0 \n"
2424 "punpcklwd %%xmm1,%%xmm5 \n"
2425 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002426 "movdqa %%xmm5,(%[dst_rgba]) \n"
2427 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2428 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002429 "sub $0x8,%[width] \n"
2430 "jg 1b \n"
2431 : [y_buf]"+r"(y_buf), // %[y_buf]
2432 [u_buf]"+r"(u_buf), // %[u_buf]
2433 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002434 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002435 [width]"+rm"(width) // %[width]
2436 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2437 : "memory", "cc"
2438#if defined(__SSE2__)
2439 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2440#endif
2441 );
2442}
2443
fbarchard@google.come214fe32012-06-04 23:47:11 +00002444#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002445
2446#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002447void YToARGBRow_SSE2(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002448 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002449 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002450 asm volatile (
fbarchard@google.com30859f72012-11-02 09:51:29 +00002451 "pxor %%xmm5,%%xmm5 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002452 "pcmpeqb %%xmm4,%%xmm4 \n"
2453 "pslld $0x18,%%xmm4 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002454 "mov $0x00100010,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002455 "movd %%eax,%%xmm3 \n"
2456 "pshufd $0x0,%%xmm3,%%xmm3 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002457 "mov $0x004a004a,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002458 "movd %%eax,%%xmm2 \n"
2459 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002460 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002461 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002462 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002463 "movq (%0),%%xmm0 \n"
2464 "lea 0x8(%0),%0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002465 "punpcklbw %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002466 "psubusw %%xmm3,%%xmm0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002467 "pmullw %%xmm2,%%xmm0 \n"
2468 "psrlw $6, %%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002469 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002470
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002471 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002472 "punpcklbw %%xmm0,%%xmm0 \n"
2473 "movdqa %%xmm0,%%xmm1 \n"
2474 "punpcklwd %%xmm0,%%xmm0 \n"
2475 "punpckhwd %%xmm1,%%xmm1 \n"
2476 "por %%xmm4,%%xmm0 \n"
2477 "por %%xmm4,%%xmm1 \n"
2478 "movdqa %%xmm0,(%1) \n"
2479 "movdqa %%xmm1,16(%1) \n"
2480 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002481
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002482 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002483 "jg 1b \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002484 : "+r"(y_buf), // %0
2485 "+r"(dst_argb), // %1
2486 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002487 :
2488 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002489#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002490 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002491#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002492 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002493}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002494#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002495
fbarchard@google.com42831e02012-01-21 02:54:17 +00002496#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002497// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002498CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002499 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2500};
2501
fbarchard@google.com42831e02012-01-21 02:54:17 +00002502void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002503 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002504 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002505 "movdqa %3,%%xmm5 \n"
2506 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002507 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002508 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002509 "movdqa (%0,%2),%%xmm0 \n"
2510 "pshufb %%xmm5,%%xmm0 \n"
2511 "sub $0x10,%2 \n"
2512 "movdqa %%xmm0,(%1) \n"
2513 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002514 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002515 : "+r"(src), // %0
2516 "+r"(dst), // %1
2517 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002518 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002519 : "memory", "cc"
2520#if defined(__SSE2__)
2521 , "xmm0", "xmm5"
2522#endif
2523 );
2524}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002525#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002526
fbarchard@google.com42831e02012-01-21 02:54:17 +00002527#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002528void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002529 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002530 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002531 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002532 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002533 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002534 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002535 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002536 "psllw $0x8,%%xmm0 \n"
2537 "psrlw $0x8,%%xmm1 \n"
2538 "por %%xmm1,%%xmm0 \n"
2539 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2540 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2541 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2542 "sub $0x10,%2 \n"
2543 "movdqu %%xmm0,(%1) \n"
2544 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002545 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002546 : "+r"(src), // %0
2547 "+r"(dst), // %1
2548 "+r"(temp_width) // %2
2549 :
2550 : "memory", "cc"
2551#if defined(__SSE2__)
2552 , "xmm0", "xmm1"
2553#endif
2554 );
2555}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002556#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002557
fbarchard@google.com16a96642012-03-02 22:38:09 +00002558#ifdef HAS_MIRRORROW_UV_SSSE3
2559// Shuffle table for reversing the bytes of UV channels.
2560CONST uvec8 kShuffleMirrorUV = {
2561 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2562};
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002563void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
fbarchard@google.com16a96642012-03-02 22:38:09 +00002564 int width) {
2565 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002566 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002567 "movdqa %4,%%xmm1 \n"
2568 "lea -16(%0,%3,2),%0 \n"
2569 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002570 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002571 "1: \n"
2572 "movdqa (%0),%%xmm0 \n"
2573 "lea -16(%0),%0 \n"
2574 "pshufb %%xmm1,%%xmm0 \n"
2575 "sub $8,%3 \n"
2576 "movlpd %%xmm0,(%1) \n"
2577 "movhpd %%xmm0,(%1,%2) \n"
2578 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002579 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002580 : "+r"(src), // %0
2581 "+r"(dst_u), // %1
2582 "+r"(dst_v), // %2
2583 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002584 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002585 : "memory", "cc"
2586#if defined(__SSE2__)
2587 , "xmm0", "xmm1"
2588#endif
2589 );
2590}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002591#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002592
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002593#ifdef HAS_ARGBMIRRORROW_SSSE3
2594// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002595CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002596 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2597};
2598
2599void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2600 intptr_t temp_width = static_cast<intptr_t>(width);
2601 asm volatile (
2602 "movdqa %3,%%xmm5 \n"
2603 "lea -0x10(%0),%0 \n"
2604 ".p2align 4 \n"
2605 "1: \n"
2606 "movdqa (%0,%2,4),%%xmm0 \n"
2607 "pshufb %%xmm5,%%xmm0 \n"
2608 "sub $0x4,%2 \n"
2609 "movdqa %%xmm0,(%1) \n"
2610 "lea 0x10(%1),%1 \n"
2611 "jg 1b \n"
2612 : "+r"(src), // %0
2613 "+r"(dst), // %1
2614 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002615 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002616 : "memory", "cc"
2617#if defined(__SSE2__)
2618 , "xmm0", "xmm5"
2619#endif
2620 );
2621}
2622#endif // HAS_ARGBMIRRORROW_SSSE3
2623
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002624#ifdef HAS_SPLITUVROW_SSE2
2625void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002626 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002627 "pcmpeqb %%xmm5,%%xmm5 \n"
2628 "psrlw $0x8,%%xmm5 \n"
2629 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002630 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002631 "1: \n"
2632 "movdqa (%0),%%xmm0 \n"
2633 "movdqa 0x10(%0),%%xmm1 \n"
2634 "lea 0x20(%0),%0 \n"
2635 "movdqa %%xmm0,%%xmm2 \n"
2636 "movdqa %%xmm1,%%xmm3 \n"
2637 "pand %%xmm5,%%xmm0 \n"
2638 "pand %%xmm5,%%xmm1 \n"
2639 "packuswb %%xmm1,%%xmm0 \n"
2640 "psrlw $0x8,%%xmm2 \n"
2641 "psrlw $0x8,%%xmm3 \n"
2642 "packuswb %%xmm3,%%xmm2 \n"
2643 "movdqa %%xmm0,(%1) \n"
2644 "movdqa %%xmm2,(%1,%2) \n"
2645 "lea 0x10(%1),%1 \n"
2646 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002647 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002648 : "+r"(src_uv), // %0
2649 "+r"(dst_u), // %1
2650 "+r"(dst_v), // %2
2651 "+r"(pix) // %3
2652 :
2653 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002654#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002655 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002656#endif
2657 );
2658}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002659
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002660void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2661 int pix) {
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002662 asm volatile (
2663 "pcmpeqb %%xmm5,%%xmm5 \n"
2664 "psrlw $0x8,%%xmm5 \n"
2665 "sub %1,%2 \n"
2666 ".p2align 4 \n"
2667 "1: \n"
2668 "movdqu (%0),%%xmm0 \n"
2669 "movdqu 0x10(%0),%%xmm1 \n"
2670 "lea 0x20(%0),%0 \n"
2671 "movdqa %%xmm0,%%xmm2 \n"
2672 "movdqa %%xmm1,%%xmm3 \n"
2673 "pand %%xmm5,%%xmm0 \n"
2674 "pand %%xmm5,%%xmm1 \n"
2675 "packuswb %%xmm1,%%xmm0 \n"
2676 "psrlw $0x8,%%xmm2 \n"
2677 "psrlw $0x8,%%xmm3 \n"
2678 "packuswb %%xmm3,%%xmm2 \n"
2679 "movdqu %%xmm0,(%1) \n"
2680 "movdqu %%xmm2,(%1,%2) \n"
2681 "lea 0x10(%1),%1 \n"
2682 "sub $0x10,%3 \n"
2683 "jg 1b \n"
2684 : "+r"(src_uv), // %0
2685 "+r"(dst_u), // %1
2686 "+r"(dst_v), // %2
2687 "+r"(pix) // %3
2688 :
2689 : "memory", "cc"
2690#if defined(__SSE2__)
2691 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2692#endif
2693 );
2694}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002695#endif // HAS_SPLITUVROW_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002696
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002697#ifdef HAS_MERGEUVROW_SSE2
2698void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2699 int width) {
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002700 asm volatile (
2701 "sub %0,%1 \n"
2702 ".p2align 4 \n"
2703 "1: \n"
2704 "movdqa (%0),%%xmm0 \n"
2705 "movdqa (%0,%1,1),%%xmm1 \n"
2706 "lea 0x10(%0),%0 \n"
2707 "movdqa %%xmm0,%%xmm2 \n"
2708 "punpcklbw %%xmm1,%%xmm0 \n"
2709 "punpckhbw %%xmm1,%%xmm2 \n"
2710 "movdqa %%xmm0,(%2) \n"
2711 "movdqa %%xmm2,0x10(%2) \n"
2712 "lea 0x20(%2),%2 \n"
2713 "sub $0x10,%3 \n"
2714 "jg 1b \n"
2715 : "+r"(src_u), // %0
2716 "+r"(src_v), // %1
2717 "+r"(dst_uv), // %2
2718 "+r"(width) // %3
2719 :
2720 : "memory", "cc"
2721#if defined(__SSE2__)
2722 , "xmm0", "xmm1", "xmm2"
2723#endif
2724 );
2725}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002726
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002727void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2728 uint8* dst_uv, int width) {
fbarchard@google.come0d86482012-10-27 19:07:55 +00002729 asm volatile (
2730 "sub %0,%1 \n"
2731 ".p2align 4 \n"
2732 "1: \n"
2733 "movdqu (%0),%%xmm0 \n"
2734 "movdqu (%0,%1,1),%%xmm1 \n"
2735 "lea 0x10(%0),%0 \n"
2736 "movdqa %%xmm0,%%xmm2 \n"
2737 "punpcklbw %%xmm1,%%xmm0 \n"
2738 "punpckhbw %%xmm1,%%xmm2 \n"
2739 "movdqu %%xmm0,(%2) \n"
2740 "movdqu %%xmm2,0x10(%2) \n"
2741 "lea 0x20(%2),%2 \n"
2742 "sub $0x10,%3 \n"
2743 "jg 1b \n"
2744 : "+r"(src_u), // %0
2745 "+r"(src_v), // %1
2746 "+r"(dst_uv), // %2
2747 "+r"(width) // %3
2748 :
2749 : "memory", "cc"
2750#if defined(__SSE2__)
2751 , "xmm0", "xmm1", "xmm2"
2752#endif
2753 );
2754}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002755#endif // HAS_MERGEUVROW_SSE2
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002756
fbarchard@google.com19932f82012-02-16 22:19:14 +00002757#ifdef HAS_COPYROW_SSE2
2758void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002759 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002760 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002761 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002762 "1: \n"
2763 "movdqa (%0),%%xmm0 \n"
2764 "movdqa 0x10(%0),%%xmm1 \n"
2765 "movdqa %%xmm0,(%0,%1) \n"
2766 "movdqa %%xmm1,0x10(%0,%1) \n"
2767 "lea 0x20(%0),%0 \n"
2768 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002769 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002770 : "+r"(src), // %0
2771 "+r"(dst), // %1
2772 "+r"(count) // %2
2773 :
2774 : "memory", "cc"
2775#if defined(__SSE2__)
2776 , "xmm0", "xmm1"
2777#endif
2778 );
2779}
2780#endif // HAS_COPYROW_SSE2
2781
2782#ifdef HAS_COPYROW_X86
2783void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2784 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002785 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002786 "shr $0x2,%2 \n"
2787 "rep movsl \n"
2788 : "+S"(src), // %0
2789 "+D"(dst), // %1
2790 "+c"(width_tmp) // %2
2791 :
2792 : "memory", "cc"
2793 );
2794}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002795#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002796
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002797#ifdef HAS_SETROW_X86
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002798void SetRow_X86(uint8* dst, uint32 v32, int width) {
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002799 size_t width_tmp = static_cast<size_t>(width);
2800 asm volatile (
2801 "shr $0x2,%1 \n"
2802 "rep stosl \n"
2803 : "+D"(dst), // %0
2804 "+c"(width_tmp) // %1
2805 : "a"(v32) // %2
2806 : "memory", "cc");
2807}
2808
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002809void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002810 int dst_stride, int height) {
2811 for (int y = 0; y < height; ++y) {
2812 size_t width_tmp = static_cast<size_t>(width);
2813 uint32* d = reinterpret_cast<uint32*>(dst);
2814 asm volatile (
2815 "rep stosl \n"
2816 : "+D"(d), // %0
2817 "+c"(width_tmp) // %1
2818 : "a"(v32) // %2
2819 : "memory", "cc");
2820 dst += dst_stride;
2821 }
2822}
2823#endif // HAS_SETROW_X86
2824
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002825#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002826void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002827 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002828 "pcmpeqb %%xmm5,%%xmm5 \n"
2829 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002830 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002831 "1: \n"
2832 "movdqa (%0),%%xmm0 \n"
2833 "movdqa 0x10(%0),%%xmm1 \n"
2834 "lea 0x20(%0),%0 \n"
2835 "pand %%xmm5,%%xmm0 \n"
2836 "pand %%xmm5,%%xmm1 \n"
2837 "packuswb %%xmm1,%%xmm0 \n"
2838 "movdqa %%xmm0,(%1) \n"
2839 "lea 0x10(%1),%1 \n"
2840 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002841 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002842 : "+r"(src_yuy2), // %0
2843 "+r"(dst_y), // %1
2844 "+r"(pix) // %2
2845 :
2846 : "memory", "cc"
2847#if defined(__SSE2__)
2848 , "xmm0", "xmm1", "xmm5"
2849#endif
2850 );
2851}
2852
2853void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002854 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002855 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002856 "pcmpeqb %%xmm5,%%xmm5 \n"
2857 "psrlw $0x8,%%xmm5 \n"
2858 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002859 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002860 "1: \n"
2861 "movdqa (%0),%%xmm0 \n"
2862 "movdqa 0x10(%0),%%xmm1 \n"
2863 "movdqa (%0,%4,1),%%xmm2 \n"
2864 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2865 "lea 0x20(%0),%0 \n"
2866 "pavgb %%xmm2,%%xmm0 \n"
2867 "pavgb %%xmm3,%%xmm1 \n"
2868 "psrlw $0x8,%%xmm0 \n"
2869 "psrlw $0x8,%%xmm1 \n"
2870 "packuswb %%xmm1,%%xmm0 \n"
2871 "movdqa %%xmm0,%%xmm1 \n"
2872 "pand %%xmm5,%%xmm0 \n"
2873 "packuswb %%xmm0,%%xmm0 \n"
2874 "psrlw $0x8,%%xmm1 \n"
2875 "packuswb %%xmm1,%%xmm1 \n"
2876 "movq %%xmm0,(%1) \n"
2877 "movq %%xmm1,(%1,%2) \n"
2878 "lea 0x8(%1),%1 \n"
2879 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002880 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002881 : "+r"(src_yuy2), // %0
2882 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002883 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002884 "+r"(pix) // %3
2885 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2886 : "memory", "cc"
2887#if defined(__SSE2__)
2888 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2889#endif
2890 );
2891}
2892
fbarchard@google.comc704f782012-08-30 19:53:48 +00002893void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2894 uint8* dst_u, uint8* dst_v, int pix) {
2895 asm volatile (
2896 "pcmpeqb %%xmm5,%%xmm5 \n"
2897 "psrlw $0x8,%%xmm5 \n"
2898 "sub %1,%2 \n"
2899 ".p2align 4 \n"
2900 "1: \n"
2901 "movdqa (%0),%%xmm0 \n"
2902 "movdqa 0x10(%0),%%xmm1 \n"
2903 "lea 0x20(%0),%0 \n"
2904 "psrlw $0x8,%%xmm0 \n"
2905 "psrlw $0x8,%%xmm1 \n"
2906 "packuswb %%xmm1,%%xmm0 \n"
2907 "movdqa %%xmm0,%%xmm1 \n"
2908 "pand %%xmm5,%%xmm0 \n"
2909 "packuswb %%xmm0,%%xmm0 \n"
2910 "psrlw $0x8,%%xmm1 \n"
2911 "packuswb %%xmm1,%%xmm1 \n"
2912 "movq %%xmm0,(%1) \n"
2913 "movq %%xmm1,(%1,%2) \n"
2914 "lea 0x8(%1),%1 \n"
2915 "sub $0x10,%3 \n"
2916 "jg 1b \n"
2917 : "+r"(src_yuy2), // %0
2918 "+r"(dst_u), // %1
2919 "+r"(dst_v), // %2
2920 "+r"(pix) // %3
2921 :
2922 : "memory", "cc"
2923#if defined(__SSE2__)
2924 , "xmm0", "xmm1", "xmm5"
2925#endif
2926 );
2927}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002928
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002929void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2930 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002931 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002932 "pcmpeqb %%xmm5,%%xmm5 \n"
2933 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002934 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002935 "1: \n"
2936 "movdqu (%0),%%xmm0 \n"
2937 "movdqu 0x10(%0),%%xmm1 \n"
2938 "lea 0x20(%0),%0 \n"
2939 "pand %%xmm5,%%xmm0 \n"
2940 "pand %%xmm5,%%xmm1 \n"
2941 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002942 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002943 "movdqu %%xmm0,(%1) \n"
2944 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002945 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002946 : "+r"(src_yuy2), // %0
2947 "+r"(dst_y), // %1
2948 "+r"(pix) // %2
2949 :
2950 : "memory", "cc"
2951#if defined(__SSE2__)
2952 , "xmm0", "xmm1", "xmm5"
2953#endif
2954 );
2955}
2956
2957void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2958 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002959 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002960 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002961 "pcmpeqb %%xmm5,%%xmm5 \n"
2962 "psrlw $0x8,%%xmm5 \n"
2963 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002964 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002965 "1: \n"
2966 "movdqu (%0),%%xmm0 \n"
2967 "movdqu 0x10(%0),%%xmm1 \n"
2968 "movdqu (%0,%4,1),%%xmm2 \n"
2969 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2970 "lea 0x20(%0),%0 \n"
2971 "pavgb %%xmm2,%%xmm0 \n"
2972 "pavgb %%xmm3,%%xmm1 \n"
2973 "psrlw $0x8,%%xmm0 \n"
2974 "psrlw $0x8,%%xmm1 \n"
2975 "packuswb %%xmm1,%%xmm0 \n"
2976 "movdqa %%xmm0,%%xmm1 \n"
2977 "pand %%xmm5,%%xmm0 \n"
2978 "packuswb %%xmm0,%%xmm0 \n"
2979 "psrlw $0x8,%%xmm1 \n"
2980 "packuswb %%xmm1,%%xmm1 \n"
2981 "movq %%xmm0,(%1) \n"
2982 "movq %%xmm1,(%1,%2) \n"
2983 "lea 0x8(%1),%1 \n"
2984 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002985 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002986 : "+r"(src_yuy2), // %0
2987 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002988 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002989 "+r"(pix) // %3
2990 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2991 : "memory", "cc"
2992#if defined(__SSE2__)
2993 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2994#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002995 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002996}
2997
fbarchard@google.comc704f782012-08-30 19:53:48 +00002998void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2999 uint8* dst_u, uint8* dst_v, int pix) {
3000 asm volatile (
3001 "pcmpeqb %%xmm5,%%xmm5 \n"
3002 "psrlw $0x8,%%xmm5 \n"
3003 "sub %1,%2 \n"
3004 ".p2align 4 \n"
3005 "1: \n"
3006 "movdqu (%0),%%xmm0 \n"
3007 "movdqu 0x10(%0),%%xmm1 \n"
3008 "lea 0x20(%0),%0 \n"
3009 "psrlw $0x8,%%xmm0 \n"
3010 "psrlw $0x8,%%xmm1 \n"
3011 "packuswb %%xmm1,%%xmm0 \n"
3012 "movdqa %%xmm0,%%xmm1 \n"
3013 "pand %%xmm5,%%xmm0 \n"
3014 "packuswb %%xmm0,%%xmm0 \n"
3015 "psrlw $0x8,%%xmm1 \n"
3016 "packuswb %%xmm1,%%xmm1 \n"
3017 "movq %%xmm0,(%1) \n"
3018 "movq %%xmm1,(%1,%2) \n"
3019 "lea 0x8(%1),%1 \n"
3020 "sub $0x10,%3 \n"
3021 "jg 1b \n"
3022 : "+r"(src_yuy2), // %0
3023 "+r"(dst_u), // %1
3024 "+r"(dst_v), // %2
3025 "+r"(pix) // %3
3026 :
3027 : "memory", "cc"
3028#if defined(__SSE2__)
3029 , "xmm0", "xmm1", "xmm5"
3030#endif
3031 );
3032}
3033
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003034void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003035 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003036 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003037 "1: \n"
3038 "movdqa (%0),%%xmm0 \n"
3039 "movdqa 0x10(%0),%%xmm1 \n"
3040 "lea 0x20(%0),%0 \n"
3041 "psrlw $0x8,%%xmm0 \n"
3042 "psrlw $0x8,%%xmm1 \n"
3043 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003044 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003045 "movdqa %%xmm0,(%1) \n"
3046 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003047 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003048 : "+r"(src_uyvy), // %0
3049 "+r"(dst_y), // %1
3050 "+r"(pix) // %2
3051 :
3052 : "memory", "cc"
3053#if defined(__SSE2__)
3054 , "xmm0", "xmm1"
3055#endif
3056 );
3057}
3058
3059void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003060 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003061 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003062 "pcmpeqb %%xmm5,%%xmm5 \n"
3063 "psrlw $0x8,%%xmm5 \n"
3064 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003065 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003066 "1: \n"
3067 "movdqa (%0),%%xmm0 \n"
3068 "movdqa 0x10(%0),%%xmm1 \n"
3069 "movdqa (%0,%4,1),%%xmm2 \n"
3070 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
3071 "lea 0x20(%0),%0 \n"
3072 "pavgb %%xmm2,%%xmm0 \n"
3073 "pavgb %%xmm3,%%xmm1 \n"
3074 "pand %%xmm5,%%xmm0 \n"
3075 "pand %%xmm5,%%xmm1 \n"
3076 "packuswb %%xmm1,%%xmm0 \n"
3077 "movdqa %%xmm0,%%xmm1 \n"
3078 "pand %%xmm5,%%xmm0 \n"
3079 "packuswb %%xmm0,%%xmm0 \n"
3080 "psrlw $0x8,%%xmm1 \n"
3081 "packuswb %%xmm1,%%xmm1 \n"
3082 "movq %%xmm0,(%1) \n"
3083 "movq %%xmm1,(%1,%2) \n"
3084 "lea 0x8(%1),%1 \n"
3085 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003086 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003087 : "+r"(src_uyvy), // %0
3088 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003089 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003090 "+r"(pix) // %3
3091 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3092 : "memory", "cc"
3093#if defined(__SSE2__)
3094 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3095#endif
3096 );
3097}
3098
fbarchard@google.comc704f782012-08-30 19:53:48 +00003099void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3100 uint8* dst_u, uint8* dst_v, int pix) {
3101 asm volatile (
3102 "pcmpeqb %%xmm5,%%xmm5 \n"
3103 "psrlw $0x8,%%xmm5 \n"
3104 "sub %1,%2 \n"
3105 ".p2align 4 \n"
3106 "1: \n"
3107 "movdqa (%0),%%xmm0 \n"
3108 "movdqa 0x10(%0),%%xmm1 \n"
3109 "lea 0x20(%0),%0 \n"
3110 "pand %%xmm5,%%xmm0 \n"
3111 "pand %%xmm5,%%xmm1 \n"
3112 "packuswb %%xmm1,%%xmm0 \n"
3113 "movdqa %%xmm0,%%xmm1 \n"
3114 "pand %%xmm5,%%xmm0 \n"
3115 "packuswb %%xmm0,%%xmm0 \n"
3116 "psrlw $0x8,%%xmm1 \n"
3117 "packuswb %%xmm1,%%xmm1 \n"
3118 "movq %%xmm0,(%1) \n"
3119 "movq %%xmm1,(%1,%2) \n"
3120 "lea 0x8(%1),%1 \n"
3121 "sub $0x10,%3 \n"
3122 "jg 1b \n"
3123 : "+r"(src_uyvy), // %0
3124 "+r"(dst_u), // %1
3125 "+r"(dst_v), // %2
3126 "+r"(pix) // %3
3127 :
3128 : "memory", "cc"
3129#if defined(__SSE2__)
3130 , "xmm0", "xmm1", "xmm5"
3131#endif
3132 );
3133}
3134
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003135void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3136 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003137 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003138 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003139 "1: \n"
3140 "movdqu (%0),%%xmm0 \n"
3141 "movdqu 0x10(%0),%%xmm1 \n"
3142 "lea 0x20(%0),%0 \n"
3143 "psrlw $0x8,%%xmm0 \n"
3144 "psrlw $0x8,%%xmm1 \n"
3145 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003146 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003147 "movdqu %%xmm0,(%1) \n"
3148 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003149 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003150 : "+r"(src_uyvy), // %0
3151 "+r"(dst_y), // %1
3152 "+r"(pix) // %2
3153 :
3154 : "memory", "cc"
3155#if defined(__SSE2__)
3156 , "xmm0", "xmm1"
3157#endif
3158 );
3159}
3160
// Extracts U and V from two vertically adjacent UYVY rows, averaging the rows.
//   src_uyvy    - first source row; the second row is read at
//                 src_uyvy + stride_uyvy. Loads are unaligned (movdqu).
//   stride_uyvy - byte offset from the first row to the second row.
//   dst_u/dst_v - outputs; 8 bytes are written to each per iteration.
//   pix         - pixel count; decremented by 16 per iteration.
// Each iteration consumes 32 source bytes per row. The loop body always runs
// at least once and continues while the decremented count is > 0, so the work
// is done in whole groups of 16 pixels.
3161void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003162 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003163 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask that keeps even-offset bytes).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003164 "pcmpeqb %%xmm5,%%xmm5 \n"
3165 "psrlw $0x8,%%xmm5 \n"
// %2 becomes the offset dst_v - dst_u so one pointer increment serves both.
3166 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003167 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003168 "1: \n"
3169 "movdqu (%0),%%xmm0 \n"
3170 "movdqu 0x10(%0),%%xmm1 \n"
3171 "movdqu (%0,%4,1),%%xmm2 \n"
3172 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3173 "lea 0x20(%0),%0 \n"
// Rounded byte-wise average of the two rows.
3174 "pavgb %%xmm2,%%xmm0 \n"
3175 "pavgb %%xmm3,%%xmm1 \n"
// Keep the even-offset bytes (source bytes 0, 2, 4, ...) and pack them.
3176 "pand %%xmm5,%%xmm0 \n"
3177 "pand %%xmm5,%%xmm1 \n"
3178 "packuswb %%xmm1,%%xmm0 \n"
// Of the packed bytes, even ones (source bytes 0, 4, 8, ...) go to dst_u and
// odd ones (source bytes 2, 6, 10, ...) go to dst_v.
3179 "movdqa %%xmm0,%%xmm1 \n"
3180 "pand %%xmm5,%%xmm0 \n"
3181 "packuswb %%xmm0,%%xmm0 \n"
3182 "psrlw $0x8,%%xmm1 \n"
3183 "packuswb %%xmm1,%%xmm1 \n"
3184 "movq %%xmm0,(%1) \n"
3185 "movq %%xmm1,(%1,%2) \n"
3186 "lea 0x8(%1),%1 \n"
3187 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003188 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003189 : "+r"(src_uyvy), // %0
3190 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003191 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003192 "+r"(pix) // %3
3193 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3194 : "memory", "cc"
3195#if defined(__SSE2__)
3196 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3197#endif
3198 );
3199}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003200
// Extracts U and V from a single UYVY row (no vertical averaging).
//   src_uyvy    - source row, read with unaligned loads (movdqu).
//   dst_u/dst_v - outputs; 8 bytes are written to each per iteration.
//   pix         - pixel count; decremented by 16 per iteration.
// Each iteration consumes 32 source bytes. The loop body always runs at least
// once and continues while the decremented count is > 0.
3201void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3202 uint8* dst_u, uint8* dst_v, int pix) {
3203 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask that keeps even-offset bytes).
3204 "pcmpeqb %%xmm5,%%xmm5 \n"
3205 "psrlw $0x8,%%xmm5 \n"
// %2 becomes the offset dst_v - dst_u so one pointer increment serves both.
3206 "sub %1,%2 \n"
3207 ".p2align 4 \n"
3208 "1: \n"
3209 "movdqu (%0),%%xmm0 \n"
3210 "movdqu 0x10(%0),%%xmm1 \n"
3211 "lea 0x20(%0),%0 \n"
// Keep the even-offset bytes (source bytes 0, 2, 4, ...) and pack them.
3212 "pand %%xmm5,%%xmm0 \n"
3213 "pand %%xmm5,%%xmm1 \n"
3214 "packuswb %%xmm1,%%xmm0 \n"
// Split the packed bytes: even ones to dst_u, odd ones to dst_v.
3215 "movdqa %%xmm0,%%xmm1 \n"
3216 "pand %%xmm5,%%xmm0 \n"
3217 "packuswb %%xmm0,%%xmm0 \n"
3218 "psrlw $0x8,%%xmm1 \n"
3219 "packuswb %%xmm1,%%xmm1 \n"
3220 "movq %%xmm0,(%1) \n"
3221 "movq %%xmm1,(%1,%2) \n"
3222 "lea 0x8(%1),%1 \n"
3223 "sub $0x10,%3 \n"
3224 "jg 1b \n"
3225 : "+r"(src_uyvy), // %0
3226 "+r"(dst_u), // %1
3227 "+r"(dst_v), // %2
3228 "+r"(pix) // %3
3229 :
3230 : "memory", "cc"
3231#if defined(__SSE2__)
3232 , "xmm0", "xmm1", "xmm5"
3233#endif
3234 );
3235}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003236#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003237
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003238#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003239// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
// Blends src_argb0 over src_argb1 into dst_argb for width pixels.
// Byte 3 of each 4-byte src_argb0 pixel is used as alpha (a). Per channel:
//   dst = saturate(src_argb0 + ((src_argb1 * (256 - a)) >> 8))
// src_argb0's alpha byte is forced to 255 before the saturating add, so the
// stored alpha byte is 255.
// Flow: a width of 1 goes straight to the 1 pixel loop and a width below 1
// exits. Otherwise single pixels are processed until dst_argb is 16-byte
// aligned, then groups of 4 (unaligned loads, aligned stores), then the
// remaining pixels one at a time.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003240void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3241 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003242 asm volatile (
// Constants: xmm7 = 0x0001 words, xmm6 = 0x00ff words,
// xmm5 = 0xff00 words, xmm4 = 0xff000000 dwords (alpha byte mask).
3243 "pcmpeqb %%xmm7,%%xmm7 \n"
3244 "psrlw $0xf,%%xmm7 \n"
3245 "pcmpeqb %%xmm6,%%xmm6 \n"
3246 "psrlw $0x8,%%xmm6 \n"
3247 "pcmpeqb %%xmm5,%%xmm5 \n"
3248 "psllw $0x8,%%xmm5 \n"
3249 "pcmpeqb %%xmm4,%%xmm4 \n"
3250 "pslld $0x18,%%xmm4 \n"
// From here on %3 holds (remaining pixels - 1).
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003251 "sub $0x1,%3 \n"
3252 "je 91f \n"
3253 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003254
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003255 // 1 pixel loop until destination pointer is aligned.
3256 "10: \n"
3257 "test $0xf,%2 \n"
3258 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003259 "movd (%0),%%xmm3 \n"
3260 "lea 0x4(%0),%0 \n"
3261 "movdqa %%xmm3,%%xmm0 \n"
// Invert alpha (255 - a), then replicate it into both words of the pixel.
3262 "pxor %%xmm4,%%xmm3 \n"
3263 "movd (%1),%%xmm2 \n"
3264 "psrlw $0x8,%%xmm3 \n"
3265 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3266 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
// xmm2 = even bytes of src_argb1 as words; xmm3 = 256 - a.
3267 "pand %%xmm6,%%xmm2 \n"
3268 "paddw %%xmm7,%%xmm3 \n"
3269 "pmullw %%xmm3,%%xmm2 \n"
// xmm1 = odd bytes of src_argb1 as words, scaled the same way.
3270 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003271 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003272 "psrlw $0x8,%%xmm1 \n"
3273 "por %%xmm4,%%xmm0 \n"
3274 "pmullw %%xmm3,%%xmm1 \n"
3275 "psrlw $0x8,%%xmm2 \n"
3276 "paddusb %%xmm2,%%xmm0 \n"
3277 "pand %%xmm5,%%xmm1 \n"
3278 "paddusb %%xmm1,%%xmm0 \n"
3279 "sub $0x1,%3 \n"
3280 "movd %%xmm0,(%2) \n"
3281 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003282 "jge 10b \n"
3283
3284 "19: \n"
// Rebias the counter to (remaining pixels - 4) for the 4 pixel loop.
fbarchard@google.comee220882012-06-14 00:07:56 +00003285 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003286 "jl 49f \n"
3287
fbarchard@google.com794fe122012-06-15 01:05:01 +00003288 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003289 ".p2align 2 \n"
3290 "41: \n"
3291 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003292 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003293 "movdqa %%xmm3,%%xmm0 \n"
3294 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003295 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003296 "psrlw $0x8,%%xmm3 \n"
3297 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3298 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003299 "pand %%xmm6,%%xmm2 \n"
3300 "paddw %%xmm7,%%xmm3 \n"
3301 "pmullw %%xmm3,%%xmm2 \n"
3302 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003303 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003304 "psrlw $0x8,%%xmm1 \n"
3305 "por %%xmm4,%%xmm0 \n"
3306 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003307 "psrlw $0x8,%%xmm2 \n"
3308 "paddusb %%xmm2,%%xmm0 \n"
3309 "pand %%xmm5,%%xmm1 \n"
3310 "paddusb %%xmm1,%%xmm0 \n"
3311 "sub $0x4,%3 \n"
3312 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003313 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003314 "jge 41b \n"
3315
3316 "49: \n"
// Rebias the counter back to (remaining pixels - 1) for the tail loop.
3317 "add $0x3,%3 \n"
3318 "jl 99f \n"
3319
fbarchard@google.com794fe122012-06-15 01:05:01 +00003320 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003321 "91: \n"
3322 "movd (%0),%%xmm3 \n"
3323 "lea 0x4(%0),%0 \n"
3324 "movdqa %%xmm3,%%xmm0 \n"
3325 "pxor %%xmm4,%%xmm3 \n"
3326 "movd (%1),%%xmm2 \n"
3327 "psrlw $0x8,%%xmm3 \n"
3328 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3329 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3330 "pand %%xmm6,%%xmm2 \n"
3331 "paddw %%xmm7,%%xmm3 \n"
3332 "pmullw %%xmm3,%%xmm2 \n"
3333 "movd (%1),%%xmm1 \n"
3334 "lea 0x4(%1),%1 \n"
3335 "psrlw $0x8,%%xmm1 \n"
3336 "por %%xmm4,%%xmm0 \n"
3337 "pmullw %%xmm3,%%xmm1 \n"
3338 "psrlw $0x8,%%xmm2 \n"
3339 "paddusb %%xmm2,%%xmm0 \n"
3340 "pand %%xmm5,%%xmm1 \n"
3341 "paddusb %%xmm1,%%xmm0 \n"
3342 "sub $0x1,%3 \n"
3343 "movd %%xmm0,(%2) \n"
3344 "lea 0x4(%2),%2 \n"
3345 "jge 91b \n"
3346 "99: \n"
3347 : "+r"(src_argb0), // %0
3348 "+r"(src_argb1), // %1
3349 "+r"(dst_argb), // %2
3350 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003351 :
3352 : "memory", "cc"
3353#if defined(__SSE2__)
3354 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3355#endif
3356 );
3357}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003358#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003359
fbarchard@google.com96af8702012-04-06 18:22:27 +00003360#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003361// Shuffle table for isolating alpha.
// Used as a pshufb control: for each 4-byte pixel, byte 3 is copied into the
// low byte of both of the pixel's 16-bit words; the 0x80 entries zero the
// high bytes.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003362CONST uvec8 kShuffleAlpha = {
3363 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3364 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3365};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003366
3367// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
3368// Uses the kShuffleAlpha table to place each pixel's alpha in its words.
3369
3370// Same as SSE2, but replaces
3371// psrlw xmm3, 8 // alpha
3372// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3373// pshuflw xmm3, xmm3,0F5h
3374// with..
3375// pshufb xmm3, kShuffleAlpha // alpha
3376
// Blends src_argb0 over src_argb1 into dst_argb for width pixels.
// Byte 3 of each 4-byte src_argb0 pixel is used as alpha (a). Per channel:
//   dst = saturate(src_argb0 + ((src_argb1 * (256 - a)) >> 8))
// and the stored alpha byte is 255.
// Flow: single pixels until dst_argb is 16-byte aligned, then groups of 4
// (loop 40 uses aligned loads when both sources are 16-byte aligned, loop 41
// uses unaligned loads otherwise), then the remaining pixels one at a time.
3377void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3378 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003379 asm volatile (
// Constants: xmm7 = 0x0001 words, xmm6 = 0x00ff words,
// xmm5 = 0xff00 words, xmm4 = 0xff000000 dwords (alpha byte mask).
3380 "pcmpeqb %%xmm7,%%xmm7 \n"
3381 "psrlw $0xf,%%xmm7 \n"
3382 "pcmpeqb %%xmm6,%%xmm6 \n"
3383 "psrlw $0x8,%%xmm6 \n"
3384 "pcmpeqb %%xmm5,%%xmm5 \n"
3385 "psllw $0x8,%%xmm5 \n"
3386 "pcmpeqb %%xmm4,%%xmm4 \n"
3387 "pslld $0x18,%%xmm4 \n"
// From here on %3 holds (remaining pixels - 1).
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003388 "sub $0x1,%3 \n"
3389 "je 91f \n"
3390 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003391
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003392 // 1 pixel loop until destination pointer is aligned.
3393 "10: \n"
3394 "test $0xf,%2 \n"
3395 "je 19f \n"
3396 "movd (%0),%%xmm3 \n"
3397 "lea 0x4(%0),%0 \n"
3398 "movdqa %%xmm3,%%xmm0 \n"
// Invert alpha (255 - a) and replicate it into the pixel's words.
3399 "pxor %%xmm4,%%xmm3 \n"
3400 "movd (%1),%%xmm2 \n"
3401 "pshufb %4,%%xmm3 \n"
3402 "pand %%xmm6,%%xmm2 \n"
3403 "paddw %%xmm7,%%xmm3 \n"
3404 "pmullw %%xmm3,%%xmm2 \n"
3405 "movd (%1),%%xmm1 \n"
3406 "lea 0x4(%1),%1 \n"
3407 "psrlw $0x8,%%xmm1 \n"
3408 "por %%xmm4,%%xmm0 \n"
3409 "pmullw %%xmm3,%%xmm1 \n"
3410 "psrlw $0x8,%%xmm2 \n"
3411 "paddusb %%xmm2,%%xmm0 \n"
3412 "pand %%xmm5,%%xmm1 \n"
3413 "paddusb %%xmm1,%%xmm0 \n"
3414 "sub $0x1,%3 \n"
3415 "movd %%xmm0,(%2) \n"
3416 "lea 0x4(%2),%2 \n"
3417 "jge 10b \n"
3418
3419 "19: \n"
// Rebias the counter to (remaining pixels - 4), then pick the aligned or
// unaligned 4 pixel loop based on the two source pointers.
fbarchard@google.comee220882012-06-14 00:07:56 +00003420 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003421 "jl 49f \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003422 "test $0xf,%0 \n"
3423 "jne 41f \n"
3424 "test $0xf,%1 \n"
3425 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003426
fbarchard@google.com794fe122012-06-15 01:05:01 +00003427 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003428 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003429 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003430 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003431 "lea 0x10(%0),%0 \n"
3432 "movdqa %%xmm3,%%xmm0 \n"
3433 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003434 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003435 "pshufb %4,%%xmm3 \n"
3436 "pand %%xmm6,%%xmm2 \n"
3437 "paddw %%xmm7,%%xmm3 \n"
3438 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003439 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003440 "lea 0x10(%1),%1 \n"
3441 "psrlw $0x8,%%xmm1 \n"
3442 "por %%xmm4,%%xmm0 \n"
3443 "pmullw %%xmm3,%%xmm1 \n"
3444 "psrlw $0x8,%%xmm2 \n"
3445 "paddusb %%xmm2,%%xmm0 \n"
3446 "pand %%xmm5,%%xmm1 \n"
3447 "paddusb %%xmm1,%%xmm0 \n"
3448 "sub $0x4,%3 \n"
3449 "movdqa %%xmm0,(%2) \n"
3450 "lea 0x10(%2),%2 \n"
3451 "jge 40b \n"
3452 "jmp 49f \n"
3453
3454 // 4 pixel unaligned loop.
3455 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003456 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003457 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003458 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003459 "movdqa %%xmm3,%%xmm0 \n"
3460 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003461 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003462 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003463 "pand %%xmm6,%%xmm2 \n"
3464 "paddw %%xmm7,%%xmm3 \n"
3465 "pmullw %%xmm3,%%xmm2 \n"
3466 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003467 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003468 "psrlw $0x8,%%xmm1 \n"
3469 "por %%xmm4,%%xmm0 \n"
3470 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003471 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003472 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003473 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003474 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003475 "sub $0x4,%3 \n"
3476 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003477 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003478 "jge 41b \n"
3479
3480 "49: \n"
// Rebias the counter back to (remaining pixels - 1) for the tail loop.
3481 "add $0x3,%3 \n"
3482 "jl 99f \n"
3483
fbarchard@google.com794fe122012-06-15 01:05:01 +00003484 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003485 "91: \n"
3486 "movd (%0),%%xmm3 \n"
3487 "lea 0x4(%0),%0 \n"
3488 "movdqa %%xmm3,%%xmm0 \n"
3489 "pxor %%xmm4,%%xmm3 \n"
3490 "movd (%1),%%xmm2 \n"
3491 "pshufb %4,%%xmm3 \n"
3492 "pand %%xmm6,%%xmm2 \n"
3493 "paddw %%xmm7,%%xmm3 \n"
3494 "pmullw %%xmm3,%%xmm2 \n"
3495 "movd (%1),%%xmm1 \n"
3496 "lea 0x4(%1),%1 \n"
3497 "psrlw $0x8,%%xmm1 \n"
3498 "por %%xmm4,%%xmm0 \n"
3499 "pmullw %%xmm3,%%xmm1 \n"
3500 "psrlw $0x8,%%xmm2 \n"
3501 "paddusb %%xmm2,%%xmm0 \n"
3502 "pand %%xmm5,%%xmm1 \n"
3503 "paddusb %%xmm1,%%xmm0 \n"
3504 "sub $0x1,%3 \n"
3505 "movd %%xmm0,(%2) \n"
3506 "lea 0x4(%2),%2 \n"
3507 "jge 91b \n"
3508 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003509 : "+r"(src_argb0), // %0
3510 "+r"(src_argb1), // %1
3511 "+r"(dst_argb), // %2
3512 "+r"(width) // %3
3513 : "m"(kShuffleAlpha) // %4
3514 : "memory", "cc"
3515#if defined(__SSE2__)
3516 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3517#endif
3518 );
3519}
3520#endif // HAS_ARGBBLENDROW_SSSE3
3521
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003522#ifdef HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003523// Attenuate 4 pixels at a time.
3524// aligned to 16 bytes
// Multiplies the first three bytes of every 4-byte pixel by the pixel's own
// byte 3 (alpha) and copies alpha through unchanged. Both the channel c and
// alpha a are widened to c * 0x101 and a * 0x101; the stored value is
// ((c * 0x101) * (a * 0x101)) >> 24.
// src_argb and dst_argb are accessed with movdqa, so both must be 16-byte
// aligned. width is decremented by 4 per iteration; the loop body always runs
// at least once.
3525void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3526 asm volatile (
// %1 becomes the offset dst_argb - src_argb.
3527 "sub %0,%1 \n"
// xmm4 = 0xff000000 dwords (alpha mask), xmm5 = 0x00ffffff dwords.
3528 "pcmpeqb %%xmm4,%%xmm4 \n"
3529 "pslld $0x18,%%xmm4 \n"
3530 "pcmpeqb %%xmm5,%%xmm5 \n"
3531 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003532
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003533 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003534 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003535 "1: \n"
// Low two pixels: duplicate bytes into words, broadcast each pixel's alpha
// word across its four words, then take the high half of the product.
3536 "movdqa (%0),%%xmm0 \n"
3537 "punpcklbw %%xmm0,%%xmm0 \n"
3538 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3539 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3540 "pmulhuw %%xmm2,%%xmm0 \n"
// High two pixels, same steps.
3541 "movdqa (%0),%%xmm1 \n"
3542 "punpckhbw %%xmm1,%%xmm1 \n"
3543 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3544 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3545 "pmulhuw %%xmm2,%%xmm1 \n"
// xmm2 = original alpha bytes; merged back over the scaled colour bytes.
fbarchard@google.com810cd912012-04-20 20:15:27 +00003546 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003547 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003548 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003549 "psrlw $0x8,%%xmm1 \n"
3550 "packuswb %%xmm1,%%xmm0 \n"
3551 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003552 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003553 "sub $0x4,%2 \n"
3554 "movdqa %%xmm0,(%0,%1,1) \n"
3555 "lea 0x10(%0),%0 \n"
3556 "jg 1b \n"
3557 : "+r"(src_argb), // %0
3558 "+r"(dst_argb), // %1
3559 "+r"(width) // %2
3560 :
3561 : "memory", "cc"
3562#if defined(__SSE2__)
// NOTE(review): xmm3 is listed as clobbered but is not used by this routine.
3563 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3564#endif
3565 );
3566}
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003567#endif // HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003568
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003569#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003570// Shuffle table duplicating alpha
// pshufb controls. kShuffleAlpha0 handles pixels 0 and 1 (alpha at bytes 3
// and 7): the first three words of each pixel receive the alpha byte in both
// halves and the fourth word is zeroed by the 128 entries.
3571CONST uvec8 kShuffleAlpha0 = {
3572 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3573};
// kShuffleAlpha1 is the same layout for pixels 2 and 3 (alpha at bytes 11
// and 15).
3574CONST uvec8 kShuffleAlpha1 = {
3575 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3576 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3577};
3578// Attenuate 4 pixels at a time.
3579// aligned to 16 bytes
// Multiplies the first three bytes of every 4-byte pixel by the pixel's own
// byte 3 (alpha) and copies alpha through unchanged. The alpha words come
// from pshufb with kShuffleAlpha0 / kShuffleAlpha1; the stored colour value
// is ((c * 0x101) * (a * 0x101)) >> 24.
// src_argb and dst_argb are accessed with movdqa, so both must be 16-byte
// aligned. width is decremented by 4 per iteration; the loop body always runs
// at least once.
3580void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3581 asm volatile (
// %1 becomes the offset dst_argb - src_argb.
3582 "sub %0,%1 \n"
// xmm3 = 0xff000000 dwords (alpha mask); xmm4/xmm5 = shuffle controls.
3583 "pcmpeqb %%xmm3,%%xmm3 \n"
3584 "pslld $0x18,%%xmm3 \n"
3585 "movdqa %3,%%xmm4 \n"
3586 "movdqa %4,%%xmm5 \n"
3587
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003588 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003589 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003590 "1: \n"
// Low two pixels: xmm0 = alpha words, xmm1 = channel bytes duplicated into
// words; keep the high half of the product.
3591 "movdqa (%0),%%xmm0 \n"
3592 "pshufb %%xmm4,%%xmm0 \n"
3593 "movdqa (%0),%%xmm1 \n"
3594 "punpcklbw %%xmm1,%%xmm1 \n"
3595 "pmulhuw %%xmm1,%%xmm0 \n"
// High two pixels, same steps.
3596 "movdqa (%0),%%xmm1 \n"
3597 "pshufb %%xmm5,%%xmm1 \n"
3598 "movdqa (%0),%%xmm2 \n"
3599 "punpckhbw %%xmm2,%%xmm2 \n"
3600 "pmulhuw %%xmm2,%%xmm1 \n"
// The alpha lane of the product is zero (shuffle zeroed it), so the original
// alpha bytes can simply be OR-ed back in.
3601 "movdqa (%0),%%xmm2 \n"
3602 "pand %%xmm3,%%xmm2 \n"
3603 "psrlw $0x8,%%xmm0 \n"
3604 "psrlw $0x8,%%xmm1 \n"
3605 "packuswb %%xmm1,%%xmm0 \n"
3606 "por %%xmm2,%%xmm0 \n"
3607 "sub $0x4,%2 \n"
3608 "movdqa %%xmm0,(%0,%1,1) \n"
3609 "lea 0x10(%0),%0 \n"
3610 "jg 1b \n"
3611 : "+r"(src_argb), // %0
3612 "+r"(dst_argb), // %1
3613 "+r"(width) // %2
3614 : "m"(kShuffleAlpha0), // %3
3615 "m"(kShuffleAlpha1) // %4
3616 : "memory", "cc"
3617#if defined(__SSE2__)
3618 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3619#endif
3620 );
3621}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003622#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003623
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003624#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003625// Unattenuate 4 pixels at a time.
3626// aligned to 16 bytes
// For each pixel, byte 3 (alpha) indexes fixed_invtbl8, whose entries are
// read as 4 bytes each. The low 16 bits of the entry multiply the first three
// channels (duplicated to c * 0x101); the high half of each product is kept
// and packed to bytes with unsigned saturation. The original alpha byte is
// copied through.
// NOTE(review): fixed_invtbl8 is defined outside this chunk; presumably a
// fixed-point reciprocal-of-alpha table - confirm against its definition.
// src_argb and dst_argb are accessed with movdqa, so both must be 16-byte
// aligned. width is decremented by 4 per iteration; the loop body always runs
// at least once.
3627void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3628 int width) {
// Scratch register that receives each pixel's alpha byte (operand %3).
3629 uintptr_t alpha = 0;
3630 asm volatile (
// %1 becomes the offset dst_argb - src_argb.
3631 "sub %0,%1 \n"
// xmm4 = 0xff000000 dwords (alpha mask).
3632 "pcmpeqb %%xmm4,%%xmm4 \n"
3633 "pslld $0x18,%%xmm4 \n"
3634
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003635 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003636 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003637 "1: \n"
// Pixels 0 and 1: look up the table entry for each alpha byte.
3638 "movdqa (%0),%%xmm0 \n"
3639 "movzb 0x3(%0),%3 \n"
3640 "punpcklbw %%xmm0,%%xmm0 \n"
3641 "movd 0x0(%4,%3,4),%%xmm2 \n"
3642 "movzb 0x7(%0),%3 \n"
3643 "movd 0x0(%4,%3,4),%%xmm3 \n"
// pshuflw 0xc0 -> words {w0, w0, w0, w3}: the entry's low word for the three
// colour lanes and word 3 (zero after movd) for the alpha lane.
3644 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3645 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3646 "movlhps %%xmm3,%%xmm2 \n"
3647 "pmulhuw %%xmm2,%%xmm0 \n"
// Pixels 2 and 3, same steps.
3648 "movdqa (%0),%%xmm1 \n"
3649 "movzb 0xb(%0),%3 \n"
3650 "punpckhbw %%xmm1,%%xmm1 \n"
3651 "movd 0x0(%4,%3,4),%%xmm2 \n"
3652 "movzb 0xf(%0),%3 \n"
3653 "movd 0x0(%4,%3,4),%%xmm3 \n"
3654 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3655 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3656 "movlhps %%xmm3,%%xmm2 \n"
3657 "pmulhuw %%xmm2,%%xmm1 \n"
// Restore the original alpha bytes over the (zero) alpha lanes.
3658 "movdqa (%0),%%xmm2 \n"
3659 "pand %%xmm4,%%xmm2 \n"
3660 "packuswb %%xmm1,%%xmm0 \n"
3661 "por %%xmm2,%%xmm0 \n"
3662 "sub $0x4,%2 \n"
3663 "movdqa %%xmm0,(%0,%1,1) \n"
3664 "lea 0x10(%0),%0 \n"
3665 "jg 1b \n"
3666 : "+r"(src_argb), // %0
3667 "+r"(dst_argb), // %1
3668 "+r"(width), // %2
3669 "+r"(alpha) // %3
3670 : "r"(fixed_invtbl8) // %4
3671 : "memory", "cc"
3672#if defined(__SSE2__)
// NOTE(review): xmm5 is listed as clobbered but is not used by this routine.
3673 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3674#endif
3675 );
3676}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003677#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003678
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003679#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003680// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// Weights are in 1/128 units (14 + 76 + 38 = 128); byte 3 of each pixel has
// weight 0.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003681CONST vec8 kARGBToGray = {
3682 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3683};
3684
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003685// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// For each pixel, gray = (byte0 * 14 + byte1 * 76 + byte2 * 38) >> 7 is
// written to the first three bytes and the original byte 3 (alpha) is kept.
// src_argb and dst_argb are accessed with movdqa, so both must be 16-byte
// aligned. width is decremented by 8 per iteration; the loop body always runs
// at least once.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003686void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003687 asm volatile (
// xmm4 = gray weights; %1 becomes the offset dst_argb - src_argb.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003688 "movdqa %3,%%xmm4 \n"
3689 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003690
3691 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003692 ".p2align 4 \n"
3693 "1: \n"
// Weighted sum per pixel -> 8 gray bytes in the low half of xmm0.
3694 "movdqa (%0),%%xmm0 \n"
3695 "movdqa 0x10(%0),%%xmm1 \n"
3696 "pmaddubsw %%xmm4,%%xmm0 \n"
3697 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003698 "phaddw %%xmm1,%%xmm0 \n"
3699 "psrlw $0x7,%%xmm0 \n"
3700 "packuswb %%xmm0,%%xmm0 \n"
// Extract the 8 alpha bytes into the low half of xmm2.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003701 "movdqa (%0),%%xmm2 \n"
3702 "movdqa 0x10(%0),%%xmm3 \n"
3703 "psrld $0x18,%%xmm2 \n"
3704 "psrld $0x18,%%xmm3 \n"
3705 "packuswb %%xmm3,%%xmm2 \n"
3706 "packuswb %%xmm2,%%xmm2 \n"
// Interleave into (gray, gray, gray, alpha) per pixel.
3707 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003708 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003709 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003710 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003711 "punpcklwd %%xmm3,%%xmm0 \n"
3712 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003713 "sub $0x8,%2 \n"
3714 "movdqa %%xmm0,(%0,%1,1) \n"
3715 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003716 "lea 0x20(%0),%0 \n"
3717 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003718 : "+r"(src_argb), // %0
3719 "+r"(dst_argb), // %1
3720 "+r"(width) // %2
3721 : "m"(kARGBToGray) // %3
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003722 : "memory", "cc"
3723#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00003724 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003725#endif
3726 );
3727}
3728#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003729
3730#ifdef HAS_ARGBSEPIAROW_SSSE3
3731// b = (r * 35 + g * 68 + b * 17) >> 7
3732// g = (r * 45 + g * 88 + b * 22) >> 7
3733// r = (r * 50 + g * 98 + b * 24) >> 7
3734// Constant for ARGB color to sepia tone
// Each table holds the weights for one output channel, listed in the pixel's
// memory byte order (b, g, r, then 0 for byte 3) and repeated for 4 pixels.
3735CONST vec8 kARGBToSepiaB = {
3736 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3737};
3738
3739CONST vec8 kARGBToSepiaG = {
3740 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3741};
3742
3743CONST vec8 kARGBToSepiaR = {
3744 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3745};
3746
fbarchard@google.come442dc42012-06-18 17:37:09 +00003747// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place on dst_argb. Each output channel is the weighted sum of the
// pixel's first three bytes (weights from kARGBToSepiaB/G/R), shifted right
// by 7 and packed to a byte with unsigned saturation; byte 3 (alpha) is kept.
// dst_argb is accessed with movdqa, so it must be 16-byte aligned. width is
// decremented by 8 per iteration; the loop body always runs at least once.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003748void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3749 asm volatile (
// xmm2/xmm3/xmm4 = weights for the first, second and third output byte.
3750 "movdqa %2,%%xmm2 \n"
3751 "movdqa %3,%%xmm3 \n"
3752 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003753
3754 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003755 ".p2align 4 \n"
3756 "1: \n"
// First output byte of each pixel -> low half of xmm0.
3757 "movdqa (%0),%%xmm0 \n"
3758 "movdqa 0x10(%0),%%xmm6 \n"
3759 "pmaddubsw %%xmm2,%%xmm0 \n"
3760 "pmaddubsw %%xmm2,%%xmm6 \n"
3761 "phaddw %%xmm6,%%xmm0 \n"
3762 "psrlw $0x7,%%xmm0 \n"
3763 "packuswb %%xmm0,%%xmm0 \n"
// Second output byte -> xmm5, then interleaved with the first in xmm0.
3764 "movdqa (%0),%%xmm5 \n"
3765 "movdqa 0x10(%0),%%xmm1 \n"
3766 "pmaddubsw %%xmm3,%%xmm5 \n"
3767 "pmaddubsw %%xmm3,%%xmm1 \n"
3768 "phaddw %%xmm1,%%xmm5 \n"
3769 "psrlw $0x7,%%xmm5 \n"
3770 "packuswb %%xmm5,%%xmm5 \n"
3771 "punpcklbw %%xmm5,%%xmm0 \n"
// Third output byte -> xmm5.
3772 "movdqa (%0),%%xmm5 \n"
3773 "movdqa 0x10(%0),%%xmm1 \n"
3774 "pmaddubsw %%xmm4,%%xmm5 \n"
3775 "pmaddubsw %%xmm4,%%xmm1 \n"
3776 "phaddw %%xmm1,%%xmm5 \n"
3777 "psrlw $0x7,%%xmm5 \n"
3778 "packuswb %%xmm5,%%xmm5 \n"
// Original alpha bytes -> xmm6, interleaved with the third byte in xmm5.
3779 "movdqa (%0),%%xmm6 \n"
3780 "movdqa 0x10(%0),%%xmm1 \n"
3781 "psrld $0x18,%%xmm6 \n"
3782 "psrld $0x18,%%xmm1 \n"
3783 "packuswb %%xmm1,%%xmm6 \n"
3784 "packuswb %%xmm6,%%xmm6 \n"
3785 "punpcklbw %%xmm6,%%xmm5 \n"
// Interleave the two byte pairs into whole pixels and store in place.
3786 "movdqa %%xmm0,%%xmm1 \n"
3787 "punpcklwd %%xmm5,%%xmm0 \n"
3788 "punpckhwd %%xmm5,%%xmm1 \n"
3789 "sub $0x8,%1 \n"
3790 "movdqa %%xmm0,(%0) \n"
3791 "movdqa %%xmm1,0x10(%0) \n"
3792 "lea 0x20(%0),%0 \n"
3793 "jg 1b \n"
3794 : "+r"(dst_argb), // %0
3795 "+r"(width) // %1
3796 : "m"(kARGBToSepiaB), // %2
3797 "m"(kARGBToSepiaG), // %3
3798 "m"(kARGBToSepiaR) // %4
3799 : "memory", "cc"
3800#if defined(__SSE2__)
3801 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3802#endif
3803 );
3804}
3805#endif // HAS_ARGBSEPIAROW_SSSE3
3806
fbarchard@google.come442dc42012-06-18 17:37:09 +00003807#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3808// Transform 8 ARGB pixels (32 bytes) with color matrix.
3809// Same as Sepia except matrix is provided.
// Works in place on dst_argb. matrix_argb supplies three rows of four signed
// byte weights (bytes 0-3, 4-7 and 8-11); row k produces output byte k of
// each pixel as the weighted sum of the pixel's four input bytes, with
// saturating adds, an arithmetic shift right by 7 and an unsigned-saturating
// pack to a byte. Byte 3 (alpha) is copied from the input.
// dst_argb is accessed with movdqa, so it must be 16-byte aligned. width is
// decremented by 8 per iteration; the loop body always runs at least once.
3810void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3811 int width) {
3812 asm volatile (
// Load each 4-byte matrix row and broadcast it to all four dwords.
3813 "movd (%2),%%xmm2 \n"
3814 "movd 0x4(%2),%%xmm3 \n"
3815 "movd 0x8(%2),%%xmm4 \n"
3816 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3817 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3818 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003819
3820 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003821 ".p2align 4 \n"
3822 "1: \n"
// Output bytes 0 (xmm0) and 1 (xmm5) for 8 pixels.
3823 "movdqa (%0),%%xmm0 \n"
3824 "movdqa 0x10(%0),%%xmm6 \n"
3825 "pmaddubsw %%xmm2,%%xmm0 \n"
3826 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003827 "movdqa (%0),%%xmm5 \n"
3828 "movdqa 0x10(%0),%%xmm1 \n"
3829 "pmaddubsw %%xmm3,%%xmm5 \n"
3830 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003831 "phaddsw %%xmm6,%%xmm0 \n"
3832 "phaddsw %%xmm1,%%xmm5 \n"
3833 "psraw $0x7,%%xmm0 \n"
3834 "psraw $0x7,%%xmm5 \n"
3835 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003836 "packuswb %%xmm5,%%xmm5 \n"
3837 "punpcklbw %%xmm5,%%xmm0 \n"
// Output byte 2 -> xmm5.
3838 "movdqa (%0),%%xmm5 \n"
3839 "movdqa 0x10(%0),%%xmm1 \n"
3840 "pmaddubsw %%xmm4,%%xmm5 \n"
3841 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003842 "phaddsw %%xmm1,%%xmm5 \n"
3843 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003844 "packuswb %%xmm5,%%xmm5 \n"
// Original alpha bytes -> xmm6.
3845 "movdqa (%0),%%xmm6 \n"
3846 "movdqa 0x10(%0),%%xmm1 \n"
3847 "psrld $0x18,%%xmm6 \n"
3848 "psrld $0x18,%%xmm1 \n"
3849 "packuswb %%xmm1,%%xmm6 \n"
3850 "packuswb %%xmm6,%%xmm6 \n"
// Interleave into whole pixels and store in place.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003851 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003852 "punpcklbw %%xmm6,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003853 "punpcklwd %%xmm5,%%xmm0 \n"
3854 "punpckhwd %%xmm5,%%xmm1 \n"
3855 "sub $0x8,%1 \n"
3856 "movdqa %%xmm0,(%0) \n"
3857 "movdqa %%xmm1,0x10(%0) \n"
3858 "lea 0x20(%0),%0 \n"
3859 "jg 1b \n"
3860 : "+r"(dst_argb), // %0
3861 "+r"(width) // %1
3862 : "r"(matrix_argb) // %2
3863 : "memory", "cc"
3864#if defined(__SSE2__)
3865 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3866#endif
3867 );
3868}
3869#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3870
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003871#ifdef HAS_ARGBQUANTIZEROW_SSE2
3872// Quantize 4 ARGB pixels (16 bytes).
3873// aligned to 16 bytes
// Works in place on dst_argb. For the first three bytes of each pixel:
//   c = saturate_u8(((c * scale) >> 16) * interval_size + interval_offset)
// using the low 16 bits of each parameter; byte 3 (alpha) is OR-ed back from
// the input.
// NOTE(review): the shuffles place the upper 16 bits of scale, interval_size
// and interval_offset in the alpha lane, so that lane is only zero before the
// OR when the parameters fit in 16 bits - confirm callers guarantee this.
// dst_argb is accessed with movdqa, so it must be 16-byte aligned. width is
// decremented by 4 per iteration; the loop body always runs at least once.
3874void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3875 int interval_offset, int width) {
3876 asm volatile (
3877 "movd %2,%%xmm2 \n"
3878 "movd %3,%%xmm3 \n"
3879 "movd %4,%%xmm4 \n"
// pshuflw 0x40 -> words {w0, w0, w0, w1}; pshufd 0x44 copies that to the
// upper 64 bits, giving the same four words for each of two pixels.
3880 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3881 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3882 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3883 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3884 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3885 "pshufd $0x44,%%xmm4,%%xmm4 \n"
// xmm5 = 0 (for byte->word unpack), xmm6 = 0xff000000 dwords (alpha mask).
3886 "pxor %%xmm5,%%xmm5 \n"
3887 "pcmpeqb %%xmm6,%%xmm6 \n"
3888 "pslld $0x18,%%xmm6 \n"
3889
3890 // 4 pixel loop.
3891 ".p2align 2 \n"
3892 "1: \n"
3893 "movdqa (%0),%%xmm0 \n"
3894 "punpcklbw %%xmm5,%%xmm0 \n"
3895 "pmulhuw %%xmm2,%%xmm0 \n"
3896 "movdqa (%0),%%xmm1 \n"
3897 "punpckhbw %%xmm5,%%xmm1 \n"
3898 "pmulhuw %%xmm2,%%xmm1 \n"
3899 "pmullw %%xmm3,%%xmm0 \n"
// xmm7 = original alpha bytes.
3900 "movdqa (%0),%%xmm7 \n"
3901 "pmullw %%xmm3,%%xmm1 \n"
3902 "pand %%xmm6,%%xmm7 \n"
3903 "paddw %%xmm4,%%xmm0 \n"
3904 "paddw %%xmm4,%%xmm1 \n"
3905 "packuswb %%xmm1,%%xmm0 \n"
3906 "por %%xmm7,%%xmm0 \n"
3907 "sub $0x4,%1 \n"
3908 "movdqa %%xmm0,(%0) \n"
3909 "lea 0x10(%0),%0 \n"
3910 "jg 1b \n"
3911 : "+r"(dst_argb), // %0
3912 "+r"(width) // %1
3913 : "r"(scale), // %2
3914 "r"(interval_size), // %3
3915 "r"(interval_offset) // %4
3916 : "memory", "cc"
3917#if defined(__SSE2__)
3918 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3919#endif
3920 );
3921}
3922#endif // HAS_ARGBQUANTIZEROW_SSE2
3923
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003924#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
3925// Creates a table of cumulative sums where each value is a sum of all values
3926// above and to the left of the value, inclusive of the value.
//   row             - source bytes, 4 per pixel; read with movdqu/movd.
//   cumsum          - output, four int32 values per pixel.
//   previous_cumsum - the row of sums added to the running sum of this row;
//                     read at the same element positions as cumsum.
//   width           - number of pixels.
// xmm0 holds the running per-channel sum of this row. For each pixel the
// running sum is updated and running sum + previous_cumsum is stored.
// The 4 pixel loop is used only when cumsum is 16-byte aligned; otherwise
// every pixel goes through the unaligned 1 pixel loop.
// NOTE(review): the 4 pixel loop also reads previous_cumsum with movdqa, but
// only cumsum's alignment is tested - confirm callers keep both aligned.
3927void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
fbarchard@google.com133adc42012-06-12 05:46:49 +00003928 const int32* previous_cumsum, int width) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003929 asm volatile (
// %2 becomes the offset previous_cumsum - cumsum.
3930 "sub %1,%2 \n"
// xmm0 = running sum, xmm1 = zero (for unpacking bytes to dwords).
3931 "pxor %%xmm0,%%xmm0 \n"
3932 "pxor %%xmm1,%%xmm1 \n"
3933 "sub $0x4,%3 \n"
3934 "jl 49f \n"
3935 "test $0xf,%1 \n"
3936 "jne 49f \n"
3937
3938 // 4 pixel loop.
3939 ".p2align 2 \n"
3940 "40: \n"
// Widen 16 source bytes into four vectors of four int32 (xmm2..xmm5).
3941 "movdqu (%0),%%xmm2 \n"
3942 "lea 0x10(%0),%0 \n"
3943 "movdqa %%xmm2,%%xmm4 \n"
3944 "punpcklbw %%xmm1,%%xmm2 \n"
3945 "movdqa %%xmm2,%%xmm3 \n"
3946 "punpcklwd %%xmm1,%%xmm2 \n"
3947 "punpckhwd %%xmm1,%%xmm3 \n"
3948 "punpckhbw %%xmm1,%%xmm4 \n"
3949 "movdqa %%xmm4,%%xmm5 \n"
3950 "punpcklwd %%xmm1,%%xmm4 \n"
3951 "punpckhwd %%xmm1,%%xmm5 \n"
// For each pixel in turn: running += pixel; out = previous + running.
3952 "paddd %%xmm2,%%xmm0 \n"
3953 "movdqa (%1,%2,1),%%xmm2 \n"
3954 "paddd %%xmm0,%%xmm2 \n"
3955 "paddd %%xmm3,%%xmm0 \n"
3956 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
3957 "paddd %%xmm0,%%xmm3 \n"
3958 "paddd %%xmm4,%%xmm0 \n"
3959 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
3960 "paddd %%xmm0,%%xmm4 \n"
3961 "paddd %%xmm5,%%xmm0 \n"
3962 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
3963 "paddd %%xmm0,%%xmm5 \n"
3964 "movdqa %%xmm2,(%1) \n"
3965 "movdqa %%xmm3,0x10(%1) \n"
3966 "movdqa %%xmm4,0x20(%1) \n"
3967 "movdqa %%xmm5,0x30(%1) \n"
3968 "lea 0x40(%1),%1 \n"
3969 "sub $0x4,%3 \n"
3970 "jge 40b \n"
3971
3972 "49: \n"
// Rebias the counter to (remaining pixels - 1) for the 1 pixel loop.
3973 "add $0x3,%3 \n"
3974 "jl 19f \n"
3975
3976 // 1 pixel loop.
3977 ".p2align 2 \n"
3978 "10: \n"
3979 "movd (%0),%%xmm2 \n"
3980 "lea 0x4(%0),%0 \n"
fbarchard@google.comf38aefe2012-06-12 00:29:29 +00003981 "punpcklbw %%xmm1,%%xmm2 \n"
3982 "punpcklwd %%xmm1,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003983 "paddd %%xmm2,%%xmm0 \n"
3984 "movdqu (%1,%2,1),%%xmm2 \n"
3985 "paddd %%xmm0,%%xmm2 \n"
3986 "movdqu %%xmm2,(%1) \n"
3987 "lea 0x10(%1),%1 \n"
3988 "sub $0x1,%3 \n"
3989 "jge 10b \n"
3990
3991 "19: \n"
3992 : "+r"(row), // %0
3993 "+r"(cumsum), // %1
3994 "+r"(previous_cumsum), // %2
3995 "+r"(width) // %3
3996 :
3997 : "memory", "cc"
3998#if defined(__SSE2__)
3999 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4000#endif
4001 );
4002}
4003#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
4004
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00004005#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Converts cumulative sums into averaged bytes. For each of count pixels and
// each of its four int32 channels, with i indexing int32 elements:
//   sum = topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
//   dst = saturate_u8(round(sum * (1 / area)))
// where 1 / area is the approximate reciprocal produced by rcpss.
//   topleft/botleft - rows of int32 sums, read with movdqa and with memory
//                     operands of psubd/paddd, so both pointers and the
//                     addresses width elements further on must be 16-byte
//                     aligned.
//   width           - element offset between the left and right columns.
//   area            - divisor for the average.
//   dst             - output bytes, 4 per pixel (unaligned stores).
//   count           - number of pixels to produce.
4006void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4007 int width, int area, uint8* dst,
4008 int count) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004009 asm volatile (
// xmm4 = approximate 1.0f / area, broadcast to all four lanes.
4010 "movd %5,%%xmm4 \n"
4011 "cvtdq2ps %%xmm4,%%xmm4 \n"
4012 "rcpss %%xmm4,%%xmm4 \n"
4013 "pshufd $0x0,%%xmm4,%%xmm4 \n"
4014 "sub $0x4,%3 \n"
4015 "jl 49f \n"
4016
4017 // 4 pixel loop.
4018 ".p2align 2 \n"
4019 "40: \n"
4020 "movdqa (%0),%%xmm0 \n"
4021 "movdqa 0x10(%0),%%xmm1 \n"
4022 "movdqa 0x20(%0),%%xmm2 \n"
4023 "movdqa 0x30(%0),%%xmm3 \n"
4024 "psubd (%0,%4,4),%%xmm0 \n"
4025 "psubd 0x10(%0,%4,4),%%xmm1 \n"
4026 "psubd 0x20(%0,%4,4),%%xmm2 \n"
4027 "psubd 0x30(%0,%4,4),%%xmm3 \n"
4028 "lea 0x40(%0),%0 \n"
4029 "psubd (%1),%%xmm0 \n"
4030 "psubd 0x10(%1),%%xmm1 \n"
4031 "psubd 0x20(%1),%%xmm2 \n"
4032 "psubd 0x30(%1),%%xmm3 \n"
4033 "paddd (%1,%4,4),%%xmm0 \n"
4034 "paddd 0x10(%1,%4,4),%%xmm1 \n"
4035 "paddd 0x20(%1,%4,4),%%xmm2 \n"
4036 "paddd 0x30(%1,%4,4),%%xmm3 \n"
4037 "lea 0x40(%1),%1 \n"
// Scale in float, convert back to int32, then narrow to bytes.
4038 "cvtdq2ps %%xmm0,%%xmm0 \n"
4039 "cvtdq2ps %%xmm1,%%xmm1 \n"
4040 "mulps %%xmm4,%%xmm0 \n"
4041 "mulps %%xmm4,%%xmm1 \n"
4042 "cvtdq2ps %%xmm2,%%xmm2 \n"
4043 "cvtdq2ps %%xmm3,%%xmm3 \n"
4044 "mulps %%xmm4,%%xmm2 \n"
4045 "mulps %%xmm4,%%xmm3 \n"
4046 "cvtps2dq %%xmm0,%%xmm0 \n"
4047 "cvtps2dq %%xmm1,%%xmm1 \n"
4048 "cvtps2dq %%xmm2,%%xmm2 \n"
4049 "cvtps2dq %%xmm3,%%xmm3 \n"
4050 "packssdw %%xmm1,%%xmm0 \n"
4051 "packssdw %%xmm3,%%xmm2 \n"
4052 "packuswb %%xmm2,%%xmm0 \n"
4053 "movdqu %%xmm0,(%2) \n"
4054 "lea 0x10(%2),%2 \n"
4055 "sub $0x4,%3 \n"
4056 "jge 40b \n"
4057
4058 "49: \n"
// Rebias the counter to (remaining pixels - 1) for the 1 pixel loop.
4059 "add $0x3,%3 \n"
4060 "jl 19f \n"
4061
4062 // 1 pixel loop.
4063 ".p2align 2 \n"
4064 "10: \n"
4065 "movdqa (%0),%%xmm0 \n"
4066 "psubd (%0,%4,4),%%xmm0 \n"
4067 "lea 0x10(%0),%0 \n"
4068 "psubd (%1),%%xmm0 \n"
4069 "paddd (%1,%4,4),%%xmm0 \n"
4070 "lea 0x10(%1),%1 \n"
4071 "cvtdq2ps %%xmm0,%%xmm0 \n"
4072 "mulps %%xmm4,%%xmm0 \n"
4073 "cvtps2dq %%xmm0,%%xmm0 \n"
4074 "packssdw %%xmm0,%%xmm0 \n"
4075 "packuswb %%xmm0,%%xmm0 \n"
4076 "movd %%xmm0,(%2) \n"
4077 "lea 0x4(%2),%2 \n"
4078 "sub $0x1,%3 \n"
4079 "jge 10b \n"
4080 "19: \n"
4081 : "+r"(topleft), // %0
4082 "+r"(botleft), // %1
4083 "+r"(dst), // %2
4084 "+rm"(count) // %3
4085 : "r"(static_cast<intptr_t>(width)), // %4
4086 "rm"(area) // %5
4087 : "memory", "cc"
4088#if defined(__SSE2__)
4089 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
4090#endif
4091 );
4092}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00004093#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Each byte of every pixel is multiplied by the matching byte of value and
// the product is scaled back to 8 bits.
// Aligned to 16 bytes: src_argb and dst_argb are accessed with movdqa.
// The loop is bottom tested, so 4 pixels are processed even if width <= 0.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd       %3,%%xmm2               \n"
    // Turn dst into an offset from src so one pointer advances both.
    "sub        %0,%1                   \n"
    // Replicate each byte of value into both halves of a 16 bit word, then
    // copy the 4 words to the upper half of the register.
    "punpcklbw  %%xmm2,%%xmm2           \n"
    "punpcklqdq %%xmm2,%%xmm2           \n"

    // 4 pixel loop.
    ".p2align   2                       \n"
  "1:                                   \n"
    "movdqa     (%0),%%xmm0             \n"
    "movdqa     %%xmm0,%%xmm1           \n"
    // Widen pixel bytes the same way (byte b becomes word b * 0x101).
    "punpcklbw  %%xmm0,%%xmm0           \n"
    "punpckhbw  %%xmm1,%%xmm1           \n"
    // High 16 bits of the unsigned product, then drop 8 more bits.
    "pmulhuw    %%xmm2,%%xmm0           \n"
    "pmulhuw    %%xmm2,%%xmm1           \n"
    "psrlw      $0x8,%%xmm0             \n"
    "psrlw      $0x8,%%xmm1             \n"
    "packuswb   %%xmm1,%%xmm0           \n"
    // movdqa and lea leave the flags from this sub intact for the jg.
    "sub        $0x4,%2                 \n"
    "movdqa     %%xmm0,(%0,%1,1)        \n"
    "lea        0x10(%0),%0             \n"
    "jg         1b                      \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004132
#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// Copy ARGB pixels from source image with slope to a row of destination.
// src_dudv points at 4 floats: the first pair is the starting coordinate and
// the second pair is the per pixel step. For each destination pixel the
// coordinate is truncated to integers and the 4 byte pixel at
//   src_argb + first * 4 + second * src_argb_stride
// is copied to dst_argb.
// Limits visible in the code:
//  - coordinates are narrowed with packssdw and the stride is placed in a
//    16 bit lane for pmaddwd, so both are handled as signed 16 bit values.
//  - in 64 bit builds the first offset of each pair is masked with
//    0x0fffffff, so it must be non-negative and below 2^28.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd  %%xmm0,%1

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  // %1 starts as the stride and is reused as a scratch offset register.
  intptr_t src_argb_stride_temp = src_argb_stride;
  // Second scratch offset register (%5).
  intptr_t temp = 0;
  asm volatile (
    // xmm2 = start coordinate pair, xmm7 = step pair.
    "movq       (%3),%%xmm2             \n"
    "movq       0x8(%3),%%xmm7          \n"
    // xmm5 low dword = (stride << 16) | 4, the pmaddwd multiplier pair.
    "shl        $0x10,%1                \n"
    "add        $0x4,%1                 \n"
    "movd       %1,%%xmm5               \n"
    "sub        $0x4,%4                 \n"
    "jl         49f                     \n"

    // Set up for 4 pixels per iteration: xmm2 = coords of pixels 0 and 1,
    // xmm3 = coords of pixels 2 and 3, xmm4 = 4 * step in every pair.
    "pshufd     $0x44,%%xmm7,%%xmm7     \n"
    "pshufd     $0x0,%%xmm5,%%xmm5      \n"
    "movdqa     %%xmm2,%%xmm0           \n"
    "addps      %%xmm7,%%xmm0           \n"
    "movlhps    %%xmm0,%%xmm2           \n"
    "movdqa     %%xmm7,%%xmm4           \n"
    "addps      %%xmm4,%%xmm4           \n"
    "movdqa     %%xmm2,%%xmm3           \n"
    "addps      %%xmm4,%%xmm3           \n"
    "addps      %%xmm4,%%xmm4           \n"

    // 4 pixel loop.
    ".p2align   4                       \n"
  "40:                                  \n"
    // Truncate 4 coordinate pairs and turn them into 4 byte offsets.
    "cvttps2dq  %%xmm2,%%xmm0           \n"
    "cvttps2dq  %%xmm3,%%xmm1           \n"
    "packssdw   %%xmm1,%%xmm0           \n"
    "pmaddwd    %%xmm5,%%xmm0           \n"
#if defined(__x86_64__)
    // One 64 bit move fetches two offsets; split them into %1 and %5.
    "movd       %%xmm0,%1               \n"
    "mov        %1,%5                   \n"
    "and        $0x0fffffff,%1          \n"
    "shr        $32,%5                  \n"
    "pshufd     $0xEE,%%xmm0,%%xmm0     \n"
#else
    "movd       %%xmm0,%1               \n"
    "pshufd     $0x39,%%xmm0,%%xmm0     \n"
    "movd       %%xmm0,%5               \n"
    "pshufd     $0x39,%%xmm0,%%xmm0     \n"
#endif
    "movd       (%0,%1,1),%%xmm1        \n"
    "movd       (%0,%5,1),%%xmm6        \n"
    "punpckldq  %%xmm6,%%xmm1           \n"
    "addps      %%xmm4,%%xmm2           \n"
    "movq       %%xmm1,(%2)             \n"
#if defined(__x86_64__)
    "movd       %%xmm0,%1               \n"
    "mov        %1,%5                   \n"
    "and        $0x0fffffff,%1          \n"
    "shr        $32,%5                  \n"
#else
    "movd       %%xmm0,%1               \n"
    "pshufd     $0x39,%%xmm0,%%xmm0     \n"
    "movd       %%xmm0,%5               \n"
#endif
    "movd       (%0,%1,1),%%xmm0        \n"
    "movd       (%0,%5,1),%%xmm6        \n"
    "punpckldq  %%xmm6,%%xmm0           \n"
    "addps      %%xmm4,%%xmm3           \n"
    // movq and lea leave the flags from this sub intact for the jge.
    "sub        $0x4,%4                 \n"
    "movq       %%xmm0,0x08(%2)         \n"
    "lea        0x10(%2),%2             \n"
    "jge        40b                     \n"

  "49:                                  \n"
    // Undo the -4 bias, leaving (remaining - 1) for the 1 pixel loop.
    "add        $0x3,%4                 \n"
    "jl         19f                     \n"

    // 1 pixel loop. Only the low lanes of xmm2, xmm5 and xmm7 are used,
    // so it also works when the 4 pixel setup above was skipped.
    ".p2align   4                       \n"
  "10:                                  \n"
    "cvttps2dq  %%xmm2,%%xmm0           \n"
    "packssdw   %%xmm0,%%xmm0           \n"
    "pmaddwd    %%xmm5,%%xmm0           \n"
    "addps      %%xmm7,%%xmm2           \n"
    "movd       %%xmm0,%1               \n"
#if defined(__x86_64__)
    "and        $0x0fffffff,%1          \n"
#endif
    "movd       (%0,%1,1),%%xmm0        \n"
    "sub        $0x1,%4                 \n"
    "movd       %%xmm0,(%2)             \n"
    "lea        0x4(%2),%2              \n"
    "jge        10b                     \n"
  "19:                                  \n"
  : "+r"(src_argb),              // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),              // %2
    "+r"(src_dudv),              // %3
    "+rm"(width),                // %4
    "+r"(temp)                   // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
4242
fbarchard@google.comb5491752012-11-20 09:44:46 +00004243// Bilinear image filtering.
4244// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
4245void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004246 ptrdiff_t src_stride, int dst_width,
4247 int source_y_fraction) {
4248 asm volatile (
4249 "sub %1,%0 \n"
4250 "shr %3 \n"
4251 "cmp $0x0,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004252 "je 100f \n"
4253 "cmp $0x20,%3 \n"
4254 "je 75f \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004255 "cmp $0x40,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004256 "je 50f \n"
4257 "cmp $0x60,%3 \n"
4258 "je 25f \n"
4259
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004260 "movd %3,%%xmm0 \n"
4261 "neg %3 \n"
4262 "add $0x80,%3 \n"
4263 "movd %3,%%xmm5 \n"
4264 "punpcklbw %%xmm0,%%xmm5 \n"
4265 "punpcklwd %%xmm5,%%xmm5 \n"
4266 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004267
4268 // General purpose row blend.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004269 ".p2align 4 \n"
4270 "1: \n"
4271 "movdqa (%1),%%xmm0 \n"
4272 "movdqa (%1,%4,1),%%xmm2 \n"
4273 "movdqa %%xmm0,%%xmm1 \n"
4274 "punpcklbw %%xmm2,%%xmm0 \n"
4275 "punpckhbw %%xmm2,%%xmm1 \n"
4276 "pmaddubsw %%xmm5,%%xmm0 \n"
4277 "pmaddubsw %%xmm5,%%xmm1 \n"
4278 "psrlw $0x7,%%xmm0 \n"
4279 "psrlw $0x7,%%xmm1 \n"
4280 "packuswb %%xmm1,%%xmm0 \n"
4281 "sub $0x4,%2 \n"
4282 "movdqa %%xmm0,(%1,%0,1) \n"
4283 "lea 0x10(%1),%1 \n"
4284 "jg 1b \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004285 "jmp 99f \n"
4286
4287 // Blend 25 / 75.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004288 ".p2align 4 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004289 "25: \n"
4290 "movdqa (%1),%%xmm0 \n"
4291 "movdqa (%1,%4,1),%%xmm1 \n"
4292 "pavgb %%xmm1,%%xmm0 \n"
4293 "pavgb %%xmm1,%%xmm0 \n"
4294 "sub $0x4,%2 \n"
4295 "movdqa %%xmm0,(%1,%0,1) \n"
4296 "lea 0x10(%1),%1 \n"
4297 "jg 25b \n"
4298 "jmp 99f \n"
4299
4300 // Blend 50 / 50.
4301 ".p2align 4 \n"
4302 "50: \n"
4303 "movdqa (%1),%%xmm0 \n"
4304 "movdqa (%1,%4,1),%%xmm1 \n"
4305 "pavgb %%xmm1,%%xmm0 \n"
4306 "sub $0x4,%2 \n"
4307 "movdqa %%xmm0,(%1,%0,1) \n"
4308 "lea 0x10(%1),%1 \n"
4309 "jg 50b \n"
4310 "jmp 99f \n"
4311
4312 // Blend 75 / 25.
4313 ".p2align 4 \n"
4314 "75: \n"
4315 "movdqa (%1),%%xmm1 \n"
4316 "movdqa (%1,%4,1),%%xmm0 \n"
4317 "pavgb %%xmm1,%%xmm0 \n"
4318 "pavgb %%xmm1,%%xmm0 \n"
4319 "sub $0x4,%2 \n"
4320 "movdqa %%xmm0,(%1,%0,1) \n"
4321 "lea 0x10(%1),%1 \n"
4322 "jg 75b \n"
4323 "jmp 99f \n"
4324
4325 // Blend 100 / 0 - Copy row unchanged.
4326 ".p2align 4 \n"
4327 "100: \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004328 "movdqa (%1),%%xmm0 \n"
4329 "sub $0x4,%2 \n"
4330 "movdqa %%xmm0,(%1,%0,1) \n"
4331 "lea 0x10(%1),%1 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004332 "jg 100b \n"
4333
4334 // Extrude last pixel.
4335 "99: \n"
4336 : "+r"(dst_argb), // %0
4337 "+r"(src_argb), // %1
4338 "+r"(dst_width), // %2
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004339 "+r"(source_y_fraction) // %3
4340 : "r"(static_cast<intptr_t>(src_stride)) // %4
4341 : "memory", "cc"
4342#if defined(__SSE2__)
4343 , "xmm0", "xmm1", "xmm2", "xmm5"
4344#endif
4345 );
4346}
4347
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004348void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4349 uint8* dst_uv, int pix) {
4350 asm volatile (
4351 "sub %0,%1 \n"
4352 ".p2align 4 \n"
4353 "1: \n"
4354 "movdqa (%0),%%xmm0 \n"
4355 "pavgb (%0,%3),%%xmm0 \n"
4356 "sub $0x10,%2 \n"
4357 "movdqa %%xmm0,(%0,%1) \n"
4358 "lea 0x10(%0),%0 \n"
4359 "jg 1b \n"
4360 : "+r"(src_uv), // %0
4361 "+r"(dst_uv), // %1
4362 "+r"(pix) // %2
4363 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4364 : "memory", "cc"
4365#if defined(__SSE2__)
4366 , "xmm0"
4367#endif
4368 );
4369}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004370
4371void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4372 uint32 selector, int pix) {
4373 asm volatile (
4374 "movd %3,%%xmm5 \n"
4375 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4376 ".p2align 4 \n"
4377 "1: \n"
4378 "movdqa (%0),%%xmm0 \n"
4379 "lea 0x10(%0),%0 \n"
4380 "pshufb %%xmm5,%%xmm0 \n"
4381 "sub $0x4,%2 \n"
4382 "movd %%xmm0,(%1) \n"
4383 "lea 0x4(%1),%1 \n"
4384 "jg 1b \n"
4385 : "+r"(src_argb), // %0
4386 "+r"(dst_bayer), // %1
4387 "+r"(pix) // %2
4388 : "g"(selector) // %3
4389 : "memory", "cc"
4390#if defined(__SSE2__)
4391 , "xmm0", "xmm5"
4392#endif
4393 );
4394}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004395
4396void I422ToYUY2Row_SSE2(const uint8* src_y,
4397 const uint8* src_u,
4398 const uint8* src_v,
4399 uint8* dst_frame, int width) {
4400 asm volatile (
4401 "sub %1,%2 \n"
4402 ".p2align 4 \n"
4403 "1: \n"
4404 "movq (%1),%%xmm2 \n"
4405 "movq (%1,%2,1),%%xmm3 \n"
4406 "lea 0x8(%1),%1 \n"
4407 "punpcklbw %%xmm3,%%xmm2 \n"
4408 "movdqa (%0),%%xmm0 \n"
4409 "lea 0x10(%0),%0 \n"
4410 "movdqa %%xmm0,%%xmm1 \n"
4411 "punpcklbw %%xmm2,%%xmm0 \n"
4412 "punpckhbw %%xmm2,%%xmm1 \n"
4413 "movdqa %%xmm0,(%3) \n"
4414 "movdqa %%xmm1,0x10(%3) \n"
4415 "lea 0x20(%3),%3 \n"
4416 "sub $0x10,%4 \n"
4417 "jg 1b \n"
4418 : "+r"(src_y), // %0
4419 "+r"(src_u), // %1
4420 "+r"(src_v), // %2
4421 "+r"(dst_frame), // %3
4422 "+rm"(width) // %4
4423 :
4424 : "memory", "cc"
4425#if defined(__SSE2__)
4426 , "xmm0", "xmm1", "xmm2", "xmm3"
4427#endif
4428 );
4429}
4430
4431void I422ToUYVYRow_SSE2(const uint8* src_y,
4432 const uint8* src_u,
4433 const uint8* src_v,
4434 uint8* dst_frame, int width) {
4435 asm volatile (
4436 "sub %1,%2 \n"
4437 ".p2align 4 \n"
4438 "1: \n"
4439 "movq (%1),%%xmm2 \n"
4440 "movq (%1,%2,1),%%xmm3 \n"
4441 "lea 0x8(%1),%1 \n"
4442 "punpcklbw %%xmm3,%%xmm2 \n"
4443 "movdqa (%0),%%xmm0 \n"
4444 "movdqa %%xmm2,%%xmm1 \n"
4445 "lea 0x10(%0),%0 \n"
4446 "punpcklbw %%xmm0,%%xmm1 \n"
4447 "punpckhbw %%xmm0,%%xmm2 \n"
4448 "movdqa %%xmm1,(%3) \n"
4449 "movdqa %%xmm2,0x10(%3) \n"
4450 "lea 0x20(%3),%3 \n"
4451 "sub $0x10,%4 \n"
4452 "jg 1b \n"
4453 : "+r"(src_y), // %0
4454 "+r"(src_u), // %1
4455 "+r"(src_v), // %2
4456 "+r"(dst_frame), // %3
4457 "+rm"(width) // %4
4458 :
4459 : "memory", "cc"
4460#if defined(__SSE2__)
4461 , "xmm0", "xmm1", "xmm2", "xmm3"
4462#endif
4463 );
4464}
4465
fbarchard@google.com2d11d432012-02-16 02:50:39 +00004466#endif // defined(__x86_64__) || defined(__i386__)
4467
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004468#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00004469} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004470} // namespace libyuv
4471#endif