blob: 267cd4b7a2acf3e5fa0d01b7ba44845f17aae445 [file] [log] [blame]
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// Storage qualifier used for the constant tables in this file.
// GCC 4.2 on OSX has link error when passing static or const to inline, so on
// Apple targets the tables are declared with no qualifier at all; everywhere
// else they are `static const`.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
fbarchard@google.com4de0c432012-10-11 01:25:46 +000072// Constants for RGBA.
73CONST vec8 kRGBAToY = {
74 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
75};
76
77CONST vec8 kRGBAToU = {
78 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
79};
80
81CONST vec8 kRGBAToV = {
82 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
83};
84
fbarchard@google.com714050a2012-02-17 22:59:56 +000085CONST uvec8 kAddY16 = {
86 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000087};
fbarchard@google.com2430e042011-11-11 21:57:06 +000088
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000089CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
92};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000093
fbarchard@google.comba1f5262012-01-12 19:22:41 +000094// Shuffle table for converting RGB24 to ARGB.
95CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000096 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
97};
98
99// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000100CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000101 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
102};
103
fbarchard@google.comb6149762011-11-07 21:58:52 +0000104// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000105CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000106 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
107};
108
109// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000110CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
112};
113
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000114// Shuffle table for converting RGBA to ARGB.
115CONST uvec8 kShuffleMaskRGBAToARGB = {
116 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
117};
118
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000119// Shuffle table for converting ARGB to RGBA.
120CONST uvec8 kShuffleMaskARGBToRGBA = {
121 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
122};
123
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000124// Shuffle table for converting ARGB to RGB24.
125CONST uvec8 kShuffleMaskARGBToRGB24 = {
126 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
127};
128
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000129// Shuffle table for converting ARGB to RAW.
130CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000131 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000132};
133
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000134// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000135CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
137};
138
139// Shuffle table for converting ARGB to RAW.
140CONST uvec8 kShuffleMaskARGBToRAW_0 = {
141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142};
143
fbarchard@google.comb6149762011-11-07 21:58:52 +0000144void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000145 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pcmpeqb %%xmm5,%%xmm5 \n"
147 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000148 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "1: \n"
150 "movq (%0),%%xmm0 \n"
151 "lea 0x8(%0),%0 \n"
152 "punpcklbw %%xmm0,%%xmm0 \n"
153 "movdqa %%xmm0,%%xmm1 \n"
154 "punpcklwd %%xmm0,%%xmm0 \n"
155 "punpckhwd %%xmm1,%%xmm1 \n"
156 "por %%xmm5,%%xmm0 \n"
157 "por %%xmm5,%%xmm1 \n"
158 "movdqa %%xmm0,(%1) \n"
159 "movdqa %%xmm1,0x10(%1) \n"
160 "lea 0x20(%1),%1 \n"
161 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000162 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163 : "+r"(src_y), // %0
164 "+r"(dst_argb), // %1
165 "+r"(pix) // %2
166 :
167 : "memory", "cc"
168#if defined(__SSE2__)
169 , "xmm0", "xmm1", "xmm5"
170#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000171 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000172}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173
174void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000175 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000176 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000177 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000178 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000179 "1: \n"
180 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000181 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000182 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000183 "movdqa %%xmm0,(%0,%1,1) \n"
184 "lea 0x10(%0),%0 \n"
185 "jg 1b \n"
186
fbarchard@google.comb6149762011-11-07 21:58:52 +0000187 : "+r"(src_abgr), // %0
188 "+r"(dst_argb), // %1
189 "+r"(pix) // %2
190 : "m"(kShuffleMaskABGRToARGB) // %3
191 : "memory", "cc"
192#if defined(__SSE2__)
193 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000194#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000195 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000196}
197
198void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000199 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000200 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000201 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000202 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000203 "1: \n"
204 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000205 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000206 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000207 "movdqa %%xmm0,(%0,%1,1) \n"
208 "lea 0x10(%0),%0 \n"
209 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000210 : "+r"(src_bgra), // %0
211 "+r"(dst_argb), // %1
212 "+r"(pix) // %2
213 : "m"(kShuffleMaskBGRAToARGB) // %3
214 : "memory", "cc"
215#if defined(__SSE2__)
216 , "xmm0", "xmm5"
217#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000218 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000219}
220
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000221void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
222 asm volatile (
223 "movdqa %3,%%xmm5 \n"
224 "sub %0,%1 \n"
225 ".p2align 4 \n"
226 "1: \n"
227 "movdqa (%0),%%xmm0 \n"
228 "pshufb %%xmm5,%%xmm0 \n"
229 "sub $0x4,%2 \n"
230 "movdqa %%xmm0,(%0,%1,1) \n"
231 "lea 0x10(%0),%0 \n"
232 "jg 1b \n"
233
234 : "+r"(src_rgba), // %0
235 "+r"(dst_argb), // %1
236 "+r"(pix) // %2
237 : "m"(kShuffleMaskRGBAToARGB) // %3
238 : "memory", "cc"
239#if defined(__SSE2__)
240 , "xmm0", "xmm5"
241#endif
242 );
243}
244
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000245void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
246 asm volatile (
247 "movdqa %3,%%xmm5 \n"
248 "sub %0,%1 \n"
249 ".p2align 4 \n"
250 "1: \n"
251 "movdqa (%0),%%xmm0 \n"
252 "pshufb %%xmm5,%%xmm0 \n"
253 "sub $0x4,%2 \n"
254 "movdqa %%xmm0,(%0,%1,1) \n"
255 "lea 0x10(%0),%0 \n"
256 "jg 1b \n"
257
258 : "+r"(src_argb), // %0
259 "+r"(dst_rgba), // %1
260 "+r"(pix) // %2
261 : "m"(kShuffleMaskARGBToRGBA) // %3
262 : "memory", "cc"
263#if defined(__SSE2__)
264 , "xmm0", "xmm5"
265#endif
266 );
267}
268
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000269void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000270 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000271 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
272 "pslld $0x18,%%xmm5 \n"
273 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000274 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000275 "1: \n"
276 "movdqu (%0),%%xmm0 \n"
277 "movdqu 0x10(%0),%%xmm1 \n"
278 "movdqu 0x20(%0),%%xmm3 \n"
279 "lea 0x30(%0),%0 \n"
280 "movdqa %%xmm3,%%xmm2 \n"
281 "palignr $0x8,%%xmm1,%%xmm2 \n"
282 "pshufb %%xmm4,%%xmm2 \n"
283 "por %%xmm5,%%xmm2 \n"
284 "palignr $0xc,%%xmm0,%%xmm1 \n"
285 "pshufb %%xmm4,%%xmm0 \n"
286 "movdqa %%xmm2,0x20(%1) \n"
287 "por %%xmm5,%%xmm0 \n"
288 "pshufb %%xmm4,%%xmm1 \n"
289 "movdqa %%xmm0,(%1) \n"
290 "por %%xmm5,%%xmm1 \n"
291 "palignr $0x4,%%xmm3,%%xmm3 \n"
292 "pshufb %%xmm4,%%xmm3 \n"
293 "movdqa %%xmm1,0x10(%1) \n"
294 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000295 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000296 "movdqa %%xmm3,0x30(%1) \n"
297 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000298 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000299 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000300 "+r"(dst_argb), // %1
301 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000302 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000303 : "memory", "cc"
304#if defined(__SSE2__)
305 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
306#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000307 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000308}
309
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000310void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000311 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000312 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
313 "pslld $0x18,%%xmm5 \n"
314 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000315 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000316 "1: \n"
317 "movdqu (%0),%%xmm0 \n"
318 "movdqu 0x10(%0),%%xmm1 \n"
319 "movdqu 0x20(%0),%%xmm3 \n"
320 "lea 0x30(%0),%0 \n"
321 "movdqa %%xmm3,%%xmm2 \n"
322 "palignr $0x8,%%xmm1,%%xmm2 \n"
323 "pshufb %%xmm4,%%xmm2 \n"
324 "por %%xmm5,%%xmm2 \n"
325 "palignr $0xc,%%xmm0,%%xmm1 \n"
326 "pshufb %%xmm4,%%xmm0 \n"
327 "movdqa %%xmm2,0x20(%1) \n"
328 "por %%xmm5,%%xmm0 \n"
329 "pshufb %%xmm4,%%xmm1 \n"
330 "movdqa %%xmm0,(%1) \n"
331 "por %%xmm5,%%xmm1 \n"
332 "palignr $0x4,%%xmm3,%%xmm3 \n"
333 "pshufb %%xmm4,%%xmm3 \n"
334 "movdqa %%xmm1,0x10(%1) \n"
335 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000336 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000337 "movdqa %%xmm3,0x30(%1) \n"
338 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000339 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000340 : "+r"(src_raw), // %0
341 "+r"(dst_argb), // %1
342 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000343 : "m"(kShuffleMaskRAWToARGB) // %3
344 : "memory", "cc"
345#if defined(__SSE2__)
346 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
347#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000348 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000349}
350
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000351void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000352 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000353 "mov $0x1080108,%%eax \n"
354 "movd %%eax,%%xmm5 \n"
355 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000356 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000357 "movd %%eax,%%xmm6 \n"
358 "pshufd $0x0,%%xmm6,%%xmm6 \n"
359 "pcmpeqb %%xmm3,%%xmm3 \n"
360 "psllw $0xb,%%xmm3 \n"
361 "pcmpeqb %%xmm4,%%xmm4 \n"
362 "psllw $0xa,%%xmm4 \n"
363 "psrlw $0x5,%%xmm4 \n"
364 "pcmpeqb %%xmm7,%%xmm7 \n"
365 "psllw $0x8,%%xmm7 \n"
366 "sub %0,%1 \n"
367 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000368 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000369 "1: \n"
370 "movdqu (%0),%%xmm0 \n"
371 "movdqa %%xmm0,%%xmm1 \n"
372 "movdqa %%xmm0,%%xmm2 \n"
373 "pand %%xmm3,%%xmm1 \n"
374 "psllw $0xb,%%xmm2 \n"
375 "pmulhuw %%xmm5,%%xmm1 \n"
376 "pmulhuw %%xmm5,%%xmm2 \n"
377 "psllw $0x8,%%xmm1 \n"
378 "por %%xmm2,%%xmm1 \n"
379 "pand %%xmm4,%%xmm0 \n"
380 "pmulhuw %%xmm6,%%xmm0 \n"
381 "por %%xmm7,%%xmm0 \n"
382 "movdqa %%xmm1,%%xmm2 \n"
383 "punpcklbw %%xmm0,%%xmm1 \n"
384 "punpckhbw %%xmm0,%%xmm2 \n"
385 "movdqa %%xmm1,(%1,%0,2) \n"
386 "movdqa %%xmm2,0x10(%1,%0,2) \n"
387 "lea 0x10(%0),%0 \n"
388 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000389 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000390 : "+r"(src), // %0
391 "+r"(dst), // %1
392 "+r"(pix) // %2
393 :
394 : "memory", "cc", "eax"
395#if defined(__SSE2__)
396 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
397#endif
398 );
399}
400
401void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000402 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000403 "mov $0x1080108,%%eax \n"
404 "movd %%eax,%%xmm5 \n"
405 "pshufd $0x0,%%xmm5,%%xmm5 \n"
406 "mov $0x42004200,%%eax \n"
407 "movd %%eax,%%xmm6 \n"
408 "pshufd $0x0,%%xmm6,%%xmm6 \n"
409 "pcmpeqb %%xmm3,%%xmm3 \n"
410 "psllw $0xb,%%xmm3 \n"
411 "movdqa %%xmm3,%%xmm4 \n"
412 "psrlw $0x6,%%xmm4 \n"
413 "pcmpeqb %%xmm7,%%xmm7 \n"
414 "psllw $0x8,%%xmm7 \n"
415 "sub %0,%1 \n"
416 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000417 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000418 "1: \n"
419 "movdqu (%0),%%xmm0 \n"
420 "movdqa %%xmm0,%%xmm1 \n"
421 "movdqa %%xmm0,%%xmm2 \n"
422 "psllw $0x1,%%xmm1 \n"
423 "psllw $0xb,%%xmm2 \n"
424 "pand %%xmm3,%%xmm1 \n"
425 "pmulhuw %%xmm5,%%xmm2 \n"
426 "pmulhuw %%xmm5,%%xmm1 \n"
427 "psllw $0x8,%%xmm1 \n"
428 "por %%xmm2,%%xmm1 \n"
429 "movdqa %%xmm0,%%xmm2 \n"
430 "pand %%xmm4,%%xmm0 \n"
431 "psraw $0x8,%%xmm2 \n"
432 "pmulhuw %%xmm6,%%xmm0 \n"
433 "pand %%xmm7,%%xmm2 \n"
434 "por %%xmm2,%%xmm0 \n"
435 "movdqa %%xmm1,%%xmm2 \n"
436 "punpcklbw %%xmm0,%%xmm1 \n"
437 "punpckhbw %%xmm0,%%xmm2 \n"
438 "movdqa %%xmm1,(%1,%0,2) \n"
439 "movdqa %%xmm2,0x10(%1,%0,2) \n"
440 "lea 0x10(%0),%0 \n"
441 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000442 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000443 : "+r"(src), // %0
444 "+r"(dst), // %1
445 "+r"(pix) // %2
446 :
447 : "memory", "cc", "eax"
448#if defined(__SSE2__)
449 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
450#endif
451 );
452}
453
454void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000455 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000456 "mov $0xf0f0f0f,%%eax \n"
457 "movd %%eax,%%xmm4 \n"
458 "pshufd $0x0,%%xmm4,%%xmm4 \n"
459 "movdqa %%xmm4,%%xmm5 \n"
460 "pslld $0x4,%%xmm5 \n"
461 "sub %0,%1 \n"
462 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000463 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000464 "1: \n"
465 "movdqu (%0),%%xmm0 \n"
466 "movdqa %%xmm0,%%xmm2 \n"
467 "pand %%xmm4,%%xmm0 \n"
468 "pand %%xmm5,%%xmm2 \n"
469 "movdqa %%xmm0,%%xmm1 \n"
470 "movdqa %%xmm2,%%xmm3 \n"
471 "psllw $0x4,%%xmm1 \n"
472 "psrlw $0x4,%%xmm3 \n"
473 "por %%xmm1,%%xmm0 \n"
474 "por %%xmm3,%%xmm2 \n"
475 "movdqa %%xmm0,%%xmm1 \n"
476 "punpcklbw %%xmm2,%%xmm0 \n"
477 "punpckhbw %%xmm2,%%xmm1 \n"
478 "movdqa %%xmm0,(%1,%0,2) \n"
479 "movdqa %%xmm1,0x10(%1,%0,2) \n"
480 "lea 0x10(%0),%0 \n"
481 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000482 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000483 : "+r"(src), // %0
484 "+r"(dst), // %1
485 "+r"(pix) // %2
486 :
487 : "memory", "cc", "eax"
488#if defined(__SSE2__)
489 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
490#endif
491 );
492}
493
494void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000495 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000496 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000497 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000498 "1: \n"
499 "movdqa (%0),%%xmm0 \n"
500 "movdqa 0x10(%0),%%xmm1 \n"
501 "movdqa 0x20(%0),%%xmm2 \n"
502 "movdqa 0x30(%0),%%xmm3 \n"
503 "lea 0x40(%0),%0 \n"
504 "pshufb %%xmm6,%%xmm0 \n"
505 "pshufb %%xmm6,%%xmm1 \n"
506 "pshufb %%xmm6,%%xmm2 \n"
507 "pshufb %%xmm6,%%xmm3 \n"
508 "movdqa %%xmm1,%%xmm4 \n"
509 "psrldq $0x4,%%xmm1 \n"
510 "pslldq $0xc,%%xmm4 \n"
511 "movdqa %%xmm2,%%xmm5 \n"
512 "por %%xmm4,%%xmm0 \n"
513 "pslldq $0x8,%%xmm5 \n"
514 "movdqa %%xmm0,(%1) \n"
515 "por %%xmm5,%%xmm1 \n"
516 "psrldq $0x8,%%xmm2 \n"
517 "pslldq $0x4,%%xmm3 \n"
518 "por %%xmm3,%%xmm2 \n"
519 "movdqa %%xmm1,0x10(%1) \n"
520 "movdqa %%xmm2,0x20(%1) \n"
521 "lea 0x30(%1),%1 \n"
522 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000523 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000524 : "+r"(src), // %0
525 "+r"(dst), // %1
526 "+r"(pix) // %2
527 : "m"(kShuffleMaskARGBToRGB24) // %3
528 : "memory", "cc"
529#if defined(__SSE2__)
530 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
531#endif
532 );
533}
534
535void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000536 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000537 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000538 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000539 "1: \n"
540 "movdqa (%0),%%xmm0 \n"
541 "movdqa 0x10(%0),%%xmm1 \n"
542 "movdqa 0x20(%0),%%xmm2 \n"
543 "movdqa 0x30(%0),%%xmm3 \n"
544 "lea 0x40(%0),%0 \n"
545 "pshufb %%xmm6,%%xmm0 \n"
546 "pshufb %%xmm6,%%xmm1 \n"
547 "pshufb %%xmm6,%%xmm2 \n"
548 "pshufb %%xmm6,%%xmm3 \n"
549 "movdqa %%xmm1,%%xmm4 \n"
550 "psrldq $0x4,%%xmm1 \n"
551 "pslldq $0xc,%%xmm4 \n"
552 "movdqa %%xmm2,%%xmm5 \n"
553 "por %%xmm4,%%xmm0 \n"
554 "pslldq $0x8,%%xmm5 \n"
555 "movdqa %%xmm0,(%1) \n"
556 "por %%xmm5,%%xmm1 \n"
557 "psrldq $0x8,%%xmm2 \n"
558 "pslldq $0x4,%%xmm3 \n"
559 "por %%xmm3,%%xmm2 \n"
560 "movdqa %%xmm1,0x10(%1) \n"
561 "movdqa %%xmm2,0x20(%1) \n"
562 "lea 0x30(%1),%1 \n"
563 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000564 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000565 : "+r"(src), // %0
566 "+r"(dst), // %1
567 "+r"(pix) // %2
568 : "m"(kShuffleMaskARGBToRAW) // %3
569 : "memory", "cc"
570#if defined(__SSE2__)
571 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
572#endif
573 );
574}
575
576void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000577 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000578 "pcmpeqb %%xmm3,%%xmm3 \n"
579 "psrld $0x1b,%%xmm3 \n"
580 "pcmpeqb %%xmm4,%%xmm4 \n"
581 "psrld $0x1a,%%xmm4 \n"
582 "pslld $0x5,%%xmm4 \n"
583 "pcmpeqb %%xmm5,%%xmm5 \n"
584 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000585 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000586 "1: \n"
587 "movdqa (%0),%%xmm0 \n"
588 "movdqa %%xmm0,%%xmm1 \n"
589 "movdqa %%xmm0,%%xmm2 \n"
590 "pslld $0x8,%%xmm0 \n"
591 "psrld $0x3,%%xmm1 \n"
592 "psrld $0x5,%%xmm2 \n"
593 "psrad $0x10,%%xmm0 \n"
594 "pand %%xmm3,%%xmm1 \n"
595 "pand %%xmm4,%%xmm2 \n"
596 "pand %%xmm5,%%xmm0 \n"
597 "por %%xmm2,%%xmm1 \n"
598 "por %%xmm1,%%xmm0 \n"
599 "packssdw %%xmm0,%%xmm0 \n"
600 "lea 0x10(%0),%0 \n"
601 "movq %%xmm0,(%1) \n"
602 "lea 0x8(%1),%1 \n"
603 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000604 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000605 : "+r"(src), // %0
606 "+r"(dst), // %1
607 "+r"(pix) // %2
608 :
609 : "memory", "cc"
610#if defined(__SSE2__)
611 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
612#endif
613 );
614}
615
616void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000617 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000618 "pcmpeqb %%xmm4,%%xmm4 \n"
619 "psrld $0x1b,%%xmm4 \n"
620 "movdqa %%xmm4,%%xmm5 \n"
621 "pslld $0x5,%%xmm5 \n"
622 "movdqa %%xmm4,%%xmm6 \n"
623 "pslld $0xa,%%xmm6 \n"
624 "pcmpeqb %%xmm7,%%xmm7 \n"
625 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000626 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000627 "1: \n"
628 "movdqa (%0),%%xmm0 \n"
629 "movdqa %%xmm0,%%xmm1 \n"
630 "movdqa %%xmm0,%%xmm2 \n"
631 "movdqa %%xmm0,%%xmm3 \n"
632 "psrad $0x10,%%xmm0 \n"
633 "psrld $0x3,%%xmm1 \n"
634 "psrld $0x6,%%xmm2 \n"
635 "psrld $0x9,%%xmm3 \n"
636 "pand %%xmm7,%%xmm0 \n"
637 "pand %%xmm4,%%xmm1 \n"
638 "pand %%xmm5,%%xmm2 \n"
639 "pand %%xmm6,%%xmm3 \n"
640 "por %%xmm1,%%xmm0 \n"
641 "por %%xmm3,%%xmm2 \n"
642 "por %%xmm2,%%xmm0 \n"
643 "packssdw %%xmm0,%%xmm0 \n"
644 "lea 0x10(%0),%0 \n"
645 "movq %%xmm0,(%1) \n"
646 "lea 0x8(%1),%1 \n"
647 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000648 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000649 : "+r"(src), // %0
650 "+r"(dst), // %1
651 "+r"(pix) // %2
652 :
653 : "memory", "cc"
654#if defined(__SSE2__)
655 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
656#endif
657 );
658}
659
660void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000661 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000662 "pcmpeqb %%xmm4,%%xmm4 \n"
663 "psllw $0xc,%%xmm4 \n"
664 "movdqa %%xmm4,%%xmm3 \n"
665 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000666 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000667 "1: \n"
668 "movdqa (%0),%%xmm0 \n"
669 "movdqa %%xmm0,%%xmm1 \n"
670 "pand %%xmm3,%%xmm0 \n"
671 "pand %%xmm4,%%xmm1 \n"
672 "psrlq $0x4,%%xmm0 \n"
673 "psrlq $0x8,%%xmm1 \n"
674 "por %%xmm1,%%xmm0 \n"
675 "packuswb %%xmm0,%%xmm0 \n"
676 "lea 0x10(%0),%0 \n"
677 "movq %%xmm0,(%1) \n"
678 "lea 0x8(%1),%1 \n"
679 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000680 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000681 : "+r"(src), // %0
682 "+r"(dst), // %1
683 "+r"(pix) // %2
684 :
685 : "memory", "cc"
686#if defined(__SSE2__)
687 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
688#endif
689 );
690}
691
fbarchard@google.comb6149762011-11-07 21:58:52 +0000692void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000693 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000694 "movdqa %4,%%xmm5 \n"
695 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000696 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000697 "1: \n"
698 "movdqa (%0),%%xmm0 \n"
699 "movdqa 0x10(%0),%%xmm1 \n"
700 "movdqa 0x20(%0),%%xmm2 \n"
701 "movdqa 0x30(%0),%%xmm3 \n"
702 "pmaddubsw %%xmm4,%%xmm0 \n"
703 "pmaddubsw %%xmm4,%%xmm1 \n"
704 "pmaddubsw %%xmm4,%%xmm2 \n"
705 "pmaddubsw %%xmm4,%%xmm3 \n"
706 "lea 0x40(%0),%0 \n"
707 "phaddw %%xmm1,%%xmm0 \n"
708 "phaddw %%xmm3,%%xmm2 \n"
709 "psrlw $0x7,%%xmm0 \n"
710 "psrlw $0x7,%%xmm2 \n"
711 "packuswb %%xmm2,%%xmm0 \n"
712 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000713 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000714 "movdqa %%xmm0,(%1) \n"
715 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000716 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000717 : "+r"(src_argb), // %0
718 "+r"(dst_y), // %1
719 "+r"(pix) // %2
720 : "m"(kARGBToY), // %3
721 "m"(kAddY16) // %4
722 : "memory", "cc"
723#if defined(__SSE2__)
724 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
725#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000726 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000727}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000728
729void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000730 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000731 "movdqa %4,%%xmm5 \n"
732 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000733 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000734 "1: \n"
735 "movdqu (%0),%%xmm0 \n"
736 "movdqu 0x10(%0),%%xmm1 \n"
737 "movdqu 0x20(%0),%%xmm2 \n"
738 "movdqu 0x30(%0),%%xmm3 \n"
739 "pmaddubsw %%xmm4,%%xmm0 \n"
740 "pmaddubsw %%xmm4,%%xmm1 \n"
741 "pmaddubsw %%xmm4,%%xmm2 \n"
742 "pmaddubsw %%xmm4,%%xmm3 \n"
743 "lea 0x40(%0),%0 \n"
744 "phaddw %%xmm1,%%xmm0 \n"
745 "phaddw %%xmm3,%%xmm2 \n"
746 "psrlw $0x7,%%xmm0 \n"
747 "psrlw $0x7,%%xmm2 \n"
748 "packuswb %%xmm2,%%xmm0 \n"
749 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000750 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000751 "movdqu %%xmm0,(%1) \n"
752 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000753 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000754 : "+r"(src_argb), // %0
755 "+r"(dst_y), // %1
756 "+r"(pix) // %2
757 : "m"(kARGBToY), // %3
758 "m"(kAddY16) // %4
759 : "memory", "cc"
760#if defined(__SSE2__)
761 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
762#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000763 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000764}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000765
fbarchard@google.com714050a2012-02-17 22:59:56 +0000766// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000767// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
768// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
769// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000770// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000771void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
772 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000773 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000774 "movdqa %0,%%xmm4 \n"
775 "movdqa %1,%%xmm3 \n"
776 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000777 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000778 : "m"(kARGBToU), // %0
779 "m"(kARGBToV), // %1
780 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000781 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000782 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000783 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000784 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000785 "1: \n"
786 "movdqa (%0),%%xmm0 \n"
787 "movdqa 0x10(%0),%%xmm1 \n"
788 "movdqa 0x20(%0),%%xmm2 \n"
789 "movdqa 0x30(%0),%%xmm6 \n"
790 "pavgb (%0,%4,1),%%xmm0 \n"
791 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
792 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
793 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
794 "lea 0x40(%0),%0 \n"
795 "movdqa %%xmm0,%%xmm7 \n"
796 "shufps $0x88,%%xmm1,%%xmm0 \n"
797 "shufps $0xdd,%%xmm1,%%xmm7 \n"
798 "pavgb %%xmm7,%%xmm0 \n"
799 "movdqa %%xmm2,%%xmm7 \n"
800 "shufps $0x88,%%xmm6,%%xmm2 \n"
801 "shufps $0xdd,%%xmm6,%%xmm7 \n"
802 "pavgb %%xmm7,%%xmm2 \n"
803 "movdqa %%xmm0,%%xmm1 \n"
804 "movdqa %%xmm2,%%xmm6 \n"
805 "pmaddubsw %%xmm4,%%xmm0 \n"
806 "pmaddubsw %%xmm4,%%xmm2 \n"
807 "pmaddubsw %%xmm3,%%xmm1 \n"
808 "pmaddubsw %%xmm3,%%xmm6 \n"
809 "phaddw %%xmm2,%%xmm0 \n"
810 "phaddw %%xmm6,%%xmm1 \n"
811 "psraw $0x8,%%xmm0 \n"
812 "psraw $0x8,%%xmm1 \n"
813 "packsswb %%xmm1,%%xmm0 \n"
814 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000815 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000816 "movlps %%xmm0,(%1) \n"
817 "movhps %%xmm0,(%1,%2,1) \n"
818 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000819 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000820 : "+r"(src_argb0), // %0
821 "+r"(dst_u), // %1
822 "+r"(dst_v), // %2
823 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000824 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000825 : "memory", "cc"
826#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000827 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000828#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000829 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000830}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000831
832void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
833 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000834 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000835 "movdqa %0,%%xmm4 \n"
836 "movdqa %1,%%xmm3 \n"
837 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000838 :
839 : "m"(kARGBToU), // %0
840 "m"(kARGBToV), // %1
841 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000842 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000843 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000844 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000845 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000846 "1: \n"
847 "movdqu (%0),%%xmm0 \n"
848 "movdqu 0x10(%0),%%xmm1 \n"
849 "movdqu 0x20(%0),%%xmm2 \n"
850 "movdqu 0x30(%0),%%xmm6 \n"
851 "movdqu (%0,%4,1),%%xmm7 \n"
852 "pavgb %%xmm7,%%xmm0 \n"
853 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
854 "pavgb %%xmm7,%%xmm1 \n"
855 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
856 "pavgb %%xmm7,%%xmm2 \n"
857 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
858 "pavgb %%xmm7,%%xmm6 \n"
859 "lea 0x40(%0),%0 \n"
860 "movdqa %%xmm0,%%xmm7 \n"
861 "shufps $0x88,%%xmm1,%%xmm0 \n"
862 "shufps $0xdd,%%xmm1,%%xmm7 \n"
863 "pavgb %%xmm7,%%xmm0 \n"
864 "movdqa %%xmm2,%%xmm7 \n"
865 "shufps $0x88,%%xmm6,%%xmm2 \n"
866 "shufps $0xdd,%%xmm6,%%xmm7 \n"
867 "pavgb %%xmm7,%%xmm2 \n"
868 "movdqa %%xmm0,%%xmm1 \n"
869 "movdqa %%xmm2,%%xmm6 \n"
870 "pmaddubsw %%xmm4,%%xmm0 \n"
871 "pmaddubsw %%xmm4,%%xmm2 \n"
872 "pmaddubsw %%xmm3,%%xmm1 \n"
873 "pmaddubsw %%xmm3,%%xmm6 \n"
874 "phaddw %%xmm2,%%xmm0 \n"
875 "phaddw %%xmm6,%%xmm1 \n"
876 "psraw $0x8,%%xmm0 \n"
877 "psraw $0x8,%%xmm1 \n"
878 "packsswb %%xmm1,%%xmm0 \n"
879 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000880 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000881 "movlps %%xmm0,(%1) \n"
882 "movhps %%xmm0,(%1,%2,1) \n"
883 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000884 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000885 : "+r"(src_argb0), // %0
886 "+r"(dst_u), // %1
887 "+r"(dst_v), // %2
888 "+rm"(width) // %3
889 : "r"(static_cast<intptr_t>(src_stride_argb))
890 : "memory", "cc"
891#if defined(__SSE2__)
892 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
893#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000894 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000895}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000896
fbarchard@google.com714050a2012-02-17 22:59:56 +0000897void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000898 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000899 "movdqa %4,%%xmm5 \n"
900 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000901 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000902 "1: \n"
903 "movdqa (%0),%%xmm0 \n"
904 "movdqa 0x10(%0),%%xmm1 \n"
905 "movdqa 0x20(%0),%%xmm2 \n"
906 "movdqa 0x30(%0),%%xmm3 \n"
907 "pmaddubsw %%xmm4,%%xmm0 \n"
908 "pmaddubsw %%xmm4,%%xmm1 \n"
909 "pmaddubsw %%xmm4,%%xmm2 \n"
910 "pmaddubsw %%xmm4,%%xmm3 \n"
911 "lea 0x40(%0),%0 \n"
912 "phaddw %%xmm1,%%xmm0 \n"
913 "phaddw %%xmm3,%%xmm2 \n"
914 "psrlw $0x7,%%xmm0 \n"
915 "psrlw $0x7,%%xmm2 \n"
916 "packuswb %%xmm2,%%xmm0 \n"
917 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000918 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000919 "movdqa %%xmm0,(%1) \n"
920 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000921 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000922 : "+r"(src_bgra), // %0
923 "+r"(dst_y), // %1
924 "+r"(pix) // %2
925 : "m"(kBGRAToY), // %3
926 "m"(kAddY16) // %4
927 : "memory", "cc"
928#if defined(__SSE2__)
929 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000930#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000931 );
932}
933
934void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000935 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000936 "movdqa %4,%%xmm5 \n"
937 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000938 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000939 "1: \n"
940 "movdqu (%0),%%xmm0 \n"
941 "movdqu 0x10(%0),%%xmm1 \n"
942 "movdqu 0x20(%0),%%xmm2 \n"
943 "movdqu 0x30(%0),%%xmm3 \n"
944 "pmaddubsw %%xmm4,%%xmm0 \n"
945 "pmaddubsw %%xmm4,%%xmm1 \n"
946 "pmaddubsw %%xmm4,%%xmm2 \n"
947 "pmaddubsw %%xmm4,%%xmm3 \n"
948 "lea 0x40(%0),%0 \n"
949 "phaddw %%xmm1,%%xmm0 \n"
950 "phaddw %%xmm3,%%xmm2 \n"
951 "psrlw $0x7,%%xmm0 \n"
952 "psrlw $0x7,%%xmm2 \n"
953 "packuswb %%xmm2,%%xmm0 \n"
954 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000955 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000956 "movdqu %%xmm0,(%1) \n"
957 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000958 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000959 : "+r"(src_bgra), // %0
960 "+r"(dst_y), // %1
961 "+r"(pix) // %2
962 : "m"(kBGRAToY), // %3
963 "m"(kAddY16) // %4
964 : "memory", "cc"
965#if defined(__SSE2__)
966 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
967#endif
968 );
969}
970
971void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
972 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000973 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000974 "movdqa %0,%%xmm4 \n"
975 "movdqa %1,%%xmm3 \n"
976 "movdqa %2,%%xmm5 \n"
977 :
978 : "m"(kBGRAToU), // %0
979 "m"(kBGRAToV), // %1
980 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000981 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000982 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000983 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000984 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000985 "1: \n"
986 "movdqa (%0),%%xmm0 \n"
987 "movdqa 0x10(%0),%%xmm1 \n"
988 "movdqa 0x20(%0),%%xmm2 \n"
989 "movdqa 0x30(%0),%%xmm6 \n"
990 "pavgb (%0,%4,1),%%xmm0 \n"
991 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
992 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
993 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
994 "lea 0x40(%0),%0 \n"
995 "movdqa %%xmm0,%%xmm7 \n"
996 "shufps $0x88,%%xmm1,%%xmm0 \n"
997 "shufps $0xdd,%%xmm1,%%xmm7 \n"
998 "pavgb %%xmm7,%%xmm0 \n"
999 "movdqa %%xmm2,%%xmm7 \n"
1000 "shufps $0x88,%%xmm6,%%xmm2 \n"
1001 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1002 "pavgb %%xmm7,%%xmm2 \n"
1003 "movdqa %%xmm0,%%xmm1 \n"
1004 "movdqa %%xmm2,%%xmm6 \n"
1005 "pmaddubsw %%xmm4,%%xmm0 \n"
1006 "pmaddubsw %%xmm4,%%xmm2 \n"
1007 "pmaddubsw %%xmm3,%%xmm1 \n"
1008 "pmaddubsw %%xmm3,%%xmm6 \n"
1009 "phaddw %%xmm2,%%xmm0 \n"
1010 "phaddw %%xmm6,%%xmm1 \n"
1011 "psraw $0x8,%%xmm0 \n"
1012 "psraw $0x8,%%xmm1 \n"
1013 "packsswb %%xmm1,%%xmm0 \n"
1014 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001015 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001016 "movlps %%xmm0,(%1) \n"
1017 "movhps %%xmm0,(%1,%2,1) \n"
1018 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001019 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001020 : "+r"(src_bgra0), // %0
1021 "+r"(dst_u), // %1
1022 "+r"(dst_v), // %2
1023 "+rm"(width) // %3
1024 : "r"(static_cast<intptr_t>(src_stride_bgra))
1025 : "memory", "cc"
1026#if defined(__SSE2__)
1027 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1028#endif
1029 );
1030}
1031
1032void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1033 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001034 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001035 "movdqa %0,%%xmm4 \n"
1036 "movdqa %1,%%xmm3 \n"
1037 "movdqa %2,%%xmm5 \n"
1038 :
1039 : "m"(kBGRAToU), // %0
1040 "m"(kBGRAToV), // %1
1041 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001042 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001043 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001044 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001045 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001046 "1: \n"
1047 "movdqu (%0),%%xmm0 \n"
1048 "movdqu 0x10(%0),%%xmm1 \n"
1049 "movdqu 0x20(%0),%%xmm2 \n"
1050 "movdqu 0x30(%0),%%xmm6 \n"
1051 "movdqu (%0,%4,1),%%xmm7 \n"
1052 "pavgb %%xmm7,%%xmm0 \n"
1053 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1054 "pavgb %%xmm7,%%xmm1 \n"
1055 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1056 "pavgb %%xmm7,%%xmm2 \n"
1057 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1058 "pavgb %%xmm7,%%xmm6 \n"
1059 "lea 0x40(%0),%0 \n"
1060 "movdqa %%xmm0,%%xmm7 \n"
1061 "shufps $0x88,%%xmm1,%%xmm0 \n"
1062 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1063 "pavgb %%xmm7,%%xmm0 \n"
1064 "movdqa %%xmm2,%%xmm7 \n"
1065 "shufps $0x88,%%xmm6,%%xmm2 \n"
1066 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1067 "pavgb %%xmm7,%%xmm2 \n"
1068 "movdqa %%xmm0,%%xmm1 \n"
1069 "movdqa %%xmm2,%%xmm6 \n"
1070 "pmaddubsw %%xmm4,%%xmm0 \n"
1071 "pmaddubsw %%xmm4,%%xmm2 \n"
1072 "pmaddubsw %%xmm3,%%xmm1 \n"
1073 "pmaddubsw %%xmm3,%%xmm6 \n"
1074 "phaddw %%xmm2,%%xmm0 \n"
1075 "phaddw %%xmm6,%%xmm1 \n"
1076 "psraw $0x8,%%xmm0 \n"
1077 "psraw $0x8,%%xmm1 \n"
1078 "packsswb %%xmm1,%%xmm0 \n"
1079 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001080 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001081 "movlps %%xmm0,(%1) \n"
1082 "movhps %%xmm0,(%1,%2,1) \n"
1083 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001084 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001085 : "+r"(src_bgra0), // %0
1086 "+r"(dst_u), // %1
1087 "+r"(dst_v), // %2
1088 "+rm"(width) // %3
1089 : "r"(static_cast<intptr_t>(src_stride_bgra))
1090 : "memory", "cc"
1091#if defined(__SSE2__)
1092 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1093#endif
1094 );
1095}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001096
1097void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001098 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001099 "movdqa %4,%%xmm5 \n"
1100 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001101 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001102 "1: \n"
1103 "movdqa (%0),%%xmm0 \n"
1104 "movdqa 0x10(%0),%%xmm1 \n"
1105 "movdqa 0x20(%0),%%xmm2 \n"
1106 "movdqa 0x30(%0),%%xmm3 \n"
1107 "pmaddubsw %%xmm4,%%xmm0 \n"
1108 "pmaddubsw %%xmm4,%%xmm1 \n"
1109 "pmaddubsw %%xmm4,%%xmm2 \n"
1110 "pmaddubsw %%xmm4,%%xmm3 \n"
1111 "lea 0x40(%0),%0 \n"
1112 "phaddw %%xmm1,%%xmm0 \n"
1113 "phaddw %%xmm3,%%xmm2 \n"
1114 "psrlw $0x7,%%xmm0 \n"
1115 "psrlw $0x7,%%xmm2 \n"
1116 "packuswb %%xmm2,%%xmm0 \n"
1117 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001118 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001119 "movdqa %%xmm0,(%1) \n"
1120 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001121 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001122 : "+r"(src_abgr), // %0
1123 "+r"(dst_y), // %1
1124 "+r"(pix) // %2
1125 : "m"(kABGRToY), // %3
1126 "m"(kAddY16) // %4
1127 : "memory", "cc"
1128#if defined(__SSE2__)
1129 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1130#endif
1131 );
1132}
1133
1134void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001135 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001136 "movdqa %4,%%xmm5 \n"
1137 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001138 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001139 "1: \n"
1140 "movdqu (%0),%%xmm0 \n"
1141 "movdqu 0x10(%0),%%xmm1 \n"
1142 "movdqu 0x20(%0),%%xmm2 \n"
1143 "movdqu 0x30(%0),%%xmm3 \n"
1144 "pmaddubsw %%xmm4,%%xmm0 \n"
1145 "pmaddubsw %%xmm4,%%xmm1 \n"
1146 "pmaddubsw %%xmm4,%%xmm2 \n"
1147 "pmaddubsw %%xmm4,%%xmm3 \n"
1148 "lea 0x40(%0),%0 \n"
1149 "phaddw %%xmm1,%%xmm0 \n"
1150 "phaddw %%xmm3,%%xmm2 \n"
1151 "psrlw $0x7,%%xmm0 \n"
1152 "psrlw $0x7,%%xmm2 \n"
1153 "packuswb %%xmm2,%%xmm0 \n"
1154 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001155 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001156 "movdqu %%xmm0,(%1) \n"
1157 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001158 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001159 : "+r"(src_abgr), // %0
1160 "+r"(dst_y), // %1
1161 "+r"(pix) // %2
1162 : "m"(kABGRToY), // %3
1163 "m"(kAddY16) // %4
1164 : "memory", "cc"
1165#if defined(__SSE2__)
1166 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1167#endif
1168 );
1169}
1170
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001171void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1172 asm volatile (
1173 "movdqa %4,%%xmm5 \n"
1174 "movdqa %3,%%xmm4 \n"
1175 ".p2align 4 \n"
1176 "1: \n"
1177 "movdqa (%0),%%xmm0 \n"
1178 "movdqa 0x10(%0),%%xmm1 \n"
1179 "movdqa 0x20(%0),%%xmm2 \n"
1180 "movdqa 0x30(%0),%%xmm3 \n"
1181 "pmaddubsw %%xmm4,%%xmm0 \n"
1182 "pmaddubsw %%xmm4,%%xmm1 \n"
1183 "pmaddubsw %%xmm4,%%xmm2 \n"
1184 "pmaddubsw %%xmm4,%%xmm3 \n"
1185 "lea 0x40(%0),%0 \n"
1186 "phaddw %%xmm1,%%xmm0 \n"
1187 "phaddw %%xmm3,%%xmm2 \n"
1188 "psrlw $0x7,%%xmm0 \n"
1189 "psrlw $0x7,%%xmm2 \n"
1190 "packuswb %%xmm2,%%xmm0 \n"
1191 "paddb %%xmm5,%%xmm0 \n"
1192 "sub $0x10,%2 \n"
1193 "movdqa %%xmm0,(%1) \n"
1194 "lea 0x10(%1),%1 \n"
1195 "jg 1b \n"
1196 : "+r"(src_rgba), // %0
1197 "+r"(dst_y), // %1
1198 "+r"(pix) // %2
1199 : "m"(kRGBAToY), // %3
1200 "m"(kAddY16) // %4
1201 : "memory", "cc"
1202#if defined(__SSE2__)
1203 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1204#endif
1205 );
1206}
1207
1208void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1209 asm volatile (
1210 "movdqa %4,%%xmm5 \n"
1211 "movdqa %3,%%xmm4 \n"
1212 ".p2align 4 \n"
1213 "1: \n"
1214 "movdqu (%0),%%xmm0 \n"
1215 "movdqu 0x10(%0),%%xmm1 \n"
1216 "movdqu 0x20(%0),%%xmm2 \n"
1217 "movdqu 0x30(%0),%%xmm3 \n"
1218 "pmaddubsw %%xmm4,%%xmm0 \n"
1219 "pmaddubsw %%xmm4,%%xmm1 \n"
1220 "pmaddubsw %%xmm4,%%xmm2 \n"
1221 "pmaddubsw %%xmm4,%%xmm3 \n"
1222 "lea 0x40(%0),%0 \n"
1223 "phaddw %%xmm1,%%xmm0 \n"
1224 "phaddw %%xmm3,%%xmm2 \n"
1225 "psrlw $0x7,%%xmm0 \n"
1226 "psrlw $0x7,%%xmm2 \n"
1227 "packuswb %%xmm2,%%xmm0 \n"
1228 "paddb %%xmm5,%%xmm0 \n"
1229 "sub $0x10,%2 \n"
1230 "movdqu %%xmm0,(%1) \n"
1231 "lea 0x10(%1),%1 \n"
1232 "jg 1b \n"
1233 : "+r"(src_rgba), // %0
1234 "+r"(dst_y), // %1
1235 "+r"(pix) // %2
1236 : "m"(kRGBAToY), // %3
1237 "m"(kAddY16) // %4
1238 : "memory", "cc"
1239#if defined(__SSE2__)
1240 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1241#endif
1242 );
1243}
1244
fbarchard@google.com714050a2012-02-17 22:59:56 +00001245void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1246 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001247 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001248 "movdqa %0,%%xmm4 \n"
1249 "movdqa %1,%%xmm3 \n"
1250 "movdqa %2,%%xmm5 \n"
1251 :
1252 : "m"(kABGRToU), // %0
1253 "m"(kABGRToV), // %1
1254 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001255 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001256 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001257 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001258 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001259 "1: \n"
1260 "movdqa (%0),%%xmm0 \n"
1261 "movdqa 0x10(%0),%%xmm1 \n"
1262 "movdqa 0x20(%0),%%xmm2 \n"
1263 "movdqa 0x30(%0),%%xmm6 \n"
1264 "pavgb (%0,%4,1),%%xmm0 \n"
1265 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1266 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1267 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1268 "lea 0x40(%0),%0 \n"
1269 "movdqa %%xmm0,%%xmm7 \n"
1270 "shufps $0x88,%%xmm1,%%xmm0 \n"
1271 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1272 "pavgb %%xmm7,%%xmm0 \n"
1273 "movdqa %%xmm2,%%xmm7 \n"
1274 "shufps $0x88,%%xmm6,%%xmm2 \n"
1275 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1276 "pavgb %%xmm7,%%xmm2 \n"
1277 "movdqa %%xmm0,%%xmm1 \n"
1278 "movdqa %%xmm2,%%xmm6 \n"
1279 "pmaddubsw %%xmm4,%%xmm0 \n"
1280 "pmaddubsw %%xmm4,%%xmm2 \n"
1281 "pmaddubsw %%xmm3,%%xmm1 \n"
1282 "pmaddubsw %%xmm3,%%xmm6 \n"
1283 "phaddw %%xmm2,%%xmm0 \n"
1284 "phaddw %%xmm6,%%xmm1 \n"
1285 "psraw $0x8,%%xmm0 \n"
1286 "psraw $0x8,%%xmm1 \n"
1287 "packsswb %%xmm1,%%xmm0 \n"
1288 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001289 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001290 "movlps %%xmm0,(%1) \n"
1291 "movhps %%xmm0,(%1,%2,1) \n"
1292 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001293 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001294 : "+r"(src_abgr0), // %0
1295 "+r"(dst_u), // %1
1296 "+r"(dst_v), // %2
1297 "+rm"(width) // %3
1298 : "r"(static_cast<intptr_t>(src_stride_abgr))
1299 : "memory", "cc"
1300#if defined(__SSE2__)
1301 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1302#endif
1303 );
1304}
1305
1306void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1307 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001308 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001309 "movdqa %0,%%xmm4 \n"
1310 "movdqa %1,%%xmm3 \n"
1311 "movdqa %2,%%xmm5 \n"
1312 :
1313 : "m"(kABGRToU), // %0
1314 "m"(kABGRToV), // %1
1315 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001316 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001317 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001318 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001319 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001320 "1: \n"
1321 "movdqu (%0),%%xmm0 \n"
1322 "movdqu 0x10(%0),%%xmm1 \n"
1323 "movdqu 0x20(%0),%%xmm2 \n"
1324 "movdqu 0x30(%0),%%xmm6 \n"
1325 "movdqu (%0,%4,1),%%xmm7 \n"
1326 "pavgb %%xmm7,%%xmm0 \n"
1327 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1328 "pavgb %%xmm7,%%xmm1 \n"
1329 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1330 "pavgb %%xmm7,%%xmm2 \n"
1331 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1332 "pavgb %%xmm7,%%xmm6 \n"
1333 "lea 0x40(%0),%0 \n"
1334 "movdqa %%xmm0,%%xmm7 \n"
1335 "shufps $0x88,%%xmm1,%%xmm0 \n"
1336 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1337 "pavgb %%xmm7,%%xmm0 \n"
1338 "movdqa %%xmm2,%%xmm7 \n"
1339 "shufps $0x88,%%xmm6,%%xmm2 \n"
1340 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1341 "pavgb %%xmm7,%%xmm2 \n"
1342 "movdqa %%xmm0,%%xmm1 \n"
1343 "movdqa %%xmm2,%%xmm6 \n"
1344 "pmaddubsw %%xmm4,%%xmm0 \n"
1345 "pmaddubsw %%xmm4,%%xmm2 \n"
1346 "pmaddubsw %%xmm3,%%xmm1 \n"
1347 "pmaddubsw %%xmm3,%%xmm6 \n"
1348 "phaddw %%xmm2,%%xmm0 \n"
1349 "phaddw %%xmm6,%%xmm1 \n"
1350 "psraw $0x8,%%xmm0 \n"
1351 "psraw $0x8,%%xmm1 \n"
1352 "packsswb %%xmm1,%%xmm0 \n"
1353 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001354 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001355 "movlps %%xmm0,(%1) \n"
1356 "movhps %%xmm0,(%1,%2,1) \n"
1357 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001358 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001359 : "+r"(src_abgr0), // %0
1360 "+r"(dst_u), // %1
1361 "+r"(dst_v), // %2
1362 "+rm"(width) // %3
1363 : "r"(static_cast<intptr_t>(src_stride_abgr))
1364 : "memory", "cc"
1365#if defined(__SSE2__)
1366 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1367#endif
1368 );
1369}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001370
1371void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1372 uint8* dst_u, uint8* dst_v, int width) {
1373 asm volatile (
1374 "movdqa %0,%%xmm4 \n"
1375 "movdqa %1,%%xmm3 \n"
1376 "movdqa %2,%%xmm5 \n"
1377 :
1378 : "m"(kRGBAToU), // %0
1379 "m"(kRGBAToV), // %1
1380 "m"(kAddUV128) // %2
1381 );
1382 asm volatile (
1383 "sub %1,%2 \n"
1384 ".p2align 4 \n"
1385 "1: \n"
1386 "movdqa (%0),%%xmm0 \n"
1387 "movdqa 0x10(%0),%%xmm1 \n"
1388 "movdqa 0x20(%0),%%xmm2 \n"
1389 "movdqa 0x30(%0),%%xmm6 \n"
1390 "pavgb (%0,%4,1),%%xmm0 \n"
1391 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1392 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1393 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1394 "lea 0x40(%0),%0 \n"
1395 "movdqa %%xmm0,%%xmm7 \n"
1396 "shufps $0x88,%%xmm1,%%xmm0 \n"
1397 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1398 "pavgb %%xmm7,%%xmm0 \n"
1399 "movdqa %%xmm2,%%xmm7 \n"
1400 "shufps $0x88,%%xmm6,%%xmm2 \n"
1401 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1402 "pavgb %%xmm7,%%xmm2 \n"
1403 "movdqa %%xmm0,%%xmm1 \n"
1404 "movdqa %%xmm2,%%xmm6 \n"
1405 "pmaddubsw %%xmm4,%%xmm0 \n"
1406 "pmaddubsw %%xmm4,%%xmm2 \n"
1407 "pmaddubsw %%xmm3,%%xmm1 \n"
1408 "pmaddubsw %%xmm3,%%xmm6 \n"
1409 "phaddw %%xmm2,%%xmm0 \n"
1410 "phaddw %%xmm6,%%xmm1 \n"
1411 "psraw $0x8,%%xmm0 \n"
1412 "psraw $0x8,%%xmm1 \n"
1413 "packsswb %%xmm1,%%xmm0 \n"
1414 "paddb %%xmm5,%%xmm0 \n"
1415 "sub $0x10,%3 \n"
1416 "movlps %%xmm0,(%1) \n"
1417 "movhps %%xmm0,(%1,%2,1) \n"
1418 "lea 0x8(%1),%1 \n"
1419 "jg 1b \n"
1420 : "+r"(src_rgba0), // %0
1421 "+r"(dst_u), // %1
1422 "+r"(dst_v), // %2
1423 "+rm"(width) // %3
1424 : "r"(static_cast<intptr_t>(src_stride_rgba))
1425 : "memory", "cc"
1426#if defined(__SSE2__)
1427 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1428#endif
1429 );
1430}
1431
1432void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1433 uint8* dst_u, uint8* dst_v, int width) {
1434 asm volatile (
1435 "movdqa %0,%%xmm4 \n"
1436 "movdqa %1,%%xmm3 \n"
1437 "movdqa %2,%%xmm5 \n"
1438 :
1439 : "m"(kRGBAToU), // %0
1440 "m"(kRGBAToV), // %1
1441 "m"(kAddUV128) // %2
1442 );
1443 asm volatile (
1444 "sub %1,%2 \n"
1445 ".p2align 4 \n"
1446 "1: \n"
1447 "movdqu (%0),%%xmm0 \n"
1448 "movdqu 0x10(%0),%%xmm1 \n"
1449 "movdqu 0x20(%0),%%xmm2 \n"
1450 "movdqu 0x30(%0),%%xmm6 \n"
1451 "movdqu (%0,%4,1),%%xmm7 \n"
1452 "pavgb %%xmm7,%%xmm0 \n"
1453 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1454 "pavgb %%xmm7,%%xmm1 \n"
1455 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1456 "pavgb %%xmm7,%%xmm2 \n"
1457 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1458 "pavgb %%xmm7,%%xmm6 \n"
1459 "lea 0x40(%0),%0 \n"
1460 "movdqa %%xmm0,%%xmm7 \n"
1461 "shufps $0x88,%%xmm1,%%xmm0 \n"
1462 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1463 "pavgb %%xmm7,%%xmm0 \n"
1464 "movdqa %%xmm2,%%xmm7 \n"
1465 "shufps $0x88,%%xmm6,%%xmm2 \n"
1466 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1467 "pavgb %%xmm7,%%xmm2 \n"
1468 "movdqa %%xmm0,%%xmm1 \n"
1469 "movdqa %%xmm2,%%xmm6 \n"
1470 "pmaddubsw %%xmm4,%%xmm0 \n"
1471 "pmaddubsw %%xmm4,%%xmm2 \n"
1472 "pmaddubsw %%xmm3,%%xmm1 \n"
1473 "pmaddubsw %%xmm3,%%xmm6 \n"
1474 "phaddw %%xmm2,%%xmm0 \n"
1475 "phaddw %%xmm6,%%xmm1 \n"
1476 "psraw $0x8,%%xmm0 \n"
1477 "psraw $0x8,%%xmm1 \n"
1478 "packsswb %%xmm1,%%xmm0 \n"
1479 "paddb %%xmm5,%%xmm0 \n"
1480 "sub $0x10,%3 \n"
1481 "movlps %%xmm0,(%1) \n"
1482 "movhps %%xmm0,(%1,%2,1) \n"
1483 "lea 0x8(%1),%1 \n"
1484 "jg 1b \n"
1485 : "+r"(src_rgba0), // %0
1486 "+r"(dst_u), // %1
1487 "+r"(dst_v), // %2
1488 "+rm"(width) // %3
1489 : "r"(static_cast<intptr_t>(src_stride_rgba))
1490 : "memory", "cc"
1491#if defined(__SSE2__)
1492 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1493#endif
1494 );
1495}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001496#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001497
fbarchard@google.come214fe32012-06-04 23:47:11 +00001498#ifdef HAS_I422TOARGBROW_SSSE3
// Fixed-point (scaled by 64) U and V contributions to B, G and R.  Every
// expansion is parenthesized so the macros stay correct inside larger
// expressions such as `2 * BG` or `-BG`.
// 2.018 * 64 = 129 does not fit in int8, so UB is clamped to 127.
#define UB 127 /* min(127, static_cast<int>(2.018 * 64)) */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V coefficients of each channel, each multiplied by 128.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001513
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001514struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001515 vec8 kUVToB; // 0
1516 vec8 kUVToG; // 16
1517 vec8 kUVToR; // 32
1518 vec16 kUVBiasB; // 48
1519 vec16 kUVBiasG; // 64
1520 vec16 kUVBiasR; // 80
1521 vec16 kYSub16; // 96
1522 vec16 kYToRgb; // 112
1523 vec8 kVUToB; // 128
1524 vec8 kVUToG; // 144
1525 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001526} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001527 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1528 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1529 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1530 { BB, BB, BB, BB, BB, BB, BB, BB },
1531 { BG, BG, BG, BG, BG, BG, BG, BG },
1532 { BR, BR, BR, BR, BR, BR, BR, BR },
1533 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001534 { YG, YG, YG, YG, YG, YG, YG, YG },
1535 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1536 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1537 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001538};
1539
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001540
// Read 8 UV from 444.
// Loads 8 U bytes and 8 V bytes and interleaves them into xmm0 as U,V pairs.
// Expects %[v_buf] to hold the V pointer as an offset from %[u_buf].
// Advances %[u_buf] by 8 and clobbers xmm1.
#define READYUV444 \
    "movq      (%[u_buf]),%%xmm0 \n" \
    "movq      (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea       0x8(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 U bytes and 4 V bytes, interleaves them into U,V pairs and
// duplicates every pair, leaving 8 pairs in xmm0.
// Expects %[v_buf] to hold the V pointer as an offset from %[u_buf].
// Advances %[u_buf] by 4 and clobbers xmm1.
#define READYUV422 \
    "movd      (%[u_buf]),%%xmm0 \n" \
    "movd      (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea       0x4(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Read 2 UV from 411, upsample to 8 UV.
// Loads exactly 2 U bytes and 2 V bytes with 16-bit pinsrw loads (a 32-bit
// movd would read 2 bytes beyond the samples consumed and could run past the
// end of the plane).  The untouched upper bytes of xmm0/xmm1 are discarded by
// the unpacks, which keep only the two U,V pairs and repeat each four times.
// Expects %[v_buf] to hold the V pointer as an offset from %[u_buf].
// Advances %[u_buf] by 2 and clobbers xmm1.
#define READYUV411 \
    "pinsrw    $0x0,(%[u_buf]),%%xmm0 \n" \
    "pinsrw    $0x0,(%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea       0x2(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n"

// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 bytes (4 two-byte pairs) from %[uv_buf] and duplicates every pair,
// leaving 8 pairs in xmm0.  Advances %[uv_buf] by 8.
#define READNV12 \
    "movq      (%[uv_buf]),%%xmm0 \n" \
    "lea       0x8(%[uv_buf]),%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Convert 8 pixels: 8 UV and 8 Y.
// On entry: xmm0 = 8 interleaved U,V byte pairs, xmm4 = zero (used to widen
// the Y bytes), %[y_buf] = Y pointer, %[kYuvConstants] = constant table.
// On exit: xmm0, xmm1 and xmm2 hold the 8 packed bytes computed with the
// kUVToB, kUVToG and kUVToR coefficients respectively; packuswb writes the
// same 8 bytes to both halves of each register.
// Clobbers xmm3 and advances %[y_buf] by 8.
// Displacements into the table: 0/16/32 = UV coefficients, 48/64/80 = biases,
// 96 = Y offset, 112 = Y scale.
#define YUVTORGB \
    "movdqa    %%xmm0,%%xmm1 \n" \
    "movdqa    %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
    "psubw     48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw     64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw     80(%[kYuvConstants]),%%xmm2 \n" \
    "movq      (%[y_buf]),%%xmm3 \n" \
    "lea       0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw    96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw    112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw    %%xmm3,%%xmm0 \n" \
    "paddsw    %%xmm3,%%xmm1 \n" \
    "paddsw    %%xmm3,%%xmm2 \n" \
    "psraw     $0x6,%%xmm0 \n" \
    "psraw     $0x6,%%xmm1 \n" \
    "psraw     $0x6,%%xmm2 \n" \
    "packuswb  %%xmm0,%%xmm0 \n" \
    "packuswb  %%xmm1,%%xmm1 \n" \
    "packuswb  %%xmm2,%%xmm2 \n"

// Convert 8 pixels: 8 VU and 8 Y.
// On entry: xmm0 = 8 interleaved V,U byte pairs, xmm4 = zero (used to widen
// the Y bytes), %[y_buf] = Y pointer, %[kYuvConstants] = constant table.
// On exit: xmm0, xmm1 and xmm2 hold the 8 packed bytes computed with the
// kVUToB, kVUToG and kVUToR coefficients respectively; packuswb writes the
// same 8 bytes to both halves of each register.
// Clobbers xmm3 and advances %[y_buf] by 8.
// Displacements into the table: 128/144/160 = VU coefficients,
// 48/64/80 = biases, 96 = Y offset, 112 = Y scale.
#define YVUTORGB \
    "movdqa    %%xmm0,%%xmm1 \n" \
    "movdqa    %%xmm0,%%xmm2 \n" \
    "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
    "psubw     48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw     64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw     80(%[kYuvConstants]),%%xmm2 \n" \
    "movq      (%[y_buf]),%%xmm3 \n" \
    "lea       0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw    96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw    112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw    %%xmm3,%%xmm0 \n" \
    "paddsw    %%xmm3,%%xmm1 \n" \
    "paddsw    %%xmm3,%%xmm2 \n" \
    "psraw     $0x6,%%xmm0 \n" \
    "psraw     $0x6,%%xmm1 \n" \
    "psraw     $0x6,%%xmm2 \n" \
    "packuswb  %%xmm0,%%xmm0 \n" \
    "packuswb  %%xmm1,%%xmm1 \n" \
    "packuswb  %%xmm2,%%xmm2 \n"

// Converts one row of planar YUV, fetched by READYUV444 and converted by
// YUVTORGB, into 4-byte pixels written to argb_buf: 8 pixels (32 bytes) per
// loop pass. Stores are movdqa, so argb_buf must be 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass,
// so pixels are always produced in whole groups of 8.
1621void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001622 const uint8* u_buf,
1623 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001624 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001625 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001626 asm volatile (
// v_buf becomes the distance from u_buf to v_buf (presumably so the read
// macro can reach both planes from one advancing pointer - confirm).
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001627 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones: supplies the 0xff fourth byte of every pixel.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001628 "pcmpeqb %%xmm5,%%xmm5 \n"
// xmm4 = zero (presumably the zero register the conversion macro unpacks
// against - confirm).
1629 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001630 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001631 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001632 READYUV444
1633 YUVTORGB
// Weave bytes from xmm0, xmm1, xmm2 and xmm5 into pixels laid out in memory
// as xmm0, xmm1, xmm2, 0xff.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001634 "punpcklbw %%xmm1,%%xmm0 \n"
1635 "punpcklbw %%xmm5,%%xmm2 \n"
1636 "movdqa %%xmm0,%%xmm1 \n"
1637 "punpcklwd %%xmm2,%%xmm0 \n"
1638 "punpckhwd %%xmm2,%%xmm1 \n"
// Two aligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001639 "movdqa %%xmm0,(%[argb_buf]) \n"
1640 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1641 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1642 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001643 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001644 : [y_buf]"+r"(y_buf), // %[y_buf]
1645 [u_buf]"+r"(u_buf), // %[u_buf]
1646 [v_buf]"+r"(v_buf), // %[v_buf]
1647 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1648 [width]"+rm"(width) // %[width]
1649 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001650 : "memory", "cc"
1651#if defined(__SSE2__)
1652 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1653#endif
1654 );
1655}
1656
// Converts one row fetched by READYUV422 and converted by YUVTORGB into
// packed 3-byte pixels at rgb24_buf: 8 pixels (24 bytes) per loop pass.
// xmm5/xmm6 hold the pshufb masks kShuffleMaskARGBToRGB24_0 and
// kShuffleMaskARGBToRGB24. Output is written with movq + movdqu, so
// rgb24_buf needs no particular alignment.
// NOTE(review): on __APPLE__ the masks are loaded by a separate asm statement
// that declares no xmm clobbers; this relies on xmm5/xmm6 surviving until the
// main asm statement - confirm this holds for the compilers in use.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001657void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1658 const uint8* u_buf,
1659 const uint8* v_buf,
1660 uint8* rgb24_buf,
1661 int width) {
1662// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
1663#ifdef __APPLE__
1664 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001665 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1666 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1667 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1668 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001669#endif
1670
1671 asm volatile (
1672#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001673 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1674 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001675#endif
// v_buf becomes the distance from u_buf to v_buf.
1676 "sub %[u_buf],%[v_buf] \n"
1677 "pxor %%xmm4,%%xmm4 \n"
1678 ".p2align 4 \n"
1679 "1: \n"
1680 READYUV422
1681 YUVTORGB
// Build 4-byte pixels first; xmm2 is paired with itself because the fourth
// byte is only a placeholder here.
1682 "punpcklbw %%xmm1,%%xmm0 \n"
1683 "punpcklbw %%xmm2,%%xmm2 \n"
1684 "movdqa %%xmm0,%%xmm1 \n"
1685 "punpcklwd %%xmm2,%%xmm0 \n"
1686 "punpckhwd %%xmm2,%%xmm1 \n"
// Shuffle with the two masks, then palignr joins the halves so that 8 + 16
// contiguous bytes can be stored.
1687 "pshufb %%xmm5,%%xmm0 \n"
1688 "pshufb %%xmm6,%%xmm1 \n"
1689 "palignr $0xc,%%xmm0,%%xmm1 \n"
1690 "movq %%xmm0,(%[rgb24_buf]) \n"
1691 "movdqu %%xmm1,0x8(%[rgb24_buf]) \n"
1692 "lea 0x18(%[rgb24_buf]),%[rgb24_buf] \n"
1693 "sub $0x8,%[width] \n"
1694 "jg 1b \n"
1695 : [y_buf]"+r"(y_buf), // %[y_buf]
1696 [u_buf]"+r"(u_buf), // %[u_buf]
1697 [v_buf]"+r"(v_buf), // %[v_buf]
1698 [rgb24_buf]"+r"(rgb24_buf), // %[rgb24_buf]
1699 [width]"+rm"(width) // %[width]
1700 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1701#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001702 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1703 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001704#endif
1705 : "memory", "cc"
1706#if defined(__SSE2__)
1707 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1708#endif
1709 );
1710}
1711
// Converts one row fetched by READYUV422 and converted by YUVTORGB into
// packed 3-byte pixels at raw_buf: 8 pixels (24 bytes) per loop pass.
// xmm5/xmm6 hold the pshufb masks kShuffleMaskARGBToRAW_0 and
// kShuffleMaskARGBToRAW, which select the output byte order. Output is
// written with movq + movdqu, so raw_buf needs no particular alignment.
// NOTE(review): on __APPLE__ the masks are loaded by a separate asm statement
// that declares no xmm clobbers; this relies on xmm5/xmm6 surviving until the
// main asm statement - confirm this holds for the compilers in use.
1712void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1713 const uint8* u_buf,
1714 const uint8* v_buf,
1715 uint8* raw_buf,
1716 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001717// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001718#ifdef __APPLE__
1719 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001720 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1721 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1722 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1723 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001724#endif
1725
1726 asm volatile (
1727#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001728 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1729 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001730#endif
// v_buf becomes the distance from u_buf to v_buf.
1731 "sub %[u_buf],%[v_buf] \n"
1732 "pxor %%xmm4,%%xmm4 \n"
1733 ".p2align 4 \n"
1734 "1: \n"
1735 READYUV422
1736 YUVTORGB
// Build 4-byte pixels first; xmm2 is paired with itself because the fourth
// byte is only a placeholder here.
1737 "punpcklbw %%xmm1,%%xmm0 \n"
1738 "punpcklbw %%xmm2,%%xmm2 \n"
1739 "movdqa %%xmm0,%%xmm1 \n"
1740 "punpcklwd %%xmm2,%%xmm0 \n"
1741 "punpckhwd %%xmm2,%%xmm1 \n"
// Shuffle with the two masks, then palignr joins the halves so that 8 + 16
// contiguous bytes can be stored.
1742 "pshufb %%xmm5,%%xmm0 \n"
1743 "pshufb %%xmm6,%%xmm1 \n"
1744 "palignr $0xc,%%xmm0,%%xmm1 \n"
1745 "movq %%xmm0,(%[raw_buf]) \n"
1746 "movdqu %%xmm1,0x8(%[raw_buf]) \n"
1747 "lea 0x18(%[raw_buf]),%[raw_buf] \n"
1748 "sub $0x8,%[width] \n"
1749 "jg 1b \n"
1750 : [y_buf]"+r"(y_buf), // %[y_buf]
1751 [u_buf]"+r"(u_buf), // %[u_buf]
1752 [v_buf]"+r"(v_buf), // %[v_buf]
1753 [raw_buf]"+r"(raw_buf), // %[raw_buf]
1754 [width]"+rm"(width) // %[width]
1755 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1756#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001757 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1758 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001759#endif
1760 : "memory", "cc"
1761#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001762 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001763#endif
1764 );
1765}
1766
// Converts one row of planar YUV, fetched by READYUV422 and converted by
// YUVTORGB, into 4-byte pixels written to argb_buf: 8 pixels (32 bytes) per
// loop pass. Stores are movdqa, so argb_buf must be 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass,
// so pixels are always produced in whole groups of 8.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001767void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001768 const uint8* u_buf,
1769 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001770 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001771 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001772 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001773 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001774 "pcmpeqb %%xmm5,%%xmm5 \n"
1775 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001776 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001777 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001778 READYUV422
1779 YUVTORGB
// Weave bytes into pixels laid out in memory as xmm0, xmm1, xmm2, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001780 "punpcklbw %%xmm1,%%xmm0 \n"
1781 "punpcklbw %%xmm5,%%xmm2 \n"
1782 "movdqa %%xmm0,%%xmm1 \n"
1783 "punpcklwd %%xmm2,%%xmm0 \n"
1784 "punpckhwd %%xmm2,%%xmm1 \n"
// Two aligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001785 "movdqa %%xmm0,(%[argb_buf]) \n"
1786 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1787 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1788 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001789 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001790 : [y_buf]"+r"(y_buf), // %[y_buf]
1791 [u_buf]"+r"(u_buf), // %[u_buf]
1792 [v_buf]"+r"(v_buf), // %[v_buf]
1793 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1794 [width]"+rm"(width) // %[width]
1795 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001796 : "memory", "cc"
1797#if defined(__SSE2__)
1798 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1799#endif
1800 );
1801}
1802
// Converts one row of planar YUV, fetched by READYUV411 and converted by
// YUVTORGB, into 4-byte pixels written to argb_buf: 8 pixels (32 bytes) per
// loop pass. Stores are movdqa, so argb_buf must be 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass,
// so pixels are always produced in whole groups of 8.
1803void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1804 const uint8* u_buf,
1805 const uint8* v_buf,
1806 uint8* argb_buf,
1807 int width) {
1808 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001809 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001810 "pcmpeqb %%xmm5,%%xmm5 \n"
1811 "pxor %%xmm4,%%xmm4 \n"
1812 ".p2align 4 \n"
1813 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001814 READYUV411
1815 YUVTORGB
// Weave bytes into pixels laid out in memory as xmm0, xmm1, xmm2, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001816 "punpcklbw %%xmm1,%%xmm0 \n"
1817 "punpcklbw %%xmm5,%%xmm2 \n"
1818 "movdqa %%xmm0,%%xmm1 \n"
1819 "punpcklwd %%xmm2,%%xmm0 \n"
1820 "punpckhwd %%xmm2,%%xmm1 \n"
// Two aligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001821 "movdqa %%xmm0,(%[argb_buf]) \n"
1822 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1823 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1824 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001825 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001826 : [y_buf]"+r"(y_buf), // %[y_buf]
1827 [u_buf]"+r"(u_buf), // %[u_buf]
1828 [v_buf]"+r"(v_buf), // %[v_buf]
1829 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1830 [width]"+rm"(width) // %[width]
1831 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1832 : "memory", "cc"
1833#if defined(__SSE2__)
1834 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1835#endif
1836 );
1837}
1838
// Converts one row of Y plus an interleaved chroma buffer (uv_buf), fetched
// by READNV12 and converted by YUVTORGB, into 4-byte pixels written to
// argb_buf: 8 pixels (32 bytes) per loop pass. Stores are movdqa, so argb_buf
// must be 16-byte aligned. The loop body runs before width is tested and
// width drops by 8 per pass.
1839void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1840 const uint8* uv_buf,
1841 uint8* argb_buf,
1842 int width) {
1843 asm volatile (
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
1844 "pcmpeqb %%xmm5,%%xmm5 \n"
1845 "pxor %%xmm4,%%xmm4 \n"
1846 ".p2align 4 \n"
1847 "1: \n"
1848 READNV12
1849 YUVTORGB
// Weave bytes into pixels laid out in memory as xmm0, xmm1, xmm2, 0xff.
1850 "punpcklbw %%xmm1,%%xmm0 \n"
1851 "punpcklbw %%xmm5,%%xmm2 \n"
1852 "movdqa %%xmm0,%%xmm1 \n"
1853 "punpcklwd %%xmm2,%%xmm0 \n"
1854 "punpckhwd %%xmm2,%%xmm1 \n"
// Two aligned 16-byte stores = 8 pixels.
1855 "movdqa %%xmm0,(%[argb_buf]) \n"
1856 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1857 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1858 "sub $0x8,%[width] \n"
1859 "jg 1b \n"
1860 : [y_buf]"+r"(y_buf), // %[y_buf]
1861 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1862 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1863 [width]"+rm"(width) // %[width]
1864 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1865 : "memory", "cc"
1866#if defined(__SSE2__)
1867 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1868#endif
1869 );
1870}
1871
// Same structure as the NV12 row function, but for a chroma buffer passed as
// vu_buf: it is fetched with the same READNV12 macro and converted with
// YVUTORGB instead of YUVTORGB. Writes 8 pixels (32 bytes) per loop pass with
// movdqa, so argb_buf must be 16-byte aligned. The loop body runs before
// width is tested and width drops by 8 per pass.
1872void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1873 const uint8* vu_buf,
1874 uint8* argb_buf,
1875 int width) {
1876 asm volatile (
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
1877 "pcmpeqb %%xmm5,%%xmm5 \n"
1878 "pxor %%xmm4,%%xmm4 \n"
1879 ".p2align 4 \n"
1880 "1: \n"
1881 READNV12
1882 YVUTORGB
// Weave bytes into pixels laid out in memory as xmm0, xmm1, xmm2, 0xff.
1883 "punpcklbw %%xmm1,%%xmm0 \n"
1884 "punpcklbw %%xmm5,%%xmm2 \n"
1885 "movdqa %%xmm0,%%xmm1 \n"
1886 "punpcklwd %%xmm2,%%xmm0 \n"
1887 "punpckhwd %%xmm2,%%xmm1 \n"
// Two aligned 16-byte stores = 8 pixels.
1888 "movdqa %%xmm0,(%[argb_buf]) \n"
1889 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1890 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1891 "sub $0x8,%[width] \n"
1892 "jg 1b \n"
1893 : [y_buf]"+r"(y_buf), // %[y_buf]
// The operand keeps the name uv_buf because READNV12 refers to it by that name.
1894 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1895 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1896 [width]"+rm"(width) // %[width]
1897 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001898 : "memory", "cc"
1899#if defined(__SSE2__)
1900 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1901#endif
1902 );
1903}
1904
// Converts one row of planar YUV, fetched by READYUV444 and converted by
// YUVTORGB, into 4-byte pixels written to argb_buf: 8 pixels (32 bytes) per
// loop pass. Stores are movdqu, so argb_buf does not need to be 16-byte
// aligned. The loop body runs before width is tested and width drops by 8
// per pass, so pixels are always produced in whole groups of 8.
1905void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1906 const uint8* u_buf,
1907 const uint8* v_buf,
1908 uint8* argb_buf,
1909 int width) {
1910 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001911 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001912 "pcmpeqb %%xmm5,%%xmm5 \n"
1913 "pxor %%xmm4,%%xmm4 \n"
1914 ".p2align 4 \n"
1915 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001916 READYUV444
1917 YUVTORGB
// Weave bytes into pixels laid out in memory as xmm0, xmm1, xmm2, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001918 "punpcklbw %%xmm1,%%xmm0 \n"
1919 "punpcklbw %%xmm5,%%xmm2 \n"
1920 "movdqa %%xmm0,%%xmm1 \n"
1921 "punpcklwd %%xmm2,%%xmm0 \n"
1922 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001923 "movdqu %%xmm0,(%[argb_buf]) \n"
1924 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1925 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1926 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001927 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001928 : [y_buf]"+r"(y_buf), // %[y_buf]
1929 [u_buf]"+r"(u_buf), // %[u_buf]
1930 [v_buf]"+r"(v_buf), // %[v_buf]
1931 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1932 [width]"+rm"(width) // %[width]
1933 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001934 : "memory", "cc"
1935#if defined(__SSE2__)
1936 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1937#endif
1938 );
1939}
1940
// Converts one row of planar YUV, fetched by READYUV422 and converted by
// YUVTORGB, into 4-byte pixels written to argb_buf: 8 pixels (32 bytes) per
// loop pass. Stores are movdqu, so argb_buf does not need to be 16-byte
// aligned. The loop body runs before width is tested and width drops by 8
// per pass, so pixels are always produced in whole groups of 8.
1941void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1942 const uint8* u_buf,
1943 const uint8* v_buf,
1944 uint8* argb_buf,
1945 int width) {
1946 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001947 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001948 "pcmpeqb %%xmm5,%%xmm5 \n"
1949 "pxor %%xmm4,%%xmm4 \n"
1950 ".p2align 4 \n"
1951 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001952 READYUV422
1953 YUVTORGB
// Weave bytes into pixels laid out in memory as xmm0, xmm1, xmm2, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001954 "punpcklbw %%xmm1,%%xmm0 \n"
1955 "punpcklbw %%xmm5,%%xmm2 \n"
1956 "movdqa %%xmm0,%%xmm1 \n"
1957 "punpcklwd %%xmm2,%%xmm0 \n"
1958 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001959 "movdqu %%xmm0,(%[argb_buf]) \n"
1960 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1961 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1962 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001963 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001964 : [y_buf]"+r"(y_buf), // %[y_buf]
1965 [u_buf]"+r"(u_buf), // %[u_buf]
1966 [v_buf]"+r"(v_buf), // %[v_buf]
1967 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1968 [width]"+rm"(width) // %[width]
1969 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001970 : "memory", "cc"
1971#if defined(__SSE2__)
1972 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1973#endif
1974 );
1975}
1976
// Converts one row of planar YUV, fetched by READYUV411 and converted by
// YUVTORGB, into 4-byte pixels written to argb_buf: 8 pixels (32 bytes) per
// loop pass. Stores are movdqu, so argb_buf does not need to be 16-byte
// aligned. The loop body runs before width is tested and width drops by 8
// per pass, so pixels are always produced in whole groups of 8.
1977void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1978 const uint8* u_buf,
1979 const uint8* v_buf,
1980 uint8* argb_buf,
1981 int width) {
1982 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001983 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001984 "pcmpeqb %%xmm5,%%xmm5 \n"
1985 "pxor %%xmm4,%%xmm4 \n"
1986 ".p2align 4 \n"
1987 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001988 READYUV411
1989 YUVTORGB
// Weave bytes into pixels laid out in memory as xmm0, xmm1, xmm2, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001990 "punpcklbw %%xmm1,%%xmm0 \n"
1991 "punpcklbw %%xmm5,%%xmm2 \n"
1992 "movdqa %%xmm0,%%xmm1 \n"
1993 "punpcklwd %%xmm2,%%xmm0 \n"
1994 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001995 "movdqu %%xmm0,(%[argb_buf]) \n"
1996 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1997 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1998 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001999 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002000 : [y_buf]"+r"(y_buf), // %[y_buf]
2001 [u_buf]"+r"(u_buf), // %[u_buf]
2002 [v_buf]"+r"(v_buf), // %[v_buf]
2003 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2004 [width]"+rm"(width) // %[width]
2005 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2006 : "memory", "cc"
2007#if defined(__SSE2__)
2008 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2009#endif
2010 );
2011}
2012
// Converts one row of Y plus an interleaved chroma buffer (uv_buf), fetched
// by READNV12 and converted by YUVTORGB, into 4-byte pixels written to
// argb_buf: 8 pixels (32 bytes) per loop pass. Stores are movdqu, so argb_buf
// does not need to be 16-byte aligned. The loop body runs before width is
// tested and width drops by 8 per pass.
2013void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2014 const uint8* uv_buf,
2015 uint8* argb_buf,
2016 int width) {
2017 asm volatile (
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
2018 "pcmpeqb %%xmm5,%%xmm5 \n"
2019 "pxor %%xmm4,%%xmm4 \n"
2020 ".p2align 4 \n"
2021 "1: \n"
2022 READNV12
2023 YUVTORGB
// Weave bytes into pixels laid out in memory as xmm0, xmm1, xmm2, 0xff.
2024 "punpcklbw %%xmm1,%%xmm0 \n"
2025 "punpcklbw %%xmm5,%%xmm2 \n"
2026 "movdqa %%xmm0,%%xmm1 \n"
2027 "punpcklwd %%xmm2,%%xmm0 \n"
2028 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores = 8 pixels.
2029 "movdqu %%xmm0,(%[argb_buf]) \n"
2030 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2031 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2032 "sub $0x8,%[width] \n"
2033 "jg 1b \n"
2034 : [y_buf]"+r"(y_buf), // %[y_buf]
2035 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2036 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2037 [width]"+rm"(width) // %[width]
2038 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2039 : "memory", "cc"
2040#if defined(__SSE2__)
2041 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2042#endif
2043 );
2044}
2045
// Same structure as the unaligned NV12 row function, but for a chroma buffer
// passed as vu_buf: it is fetched with the same READNV12 macro and converted
// with YVUTORGB instead of YUVTORGB. Writes 8 pixels (32 bytes) per loop pass
// with movdqu, so argb_buf does not need to be 16-byte aligned. The loop body
// runs before width is tested and width drops by 8 per pass.
2046void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2047 const uint8* vu_buf,
2048 uint8* argb_buf,
2049 int width) {
2050 asm volatile (
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
2051 "pcmpeqb %%xmm5,%%xmm5 \n"
2052 "pxor %%xmm4,%%xmm4 \n"
2053 ".p2align 4 \n"
2054 "1: \n"
2055 READNV12
2056 YVUTORGB
// Weave bytes into pixels laid out in memory as xmm0, xmm1, xmm2, 0xff.
2057 "punpcklbw %%xmm1,%%xmm0 \n"
2058 "punpcklbw %%xmm5,%%xmm2 \n"
2059 "movdqa %%xmm0,%%xmm1 \n"
2060 "punpcklwd %%xmm2,%%xmm0 \n"
2061 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores = 8 pixels.
2062 "movdqu %%xmm0,(%[argb_buf]) \n"
2063 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2064 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2065 "sub $0x8,%[width] \n"
2066 "jg 1b \n"
2067 : [y_buf]"+r"(y_buf), // %[y_buf]
// The operand keeps the name uv_buf because READNV12 refers to it by that name.
2068 [uv_buf]"+r"(vu_buf), // %[uv_buf]
2069 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2070 [width]"+rm"(width) // %[width]
2071 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002072 : "memory", "cc"
2073#if defined(__SSE2__)
2074 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2075#endif
2076 );
2077}
2078
// Converts one row of planar YUV, fetched by READYUV422 and converted by
// YUVTORGB, into 4-byte pixels written to bgra_buf: 8 pixels (32 bytes) per
// loop pass. Each pixel is laid out in memory as 0xff, xmm2, xmm1, xmm0.
// Stores are movdqa, so bgra_buf must be 16-byte aligned. The loop body runs
// before width is tested and width drops by 8 per pass.
2079void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2080 const uint8* u_buf,
2081 const uint8* v_buf,
2082 uint8* bgra_buf,
2083 int width) {
2084 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002085 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002086 "pcmpeqb %%xmm5,%%xmm5 \n"
2087 "pxor %%xmm4,%%xmm4 \n"
2088 ".p2align 4 \n"
2089 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002090 READYUV422
2091 YUVTORGB
// xmm5 is a destination of the weave below, so the all-ones value has to be
// regenerated on every pass.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002092 "pcmpeqb %%xmm5,%%xmm5 \n"
2093 "punpcklbw %%xmm0,%%xmm1 \n"
2094 "punpcklbw %%xmm2,%%xmm5 \n"
2095 "movdqa %%xmm5,%%xmm0 \n"
2096 "punpcklwd %%xmm1,%%xmm5 \n"
2097 "punpckhwd %%xmm1,%%xmm0 \n"
// Two aligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002098 "movdqa %%xmm5,(%[argb_buf]) \n"
2099 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2100 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2101 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002102 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002103 : [y_buf]"+r"(y_buf), // %[y_buf]
2104 [u_buf]"+r"(u_buf), // %[u_buf]
2105 [v_buf]"+r"(v_buf), // %[v_buf]
2106 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2107 [width]"+rm"(width) // %[width]
2108 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002109 : "memory", "cc"
2110#if defined(__SSE2__)
2111 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2112#endif
2113 );
2114}
2115
// Converts one row of planar YUV, fetched by READYUV422 and converted by
// YUVTORGB, into 4-byte pixels written to abgr_buf: 8 pixels (32 bytes) per
// loop pass. Each pixel is laid out in memory as xmm2, xmm1, xmm0, 0xff.
// Stores are movdqa, so abgr_buf must be 16-byte aligned. The loop body runs
// before width is tested and width drops by 8 per pass.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002116void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002117 const uint8* u_buf,
2118 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002119 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002120 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002121 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002122 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002123 "pcmpeqb %%xmm5,%%xmm5 \n"
2124 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002125 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002126 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002127 READYUV422
2128 YUVTORGB
// Weave with xmm2 first and xmm0 third, the reverse of the ARGB ordering.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002129 "punpcklbw %%xmm1,%%xmm2 \n"
2130 "punpcklbw %%xmm5,%%xmm0 \n"
2131 "movdqa %%xmm2,%%xmm1 \n"
2132 "punpcklwd %%xmm0,%%xmm2 \n"
2133 "punpckhwd %%xmm0,%%xmm1 \n"
// Two aligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002134 "movdqa %%xmm2,(%[argb_buf]) \n"
2135 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
2136 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2137 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002138 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002139 : [y_buf]"+r"(y_buf), // %[y_buf]
2140 [u_buf]"+r"(u_buf), // %[u_buf]
2141 [v_buf]"+r"(v_buf), // %[v_buf]
2142 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2143 [width]"+rm"(width) // %[width]
2144 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002145 : "memory", "cc"
2146#if defined(__SSE2__)
2147 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2148#endif
2149 );
2150}
2151
// Converts one row of planar YUV, fetched by READYUV422 and converted by
// YUVTORGB, into 4-byte pixels written to rgba_buf: 8 pixels (32 bytes) per
// loop pass. Each pixel is laid out in memory as 0xff, xmm0, xmm1, xmm2.
// Stores are movdqa, so rgba_buf must be 16-byte aligned. The loop body runs
// before width is tested and width drops by 8 per pass.
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002152void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2153 const uint8* u_buf,
2154 const uint8* v_buf,
2155 uint8* rgba_buf,
2156 int width) {
2157 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
2158 "sub %[u_buf],%[v_buf] \n"
2159 "pcmpeqb %%xmm5,%%xmm5 \n"
2160 "pxor %%xmm4,%%xmm4 \n"
2161 ".p2align 4 \n"
2162 "1: \n"
2163 READYUV422
2164 YUVTORGB
// xmm5 is a destination of the weave below, so the all-ones value has to be
// regenerated on every pass.
2165 "pcmpeqb %%xmm5,%%xmm5 \n"
2166 "punpcklbw %%xmm2,%%xmm1 \n"
2167 "punpcklbw %%xmm0,%%xmm5 \n"
2168 "movdqa %%xmm5,%%xmm0 \n"
2169 "punpcklwd %%xmm1,%%xmm5 \n"
2170 "punpckhwd %%xmm1,%%xmm0 \n"
// Two aligned 16-byte stores = 8 pixels.
2171 "movdqa %%xmm5,(%[argb_buf]) \n"
2172 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2173 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2174 "sub $0x8,%[width] \n"
2175 "jg 1b \n"
2176 : [y_buf]"+r"(y_buf), // %[y_buf]
2177 [u_buf]"+r"(u_buf), // %[u_buf]
2178 [v_buf]"+r"(v_buf), // %[v_buf]
2179 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2180 [width]"+rm"(width) // %[width]
2181 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2182 : "memory", "cc"
2183#if defined(__SSE2__)
2184 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2185#endif
2186 );
2187}
2188
// Converts one row of planar YUV, fetched by READYUV422 and converted by
// YUVTORGB, into 4-byte pixels written to bgra_buf: 8 pixels (32 bytes) per
// loop pass. Each pixel is laid out in memory as 0xff, xmm2, xmm1, xmm0.
// Stores are movdqu, so bgra_buf does not need to be 16-byte aligned. The
// loop body runs before width is tested and width drops by 8 per pass.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002189void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002190 const uint8* u_buf,
2191 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002192 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002193 int width) {
2194 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002195 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002196 "pcmpeqb %%xmm5,%%xmm5 \n"
2197 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002198 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002199 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002200 READYUV422
2201 YUVTORGB
// xmm5 is a destination of the weave below, so the all-ones value has to be
// regenerated on every pass.
fbarchard@google.com952a5072012-03-30 18:10:50 +00002202 "pcmpeqb %%xmm5,%%xmm5 \n"
2203 "punpcklbw %%xmm0,%%xmm1 \n"
2204 "punpcklbw %%xmm2,%%xmm5 \n"
2205 "movdqa %%xmm5,%%xmm0 \n"
2206 "punpcklwd %%xmm1,%%xmm5 \n"
2207 "punpckhwd %%xmm1,%%xmm0 \n"
// Two unaligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002208 "movdqu %%xmm5,(%[argb_buf]) \n"
2209 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
2210 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2211 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002212 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002213 : [y_buf]"+r"(y_buf), // %[y_buf]
2214 [u_buf]"+r"(u_buf), // %[u_buf]
2215 [v_buf]"+r"(v_buf), // %[v_buf]
2216 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2217 [width]"+rm"(width) // %[width]
2218 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002219 : "memory", "cc"
2220#if defined(__SSE2__)
2221 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2222#endif
2223 );
2224}
2225
// Converts one row of planar YUV, fetched by READYUV422 and converted by
// YUVTORGB, into 4-byte pixels written to abgr_buf: 8 pixels (32 bytes) per
// loop pass. Each pixel is laid out in memory as xmm2, xmm1, xmm0, 0xff.
// Stores are movdqu, so abgr_buf does not need to be 16-byte aligned. The
// loop body runs before width is tested and width drops by 8 per pass.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002226void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002227 const uint8* u_buf,
2228 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002229 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002230 int width) {
2231 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002232 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (0xff fourth byte of every pixel); xmm4 = zero.
fbarchard@google.com952a5072012-03-30 18:10:50 +00002233 "pcmpeqb %%xmm5,%%xmm5 \n"
2234 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002235 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002236 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002237 READYUV422
2238 YUVTORGB
// Weave with xmm2 first and xmm0 third, the reverse of the ARGB ordering.
fbarchard@google.com952a5072012-03-30 18:10:50 +00002239 "punpcklbw %%xmm1,%%xmm2 \n"
2240 "punpcklbw %%xmm5,%%xmm0 \n"
2241 "movdqa %%xmm2,%%xmm1 \n"
2242 "punpcklwd %%xmm0,%%xmm2 \n"
2243 "punpckhwd %%xmm0,%%xmm1 \n"
// Two unaligned 16-byte stores = 8 pixels.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002244 "movdqu %%xmm2,(%[argb_buf]) \n"
2245 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2246 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2247 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002248 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002249 : [y_buf]"+r"(y_buf), // %[y_buf]
2250 [u_buf]"+r"(u_buf), // %[u_buf]
2251 [v_buf]"+r"(v_buf), // %[v_buf]
2252 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2253 [width]"+rm"(width) // %[width]
2254 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002255 : "memory", "cc"
2256#if defined(__SSE2__)
2257 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2258#endif
2259 );
2260}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002261
2262void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2263 const uint8* u_buf,
2264 const uint8* v_buf,
2265 uint8* rgba_buf,
2266 int width) {
2267 asm volatile (
2268 "sub %[u_buf],%[v_buf] \n"
2269 "pcmpeqb %%xmm5,%%xmm5 \n"
2270 "pxor %%xmm4,%%xmm4 \n"
2271 ".p2align 4 \n"
2272 "1: \n"
2273 READYUV422
2274 YUVTORGB
2275 "pcmpeqb %%xmm5,%%xmm5 \n"
2276 "punpcklbw %%xmm2,%%xmm1 \n"
2277 "punpcklbw %%xmm0,%%xmm5 \n"
2278 "movdqa %%xmm5,%%xmm0 \n"
2279 "punpcklwd %%xmm1,%%xmm5 \n"
2280 "punpckhwd %%xmm1,%%xmm0 \n"
2281 "movdqa %%xmm5,(%[argb_buf]) \n"
2282 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2283 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2284 "sub $0x8,%[width] \n"
2285 "jg 1b \n"
2286 : [y_buf]"+r"(y_buf), // %[y_buf]
2287 [u_buf]"+r"(u_buf), // %[u_buf]
2288 [v_buf]"+r"(v_buf), // %[v_buf]
2289 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2290 [width]"+rm"(width) // %[width]
2291 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2292 : "memory", "cc"
2293#if defined(__SSE2__)
2294 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2295#endif
2296 );
2297}
2298
fbarchard@google.come214fe32012-06-04 23:47:11 +00002299#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002300
2301#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of Y bytes into 4-byte pixels at rgb_buf: each scaled Y value
// is replicated into the three low bytes and the top byte is forced to 0xff.
// Processes 8 pixels (8 bytes in, 32 bytes out) per loop pass. Stores are
// movdqa, so rgb_buf must be 16-byte aligned. The loop body runs before width
// is tested and width drops by 8 per pass. eax is used as scratch.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002302void YToARGBRow_SSE2(const uint8* y_buf,
2303 uint8* rgb_buf,
2304 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002305 asm volatile (
// xmm4 = 0xff000000 in every dword (mask that sets the top byte of a pixel).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002306 "pcmpeqb %%xmm4,%%xmm4 \n"
2307 "pslld $0x18,%%xmm4 \n"
// xmm3 = 0x1000 in every word: the value 16 in the high byte, matching the
// y-in-both-bytes words produced by punpcklbw below.
2308 "mov $0x10001000,%%eax \n"
2309 "movd %%eax,%%xmm3 \n"
2310 "pshufd $0x0,%%xmm3,%%xmm3 \n"
// xmm2 = 0x012a (298) in every word: the multiplier for pmulhuw.
2311 "mov $0x012a012a,%%eax \n"
2312 "movd %%eax,%%xmm2 \n"
2313 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002314 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002315 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002316 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002317 "movq (%0),%%xmm0 \n"
2318 "lea 0x8(%0),%0 \n"
// Each Y byte is duplicated into both bytes of a word, then the subtraction
// saturates at zero and pmulhuw keeps the high 16 bits of the product.
2319 "punpcklbw %%xmm0,%%xmm0 \n"
2320 "psubusw %%xmm3,%%xmm0 \n"
2321 "pmulhuw %%xmm2,%%xmm0 \n"
2322 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002323
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002324 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002325 "punpcklbw %%xmm0,%%xmm0 \n"
2326 "movdqa %%xmm0,%%xmm1 \n"
2327 "punpcklwd %%xmm0,%%xmm0 \n"
2328 "punpckhwd %%xmm1,%%xmm1 \n"
// OR in 0xff as the top byte of every pixel.
2329 "por %%xmm4,%%xmm0 \n"
2330 "por %%xmm4,%%xmm1 \n"
2331 "movdqa %%xmm0,(%1) \n"
2332 "movdqa %%xmm1,16(%1) \n"
2333 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002334
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002335 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002336 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00002337 : "+r"(y_buf), // %0
2338 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00002339 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002340 :
2341 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002342#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002343 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002344#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002345 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002346}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002347#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002348
fbarchard@google.com42831e02012-01-21 02:54:17 +00002349#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002350// Shuffle table for reversing the bytes.
// As a pshufb mask it maps output byte i to input byte 15 - i.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002351CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002352 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2353};
2354
// Writes the bytes of src to dst in reverse order, 16 bytes per loop pass.
// Reads walk backwards from the end of src (src + width - 16) while dst is
// written forwards. Both the load and the store are movdqa, so the addresses
// accessed must be 16-byte aligned. The loop body runs before the remaining
// width is tested and the width drops by 16 per pass.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002355void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
// Widened to pointer size because it is used as an index register below.
fbarchard@google.com12d04832011-11-21 23:54:38 +00002356 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002357 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002358 "movdqa %3,%%xmm5 \n"
// Bias src by -16 so that (src + remaining width) addresses the last
// unprocessed 16-byte group.
2359 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002360 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002361 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002362 "movdqa (%0,%2),%%xmm0 \n"
2363 "pshufb %%xmm5,%%xmm0 \n"
// The flags set by this sub are consumed by jg below; the movdqa and lea in
// between do not modify flags.
2364 "sub $0x10,%2 \n"
2365 "movdqa %%xmm0,(%1) \n"
2366 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002367 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002368 : "+r"(src), // %0
2369 "+r"(dst), // %1
2370 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002371 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002372 : "memory", "cc"
2373#if defined(__SSE2__)
2374 , "xmm0", "xmm5"
2375#endif
2376 );
2377}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002378#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002379
fbarchard@google.com42831e02012-01-21 02:54:17 +00002380#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of src to dst in reverse order, 16 bytes per loop pass,
// using only SSE2 instructions (no pshufb). Reads walk backwards from the end
// of src while dst is written forwards. Loads and stores are movdqu, so no
// alignment is required. The loop body runs before the remaining width is
// tested and the width drops by 16 per pass.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002381void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
// Widened to pointer size because it is used as an index register below.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002382 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002383 asm volatile (
// Bias src by -16 so that (src + remaining width) addresses the last
// unprocessed 16-byte group.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002384 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002385 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002386 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002387 "movdqu (%0,%2),%%xmm0 \n"
// Swap the two bytes inside every 16-bit word.
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002388 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002389 "psllw $0x8,%%xmm0 \n"
2390 "psrlw $0x8,%%xmm1 \n"
2391 "por %%xmm1,%%xmm0 \n"
// Reverse the four words in each 64-bit half, then swap the two halves.
2392 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2393 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2394 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
// The flags set by this sub are consumed by jg below; the movdqu and lea in
// between do not modify flags.
2395 "sub $0x10,%2 \n"
2396 "movdqu %%xmm0,(%1) \n"
2397 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002398 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002399 : "+r"(src), // %0
2400 "+r"(dst), // %1
2401 "+r"(temp_width) // %2
2402 :
2403 : "memory", "cc"
2404#if defined(__SSE2__)
2405 , "xmm0", "xmm1"
2406#endif
2407 );
2408}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002409#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002410
fbarchard@google.com16a96642012-03-02 22:38:09 +00002411#ifdef HAS_MIRRORROW_UV_SSSE3
2412// Shuffle table for reversing the bytes of UV channels.
// As a pshufb mask it gathers the even-indexed input bytes, reversed, into
// the low 8 bytes and the odd-indexed input bytes, reversed, into the high 8.
2413CONST uvec8 kShuffleMirrorUV = {
2414 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2415};
// De-interleaves and mirrors a row of byte pairs from src: per loop pass it
// reads 16 bytes (8 pairs), writes the 8 reversed even-indexed bytes to dst_u
// and the 8 reversed odd-indexed bytes to dst_v. src is read backwards
// starting at src + 2 * width - 16 with movdqa, so those addresses must be
// 16-byte aligned. The loop body runs before the remaining width is tested
// and the width drops by 8 per pass.
2416void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2417 int width) {
// Widened to pointer size because it is used as an index register below.
2418 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002419 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002420 "movdqa %4,%%xmm1 \n"
// Point src at the last 16-byte group of the row (2 bytes per unit of width).
2421 "lea -16(%0,%3,2),%0 \n"
// dst_v becomes the distance from dst_u to dst_v, so a single advancing
// pointer (%1) addresses both outputs.
2422 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002423 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002424 "1: \n"
2425 "movdqa (%0),%%xmm0 \n"
2426 "lea -16(%0),%0 \n"
2427 "pshufb %%xmm1,%%xmm0 \n"
// The flags set by this sub are consumed by jg below; the stores and lea in
// between do not modify flags.
2428 "sub $8,%3 \n"
// Low 8 bytes go to dst_u, high 8 bytes go to dst_v.
2429 "movlpd %%xmm0,(%1) \n"
2430 "movhpd %%xmm0,(%1,%2) \n"
2431 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002432 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002433 : "+r"(src), // %0
2434 "+r"(dst_u), // %1
2435 "+r"(dst_v), // %2
2436 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002437 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002438 : "memory", "cc"
2439#if defined(__SSE2__)
2440 , "xmm0", "xmm1"
2441#endif
2442 );
2443}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002444#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002445
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002446#ifdef HAS_ARGBMIRRORROW_SSSE3
2447// Shuffle table for reversing the order of the four 4 byte groups.
// Used as a pshufb control: group 3 moves to group 0, 2 to 1, 1 to 2 and
// 0 to 3, while the byte order inside each group is unchanged.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002448CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002449 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2450};
2451
// Mirrors a row of 4 byte pixels: the pixel order is reversed while the bytes
// inside each pixel keep their order. Handles 4 pixels (16 bytes) per pass.
//   src   - source row; movdqa requires the load address to be 16 byte
//           aligned.
//   dst   - destination row; movdqa requires it to be 16 byte aligned.
//   width - number of 4 byte pixels; the loop body runs before the count is
//           tested and consumes 4 pixels per pass until the count is <= 0.
2452void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2453 intptr_t temp_width = static_cast<intptr_t>(width);
2454 asm volatile (
2455 "movdqa %3,%%xmm5 \n"
    // Bias src by -16 so that (src + width * 4) is the last 16 bytes left.
2456 "lea -0x10(%0),%0 \n"
2457 ".p2align 4 \n"
2458 "1: \n"
2459 "movdqa (%0,%2,4),%%xmm0 \n"
2460 "pshufb %%xmm5,%%xmm0 \n"
    // The movdqa and lea below do not modify flags, so jg tests this sub.
2461 "sub $0x4,%2 \n"
2462 "movdqa %%xmm0,(%1) \n"
2463 "lea 0x10(%1),%1 \n"
2464 "jg 1b \n"
2465 : "+r"(src), // %0
2466 "+r"(dst), // %1
2467 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002468 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002469 : "memory", "cc"
2470#if defined(__SSE2__)
2471 , "xmm0", "xmm5"
2472#endif
2473 );
2474}
2475#endif // HAS_ARGBMIRRORROW_SSSE3
2476
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002477#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of byte pairs: even source bytes go to dst_u and odd
// source bytes go to dst_v. Each iteration reads 32 source bytes and writes
// 16 bytes to each destination.
//   src_uv - interleaved source; movdqa requires 16 byte alignment.
//   dst_u  - destination for even bytes; movdqa requires 16 byte alignment.
//   dst_v  - destination for odd bytes; movdqa requires 16 byte alignment.
//   pix    - number of byte pairs; 16 are consumed per iteration and the loop
//            body runs before the count is tested.
2478void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002479 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002480 "pcmpeqb %%xmm5,%%xmm5 \n"
2481 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2482 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002483 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002484 "1: \n"
2485 "movdqa (%0),%%xmm0 \n"
2486 "movdqa 0x10(%0),%%xmm1 \n"
2487 "lea 0x20(%0),%0 \n"
2488 "movdqa %%xmm0,%%xmm2 \n"
2489 "movdqa %%xmm1,%%xmm3 \n"
    // Even bytes: mask and pack 32 bytes down to 16.
2490 "pand %%xmm5,%%xmm0 \n"
2491 "pand %%xmm5,%%xmm1 \n"
2492 "packuswb %%xmm1,%%xmm0 \n"
    // Odd bytes: shift them into the low byte of each word and pack.
2493 "psrlw $0x8,%%xmm2 \n"
2494 "psrlw $0x8,%%xmm3 \n"
2495 "packuswb %%xmm3,%%xmm2 \n"
2496 "movdqa %%xmm0,(%1) \n"
2497 "movdqa %%xmm2,(%1,%2) \n"
2498 "lea 0x10(%1),%1 \n"
2499 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002500 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002501 : "+r"(src_uv), // %0
2502 "+r"(dst_u), // %1
2503 "+r"(dst_v), // %2
2504 "+r"(pix) // %3
2505 :
2506 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002507#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002508 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002509#endif
2510 );
2511}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002512
// De-interleaves a row of byte pairs: even source bytes go to dst_u and odd
// source bytes go to dst_v. Same instruction sequence as SplitUV_SSE2 except
// that all loads and stores use movdqu, so no pointer alignment is required.
//   src_uv - interleaved source; 32 bytes are read per iteration.
//   dst_u  - destination for even bytes; 16 bytes written per iteration.
//   dst_v  - destination for odd bytes; 16 bytes written per iteration.
//   pix    - number of byte pairs; 16 are consumed per iteration and the loop
//            body runs before the count is tested.
2513void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2514 int pix) {
2515 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
2516 "pcmpeqb %%xmm5,%%xmm5 \n"
2517 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2518 "sub %1,%2 \n"
2519 ".p2align 4 \n"
2520 "1: \n"
2521 "movdqu (%0),%%xmm0 \n"
2522 "movdqu 0x10(%0),%%xmm1 \n"
2523 "lea 0x20(%0),%0 \n"
2524 "movdqa %%xmm0,%%xmm2 \n"
2525 "movdqa %%xmm1,%%xmm3 \n"
    // Even bytes: mask and pack 32 bytes down to 16.
2526 "pand %%xmm5,%%xmm0 \n"
2527 "pand %%xmm5,%%xmm1 \n"
2528 "packuswb %%xmm1,%%xmm0 \n"
    // Odd bytes: shift them into the low byte of each word and pack.
2529 "psrlw $0x8,%%xmm2 \n"
2530 "psrlw $0x8,%%xmm3 \n"
2531 "packuswb %%xmm3,%%xmm2 \n"
2532 "movdqu %%xmm0,(%1) \n"
2533 "movdqu %%xmm2,(%1,%2) \n"
2534 "lea 0x10(%1),%1 \n"
2535 "sub $0x10,%3 \n"
2536 "jg 1b \n"
2537 : "+r"(src_uv), // %0
2538 "+r"(dst_u), // %1
2539 "+r"(dst_v), // %2
2540 "+r"(pix) // %3
2541 :
2542 : "memory", "cc"
2543#if defined(__SSE2__)
2544 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2545#endif
2546 );
2547}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002548#endif // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002549
fbarchard@google.com19932f82012-02-16 22:19:14 +00002550#ifdef HAS_COPYROW_SSE2
// Copies a row of bytes from src to dst, 32 bytes per iteration.
//   src   - source row; movdqa requires 16 byte alignment.
//   dst   - destination row; movdqa requires 16 byte alignment.
//   count - number of bytes; the loop body runs before the count is tested
//           and copies 32 bytes per pass until the count is <= 0.
2551void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002552 asm volatile (
    // Turn dst into an offset from src so only src needs advancing.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002553 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002554 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002555 "1: \n"
2556 "movdqa (%0),%%xmm0 \n"
2557 "movdqa 0x10(%0),%%xmm1 \n"
2558 "movdqa %%xmm0,(%0,%1) \n"
2559 "movdqa %%xmm1,0x10(%0,%1) \n"
2560 "lea 0x20(%0),%0 \n"
2561 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002562 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002563 : "+r"(src), // %0
2564 "+r"(dst), // %1
2565 "+r"(count) // %2
2566 :
2567 : "memory", "cc"
2568#if defined(__SSE2__)
2569 , "xmm0", "xmm1"
2570#endif
2571 );
2572}
2573#endif // HAS_COPYROW_SSE2
2574
2575#ifdef HAS_COPYROW_X86
// Copies a row from src to dst with "rep movsl", 4 bytes per repetition.
//   src   - source row, bound to the si register ("S" constraint).
//   dst   - destination row, bound to the di register ("D" constraint).
//   width - number of bytes. It is divided by 4 (shr 2) to form the repeat
//           count in cx, so any remainder of width modulo 4 is not copied.
2576void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2577 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002578 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002579 "shr $0x2,%2 \n"
2580 "rep movsl \n"
2581 : "+S"(src), // %0
2582 "+D"(dst), // %1
2583 "+c"(width_tmp) // %2
2584 :
2585 : "memory", "cc"
2586 );
2587}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002588#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002589
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002590#ifdef HAS_SETROW_X86
// Fills a row with the 32 bit value v32 using "rep stosl".
//   dst   - destination row, bound to the di register ("D" constraint).
//   v32   - 32 bit value stored on every repetition, passed in the a
//           register ("a" constraint).
//   width - number of bytes. It is divided by 4 (shr 2) to form the repeat
//           count in cx, so any remainder of width modulo 4 is not written.
2591void SetRow8_X86(uint8* dst, uint32 v32, int width) {
2592 size_t width_tmp = static_cast<size_t>(width);
2593 asm volatile (
2594 "shr $0x2,%1 \n"
2595 "rep stosl \n"
2596 : "+D"(dst), // %0
2597 "+c"(width_tmp) // %1
2598 : "a"(v32) // %2
2599 : "memory", "cc");
2600}
2601
// Fills a rectangle with the 32 bit value v32, one "rep stosl" per row.
//   dst        - first row of the destination.
//   v32        - 32 bit value stored on every repetition.
//   width      - number of 32 bit values per row. Unlike SetRow8_X86 it is
//                used as the repeat count directly, without a shift.
//   dst_stride - distance in bytes between the starts of consecutive rows
//                (added to the uint8 pointer dst after each row).
//   height     - number of rows to fill.
2602void SetRows32_X86(uint8* dst, uint32 v32, int width,
2603 int dst_stride, int height) {
2604 for (int y = 0; y < height; ++y) {
    // rep stosl modifies the pointer and the count, so use per-row copies.
2605 size_t width_tmp = static_cast<size_t>(width);
2606 uint32* d = reinterpret_cast<uint32*>(dst);
2607 asm volatile (
2608 "rep stosl \n"
2609 : "+D"(d), // %0
2610 "+c"(width_tmp) // %1
2611 : "a"(v32) // %2
2612 : "memory", "cc");
2613 dst += dst_stride;
2614 }
2615}
2616#endif // HAS_SETROW_X86
2617
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002618#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the even bytes of a YUY2 row into dst_y. Each iteration reads 32
// source bytes and writes 16 bytes.
//   src_yuy2 - source row; movdqa requires 16 byte alignment.
//   dst_y    - destination; movdqa requires 16 byte alignment.
//   pix      - number of output bytes; 16 are produced per iteration and the
//              loop body runs before the count is tested.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002619void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002620 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002621 "pcmpeqb %%xmm5,%%xmm5 \n"
2622 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002623 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002624 "1: \n"
2625 "movdqa (%0),%%xmm0 \n"
2626 "movdqa 0x10(%0),%%xmm1 \n"
2627 "lea 0x20(%0),%0 \n"
2628 "pand %%xmm5,%%xmm0 \n"
2629 "pand %%xmm5,%%xmm1 \n"
2630 "packuswb %%xmm1,%%xmm0 \n"
2631 "movdqa %%xmm0,(%1) \n"
2632 "lea 0x10(%1),%1 \n"
2633 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002634 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002635 : "+r"(src_yuy2), // %0
2636 "+r"(dst_y), // %1
2637 "+r"(pix) // %2
2638 :
2639 : "memory", "cc"
2640#if defined(__SSE2__)
2641 , "xmm0", "xmm1", "xmm5"
2642#endif
2643 );
2644}
2645
// Averages two YUY2 rows and splits the odd bytes of the result into two
// planes. Of the odd bytes, those at positions 1, 5, 9, ... go to dst_u and
// those at positions 3, 7, 11, ... go to dst_v. Each iteration reads 32 bytes
// from each row and writes 8 bytes to each destination.
//   src_yuy2    - first source row; movdqa requires 16 byte alignment.
//   stride_yuy2 - byte offset from src_yuy2 to the second row, which is also
//                 read with movdqa.
//   dst_u/dst_v - destinations, written 8 bytes at a time with movq.
//   pix         - decremented by 16 per iteration; the loop body runs before
//                 the count is tested.
2646void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002647 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002648 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002649 "pcmpeqb %%xmm5,%%xmm5 \n"
2650 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2651 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002652 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002653 "1: \n"
2654 "movdqa (%0),%%xmm0 \n"
2655 "movdqa 0x10(%0),%%xmm1 \n"
2656 "movdqa (%0,%4,1),%%xmm2 \n"
2657 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2658 "lea 0x20(%0),%0 \n"
    // Rounded byte average of the two rows.
2659 "pavgb %%xmm2,%%xmm0 \n"
2660 "pavgb %%xmm3,%%xmm1 \n"
    // Keep the odd bytes and pack them into 16 interleaved bytes.
2661 "psrlw $0x8,%%xmm0 \n"
2662 "psrlw $0x8,%%xmm1 \n"
2663 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even ones to dst_u, odd ones to dst_v.
2664 "movdqa %%xmm0,%%xmm1 \n"
2665 "pand %%xmm5,%%xmm0 \n"
2666 "packuswb %%xmm0,%%xmm0 \n"
2667 "psrlw $0x8,%%xmm1 \n"
2668 "packuswb %%xmm1,%%xmm1 \n"
2669 "movq %%xmm0,(%1) \n"
2670 "movq %%xmm1,(%1,%2) \n"
2671 "lea 0x8(%1),%1 \n"
2672 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002673 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002674 : "+r"(src_yuy2), // %0
2675 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002676 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002677 "+r"(pix) // %3
2678 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2679 : "memory", "cc"
2680#if defined(__SSE2__)
2681 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2682#endif
2683 );
2684}
2685
// Splits the odd bytes of a single YUY2 row into two planes, with no
// averaging against a second row. Of the odd bytes, those at positions
// 1, 5, 9, ... go to dst_u and those at positions 3, 7, 11, ... go to dst_v.
// Each iteration reads 32 source bytes and writes 8 bytes to each destination.
//   src_yuy2    - source row; movdqa requires 16 byte alignment.
//   dst_u/dst_v - destinations, written 8 bytes at a time with movq.
//   pix         - decremented by 16 per iteration; the loop body runs before
//                 the count is tested.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002686void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2687 uint8* dst_u, uint8* dst_v, int pix) {
2688 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
2689 "pcmpeqb %%xmm5,%%xmm5 \n"
2690 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2691 "sub %1,%2 \n"
2692 ".p2align 4 \n"
2693 "1: \n"
2694 "movdqa (%0),%%xmm0 \n"
2695 "movdqa 0x10(%0),%%xmm1 \n"
2696 "lea 0x20(%0),%0 \n"
    // Keep the odd bytes and pack them into 16 interleaved bytes.
2697 "psrlw $0x8,%%xmm0 \n"
2698 "psrlw $0x8,%%xmm1 \n"
2699 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even ones to dst_u, odd ones to dst_v.
2700 "movdqa %%xmm0,%%xmm1 \n"
2701 "pand %%xmm5,%%xmm0 \n"
2702 "packuswb %%xmm0,%%xmm0 \n"
2703 "psrlw $0x8,%%xmm1 \n"
2704 "packuswb %%xmm1,%%xmm1 \n"
2705 "movq %%xmm0,(%1) \n"
2706 "movq %%xmm1,(%1,%2) \n"
2707 "lea 0x8(%1),%1 \n"
2708 "sub $0x10,%3 \n"
2709 "jg 1b \n"
2710 : "+r"(src_yuy2), // %0
2711 "+r"(dst_u), // %1
2712 "+r"(dst_v), // %2
2713 "+r"(pix) // %3
2714 :
2715 : "memory", "cc"
2716#if defined(__SSE2__)
2717 , "xmm0", "xmm1", "xmm5"
2718#endif
2719 );
2720}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002721
// Extracts the even bytes of a YUY2 row into dst_y using unaligned loads and
// stores (movdqu). Each iteration reads 32 source bytes and writes 16 bytes.
//   src_yuy2 - source row.
//   dst_y    - destination.
//   pix      - number of output bytes; 16 are produced per iteration and the
//              loop body runs before the count is tested.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002722void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2723 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002724 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002725 "pcmpeqb %%xmm5,%%xmm5 \n"
2726 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002727 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002728 "1: \n"
2729 "movdqu (%0),%%xmm0 \n"
2730 "movdqu 0x10(%0),%%xmm1 \n"
2731 "lea 0x20(%0),%0 \n"
2732 "pand %%xmm5,%%xmm0 \n"
2733 "pand %%xmm5,%%xmm1 \n"
2734 "packuswb %%xmm1,%%xmm0 \n"
    // The movdqu and lea below do not modify flags, so jg tests this sub.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002735 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002736 "movdqu %%xmm0,(%1) \n"
2737 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002738 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002739 : "+r"(src_yuy2), // %0
2740 "+r"(dst_y), // %1
2741 "+r"(pix) // %2
2742 :
2743 : "memory", "cc"
2744#if defined(__SSE2__)
2745 , "xmm0", "xmm1", "xmm5"
2746#endif
2747 );
2748}
2749
// Averages two YUY2 rows and splits the odd bytes of the result into two
// planes, using unaligned loads (movdqu). Of the odd bytes, those at
// positions 1, 5, 9, ... go to dst_u and those at positions 3, 7, 11, ... go
// to dst_v. Each iteration reads 32 bytes from each row and writes 8 bytes to
// each destination.
//   src_yuy2    - first source row.
//   stride_yuy2 - byte offset from src_yuy2 to the second row.
//   dst_u/dst_v - destinations, written 8 bytes at a time with movq.
//   pix         - decremented by 16 per iteration; the loop body runs before
//                 the count is tested.
2750void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2751 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002752 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002753 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002754 "pcmpeqb %%xmm5,%%xmm5 \n"
2755 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2756 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002757 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002758 "1: \n"
2759 "movdqu (%0),%%xmm0 \n"
2760 "movdqu 0x10(%0),%%xmm1 \n"
2761 "movdqu (%0,%4,1),%%xmm2 \n"
2762 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2763 "lea 0x20(%0),%0 \n"
    // Rounded byte average of the two rows.
2764 "pavgb %%xmm2,%%xmm0 \n"
2765 "pavgb %%xmm3,%%xmm1 \n"
    // Keep the odd bytes and pack them into 16 interleaved bytes.
2766 "psrlw $0x8,%%xmm0 \n"
2767 "psrlw $0x8,%%xmm1 \n"
2768 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even ones to dst_u, odd ones to dst_v.
2769 "movdqa %%xmm0,%%xmm1 \n"
2770 "pand %%xmm5,%%xmm0 \n"
2771 "packuswb %%xmm0,%%xmm0 \n"
2772 "psrlw $0x8,%%xmm1 \n"
2773 "packuswb %%xmm1,%%xmm1 \n"
2774 "movq %%xmm0,(%1) \n"
2775 "movq %%xmm1,(%1,%2) \n"
2776 "lea 0x8(%1),%1 \n"
2777 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002778 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002779 : "+r"(src_yuy2), // %0
2780 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002781 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002782 "+r"(pix) // %3
2783 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2784 : "memory", "cc"
2785#if defined(__SSE2__)
2786 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2787#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002788 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002789}
2790
// Splits the odd bytes of a single YUY2 row into two planes, using unaligned
// loads (movdqu) and no averaging against a second row. Of the odd bytes,
// those at positions 1, 5, 9, ... go to dst_u and those at positions
// 3, 7, 11, ... go to dst_v. Each iteration reads 32 source bytes and writes
// 8 bytes to each destination.
//   src_yuy2    - source row.
//   dst_u/dst_v - destinations, written 8 bytes at a time with movq.
//   pix         - decremented by 16 per iteration; the loop body runs before
//                 the count is tested.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002791void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2792 uint8* dst_u, uint8* dst_v, int pix) {
2793 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
2794 "pcmpeqb %%xmm5,%%xmm5 \n"
2795 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2796 "sub %1,%2 \n"
2797 ".p2align 4 \n"
2798 "1: \n"
2799 "movdqu (%0),%%xmm0 \n"
2800 "movdqu 0x10(%0),%%xmm1 \n"
2801 "lea 0x20(%0),%0 \n"
    // Keep the odd bytes and pack them into 16 interleaved bytes.
2802 "psrlw $0x8,%%xmm0 \n"
2803 "psrlw $0x8,%%xmm1 \n"
2804 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even ones to dst_u, odd ones to dst_v.
2805 "movdqa %%xmm0,%%xmm1 \n"
2806 "pand %%xmm5,%%xmm0 \n"
2807 "packuswb %%xmm0,%%xmm0 \n"
2808 "psrlw $0x8,%%xmm1 \n"
2809 "packuswb %%xmm1,%%xmm1 \n"
2810 "movq %%xmm0,(%1) \n"
2811 "movq %%xmm1,(%1,%2) \n"
2812 "lea 0x8(%1),%1 \n"
2813 "sub $0x10,%3 \n"
2814 "jg 1b \n"
2815 : "+r"(src_yuy2), // %0
2816 "+r"(dst_u), // %1
2817 "+r"(dst_v), // %2
2818 "+r"(pix) // %3
2819 :
2820 : "memory", "cc"
2821#if defined(__SSE2__)
2822 , "xmm0", "xmm1", "xmm5"
2823#endif
2824 );
2825}
2826
// Extracts the odd bytes of a UYVY row into dst_y. Each iteration reads 32
// source bytes and writes 16 bytes.
//   src_uyvy - source row; movdqa requires 16 byte alignment.
//   dst_y    - destination; movdqa requires 16 byte alignment.
//   pix      - number of output bytes; 16 are produced per iteration and the
//              loop body runs before the count is tested.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002827void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002828 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002829 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002830 "1: \n"
2831 "movdqa (%0),%%xmm0 \n"
2832 "movdqa 0x10(%0),%%xmm1 \n"
2833 "lea 0x20(%0),%0 \n"
    // Shift the odd byte of each word into the low byte, then pack.
2834 "psrlw $0x8,%%xmm0 \n"
2835 "psrlw $0x8,%%xmm1 \n"
2836 "packuswb %%xmm1,%%xmm0 \n"
    // The movdqa and lea below do not modify flags, so jg tests this sub.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002837 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002838 "movdqa %%xmm0,(%1) \n"
2839 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002840 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002841 : "+r"(src_uyvy), // %0
2842 "+r"(dst_y), // %1
2843 "+r"(pix) // %2
2844 :
2845 : "memory", "cc"
2846#if defined(__SSE2__)
2847 , "xmm0", "xmm1"
2848#endif
2849 );
2850}
2851
// Averages two UYVY rows and splits the even bytes of the result into two
// planes. Of the even bytes, those at positions 0, 4, 8, ... go to dst_u and
// those at positions 2, 6, 10, ... go to dst_v. Each iteration reads 32 bytes
// from each row and writes 8 bytes to each destination.
//   src_uyvy    - first source row; movdqa requires 16 byte alignment.
//   stride_uyvy - byte offset from src_uyvy to the second row, which is also
//                 read with movdqa.
//   dst_u/dst_v - destinations, written 8 bytes at a time with movq.
//   pix         - decremented by 16 per iteration; the loop body runs before
//                 the count is tested.
2852void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002853 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002854 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002855 "pcmpeqb %%xmm5,%%xmm5 \n"
2856 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2857 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002858 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002859 "1: \n"
2860 "movdqa (%0),%%xmm0 \n"
2861 "movdqa 0x10(%0),%%xmm1 \n"
2862 "movdqa (%0,%4,1),%%xmm2 \n"
2863 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2864 "lea 0x20(%0),%0 \n"
    // Rounded byte average of the two rows.
2865 "pavgb %%xmm2,%%xmm0 \n"
2866 "pavgb %%xmm3,%%xmm1 \n"
    // Keep the even bytes and pack them into 16 interleaved bytes.
2867 "pand %%xmm5,%%xmm0 \n"
2868 "pand %%xmm5,%%xmm1 \n"
2869 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even ones to dst_u, odd ones to dst_v.
2870 "movdqa %%xmm0,%%xmm1 \n"
2871 "pand %%xmm5,%%xmm0 \n"
2872 "packuswb %%xmm0,%%xmm0 \n"
2873 "psrlw $0x8,%%xmm1 \n"
2874 "packuswb %%xmm1,%%xmm1 \n"
2875 "movq %%xmm0,(%1) \n"
2876 "movq %%xmm1,(%1,%2) \n"
2877 "lea 0x8(%1),%1 \n"
2878 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002879 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002880 : "+r"(src_uyvy), // %0
2881 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002882 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002883 "+r"(pix) // %3
2884 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2885 : "memory", "cc"
2886#if defined(__SSE2__)
2887 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2888#endif
2889 );
2890}
2891
// Splits the even bytes of a single UYVY row into two planes, with no
// averaging against a second row. Of the even bytes, those at positions
// 0, 4, 8, ... go to dst_u and those at positions 2, 6, 10, ... go to dst_v.
// Each iteration reads 32 source bytes and writes 8 bytes to each destination.
//   src_uyvy    - source row; movdqa requires 16 byte alignment.
//   dst_u/dst_v - destinations, written 8 bytes at a time with movq.
//   pix         - decremented by 16 per iteration; the loop body runs before
//                 the count is tested.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002892void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2893 uint8* dst_u, uint8* dst_v, int pix) {
2894 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
2895 "pcmpeqb %%xmm5,%%xmm5 \n"
2896 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2897 "sub %1,%2 \n"
2898 ".p2align 4 \n"
2899 "1: \n"
2900 "movdqa (%0),%%xmm0 \n"
2901 "movdqa 0x10(%0),%%xmm1 \n"
2902 "lea 0x20(%0),%0 \n"
    // Keep the even bytes and pack them into 16 interleaved bytes.
2903 "pand %%xmm5,%%xmm0 \n"
2904 "pand %%xmm5,%%xmm1 \n"
2905 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even ones to dst_u, odd ones to dst_v.
2906 "movdqa %%xmm0,%%xmm1 \n"
2907 "pand %%xmm5,%%xmm0 \n"
2908 "packuswb %%xmm0,%%xmm0 \n"
2909 "psrlw $0x8,%%xmm1 \n"
2910 "packuswb %%xmm1,%%xmm1 \n"
2911 "movq %%xmm0,(%1) \n"
2912 "movq %%xmm1,(%1,%2) \n"
2913 "lea 0x8(%1),%1 \n"
2914 "sub $0x10,%3 \n"
2915 "jg 1b \n"
2916 : "+r"(src_uyvy), // %0
2917 "+r"(dst_u), // %1
2918 "+r"(dst_v), // %2
2919 "+r"(pix) // %3
2920 :
2921 : "memory", "cc"
2922#if defined(__SSE2__)
2923 , "xmm0", "xmm1", "xmm5"
2924#endif
2925 );
2926}
2927
// Extracts the odd bytes of a UYVY row into dst_y using unaligned loads and
// stores (movdqu). Each iteration reads 32 source bytes and writes 16 bytes.
//   src_uyvy - source row.
//   dst_y    - destination.
//   pix      - number of output bytes; 16 are produced per iteration and the
//              loop body runs before the count is tested.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002928void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2929 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002930 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002931 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002932 "1: \n"
2933 "movdqu (%0),%%xmm0 \n"
2934 "movdqu 0x10(%0),%%xmm1 \n"
2935 "lea 0x20(%0),%0 \n"
    // Shift the odd byte of each word into the low byte, then pack.
2936 "psrlw $0x8,%%xmm0 \n"
2937 "psrlw $0x8,%%xmm1 \n"
2938 "packuswb %%xmm1,%%xmm0 \n"
    // The movdqu and lea below do not modify flags, so jg tests this sub.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002939 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002940 "movdqu %%xmm0,(%1) \n"
2941 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002942 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002943 : "+r"(src_uyvy), // %0
2944 "+r"(dst_y), // %1
2945 "+r"(pix) // %2
2946 :
2947 : "memory", "cc"
2948#if defined(__SSE2__)
2949 , "xmm0", "xmm1"
2950#endif
2951 );
2952}
2953
// Averages two UYVY rows and splits the even bytes of the result into two
// planes, using unaligned loads (movdqu). Of the even bytes, those at
// positions 0, 4, 8, ... go to dst_u and those at positions 2, 6, 10, ... go
// to dst_v. Each iteration reads 32 bytes from each row and writes 8 bytes to
// each destination.
//   src_uyvy    - first source row.
//   stride_uyvy - byte offset from src_uyvy to the second row.
//   dst_u/dst_v - destinations, written 8 bytes at a time with movq.
//   pix         - decremented by 16 per iteration; the loop body runs before
//                 the count is tested.
2954void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002955 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002956 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002957 "pcmpeqb %%xmm5,%%xmm5 \n"
2958 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2959 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002960 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002961 "1: \n"
2962 "movdqu (%0),%%xmm0 \n"
2963 "movdqu 0x10(%0),%%xmm1 \n"
2964 "movdqu (%0,%4,1),%%xmm2 \n"
2965 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2966 "lea 0x20(%0),%0 \n"
    // Rounded byte average of the two rows.
2967 "pavgb %%xmm2,%%xmm0 \n"
2968 "pavgb %%xmm3,%%xmm1 \n"
    // Keep the even bytes and pack them into 16 interleaved bytes.
2969 "pand %%xmm5,%%xmm0 \n"
2970 "pand %%xmm5,%%xmm1 \n"
2971 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even ones to dst_u, odd ones to dst_v.
2972 "movdqa %%xmm0,%%xmm1 \n"
2973 "pand %%xmm5,%%xmm0 \n"
2974 "packuswb %%xmm0,%%xmm0 \n"
2975 "psrlw $0x8,%%xmm1 \n"
2976 "packuswb %%xmm1,%%xmm1 \n"
2977 "movq %%xmm0,(%1) \n"
2978 "movq %%xmm1,(%1,%2) \n"
2979 "lea 0x8(%1),%1 \n"
2980 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002981 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002982 : "+r"(src_uyvy), // %0
2983 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002984 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002985 "+r"(pix) // %3
2986 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2987 : "memory", "cc"
2988#if defined(__SSE2__)
2989 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2990#endif
2991 );
2992}
fbarchard@google.comc704f782012-08-30 19:53:48 +00002993
// Splits the even bytes of a single UYVY row into two planes, using unaligned
// loads (movdqu) and no averaging against a second row. Of the even bytes,
// those at positions 0, 4, 8, ... go to dst_u and those at positions
// 2, 6, 10, ... go to dst_v. Each iteration reads 32 source bytes and writes
// 8 bytes to each destination.
//   src_uyvy    - source row.
//   dst_u/dst_v - destinations, written 8 bytes at a time with movq.
//   pix         - decremented by 16 per iteration; the loop body runs before
//                 the count is tested.
2994void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
2995 uint8* dst_u, uint8* dst_v, int pix) {
2996 asm volatile (
    // xmm5 = 0x00ff in every word: mask that keeps the even bytes.
2997 "pcmpeqb %%xmm5,%%xmm5 \n"
2998 "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
2999 "sub %1,%2 \n"
3000 ".p2align 4 \n"
3001 "1: \n"
3002 "movdqu (%0),%%xmm0 \n"
3003 "movdqu 0x10(%0),%%xmm1 \n"
3004 "lea 0x20(%0),%0 \n"
    // Keep the even bytes and pack them into 16 interleaved bytes.
3005 "pand %%xmm5,%%xmm0 \n"
3006 "pand %%xmm5,%%xmm1 \n"
3007 "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved bytes: even ones to dst_u, odd ones to dst_v.
3008 "movdqa %%xmm0,%%xmm1 \n"
3009 "pand %%xmm5,%%xmm0 \n"
3010 "packuswb %%xmm0,%%xmm0 \n"
3011 "psrlw $0x8,%%xmm1 \n"
3012 "packuswb %%xmm1,%%xmm1 \n"
3013 "movq %%xmm0,(%1) \n"
3014 "movq %%xmm1,(%1,%2) \n"
3015 "lea 0x8(%1),%1 \n"
3016 "sub $0x10,%3 \n"
3017 "jg 1b \n"
3018 : "+r"(src_uyvy), // %0
3019 "+r"(dst_u), // %1
3020 "+r"(dst_v), // %2
3021 "+r"(pix) // %3
3022 :
3023 : "memory", "cc"
3024#if defined(__SSE2__)
3025 , "xmm0", "xmm1", "xmm5"
3026#endif
3027 );
3028}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003029#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003030
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003031#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003032// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
// Blends a row of 4 byte pixels from src_argb0 over src_argb1 into dst_argb.
// Byte 3 of each src_argb0 pixel is used as alpha. For every byte the result
// is the saturating sum of
//   - the src_argb0 byte (with byte 3 forced to 0xff), and
//   - (src_argb1 byte * (256 - alpha)) >> 8.
// Structure: a 1 pixel loop runs until dst_argb is 16 byte aligned, then a
// 4 pixel loop (unaligned loads, aligned store), then a 1 pixel loop for the
// remaining pixels.
//   src_argb0 - foreground row supplying the alpha.
//   src_argb1 - background row.
//   dst_argb  - destination row.
//   width     - number of pixels; nothing is written when width <= 0.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003033void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3034 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003035 asm volatile (
    // Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
    // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (byte 3 mask).
3036 "pcmpeqb %%xmm7,%%xmm7 \n"
3037 "psrlw $0xf,%%xmm7 \n"
3038 "pcmpeqb %%xmm6,%%xmm6 \n"
3039 "psrlw $0x8,%%xmm6 \n"
3040 "pcmpeqb %%xmm5,%%xmm5 \n"
3041 "psllw $0x8,%%xmm5 \n"
3042 "pcmpeqb %%xmm4,%%xmm4 \n"
3043 "pslld $0x18,%%xmm4 \n"
    // From here the counter holds (remaining pixels - 1): exactly one pixel
    // goes straight to the tail loop, zero or fewer pixels exits.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003044 "sub $0x1,%3 \n"
3045 "je 91f \n"
3046 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003047
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003048 // 1 pixel loop until destination pointer is aligned.
3049 "10: \n"
3050 "test $0xf,%2 \n"
3051 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003052 "movd (%0),%%xmm3 \n"
3053 "lea 0x4(%0),%0 \n"
3054 "movdqa %%xmm3,%%xmm0 \n"
    // Invert byte 3 (255 - alpha) and replicate it into both words of the
    // pixel; adding xmm7 below makes the multiplier (256 - alpha).
3055 "pxor %%xmm4,%%xmm3 \n"
3056 "movd (%1),%%xmm2 \n"
3057 "psrlw $0x8,%%xmm3 \n"
3058 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3059 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3060 "pand %%xmm6,%%xmm2 \n"
3061 "paddw %%xmm7,%%xmm3 \n"
3062 "pmullw %%xmm3,%%xmm2 \n"
3063 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003064 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003065 "psrlw $0x8,%%xmm1 \n"
3066 "por %%xmm4,%%xmm0 \n"
3067 "pmullw %%xmm3,%%xmm1 \n"
3068 "psrlw $0x8,%%xmm2 \n"
3069 "paddusb %%xmm2,%%xmm0 \n"
3070 "pand %%xmm5,%%xmm1 \n"
3071 "paddusb %%xmm1,%%xmm0 \n"
3072 "sub $0x1,%3 \n"
3073 "movd %%xmm0,(%2) \n"
3074 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003075 "jge 10b \n"
3076
3077 "19: \n"
    // Rebias the counter to (remaining pixels - 4) for the 4 pixel loop.
fbarchard@google.comee220882012-06-14 00:07:56 +00003078 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003079 "jl 49f \n"
3080
fbarchard@google.com794fe122012-06-15 01:05:01 +00003081 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003082 ".p2align 2 \n"
3083 "41: \n"
3084 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003085 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003086 "movdqa %%xmm3,%%xmm0 \n"
3087 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003088 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003089 "psrlw $0x8,%%xmm3 \n"
3090 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3091 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003092 "pand %%xmm6,%%xmm2 \n"
3093 "paddw %%xmm7,%%xmm3 \n"
3094 "pmullw %%xmm3,%%xmm2 \n"
3095 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003096 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003097 "psrlw $0x8,%%xmm1 \n"
3098 "por %%xmm4,%%xmm0 \n"
3099 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003100 "psrlw $0x8,%%xmm2 \n"
3101 "paddusb %%xmm2,%%xmm0 \n"
3102 "pand %%xmm5,%%xmm1 \n"
3103 "paddusb %%xmm1,%%xmm0 \n"
3104 "sub $0x4,%3 \n"
3105 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003106 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003107 "jge 41b \n"
3108
3109 "49: \n"
    // Rebias the counter back to (remaining pixels - 1) for the tail loop.
3110 "add $0x3,%3 \n"
3111 "jl 99f \n"
3112
fbarchard@google.com794fe122012-06-15 01:05:01 +00003113 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003114 "91: \n"
3115 "movd (%0),%%xmm3 \n"
3116 "lea 0x4(%0),%0 \n"
3117 "movdqa %%xmm3,%%xmm0 \n"
3118 "pxor %%xmm4,%%xmm3 \n"
3119 "movd (%1),%%xmm2 \n"
3120 "psrlw $0x8,%%xmm3 \n"
3121 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3122 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3123 "pand %%xmm6,%%xmm2 \n"
3124 "paddw %%xmm7,%%xmm3 \n"
3125 "pmullw %%xmm3,%%xmm2 \n"
3126 "movd (%1),%%xmm1 \n"
3127 "lea 0x4(%1),%1 \n"
3128 "psrlw $0x8,%%xmm1 \n"
3129 "por %%xmm4,%%xmm0 \n"
3130 "pmullw %%xmm3,%%xmm1 \n"
3131 "psrlw $0x8,%%xmm2 \n"
3132 "paddusb %%xmm2,%%xmm0 \n"
3133 "pand %%xmm5,%%xmm1 \n"
3134 "paddusb %%xmm1,%%xmm0 \n"
3135 "sub $0x1,%3 \n"
3136 "movd %%xmm0,(%2) \n"
3137 "lea 0x4(%2),%2 \n"
3138 "jge 91b \n"
3139 "99: \n"
3140 : "+r"(src_argb0), // %0
3141 "+r"(src_argb1), // %1
3142 "+r"(dst_argb), // %2
3143 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003144 :
3145 : "memory", "cc"
3146#if defined(__SSE2__)
3147 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3148#endif
3149 );
3150}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003151#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003152
fbarchard@google.com96af8702012-04-06 18:22:27 +00003153#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003154// Shuffle table for isolating alpha.
// Used as a pshufb control: byte 3 of each 4 byte pixel is copied into the
// low byte of both of that pixel's 16 bit words; the 0x80 entries zero the
// high byte of each word.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003155CONST uvec8 kShuffleAlpha = {
3156 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3157 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3158};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003159
3160// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
3161// Uses the kShuffleAlpha table to spread each pixel's alpha over its words.
3162
3163// Same as SSE2, but replaces
3164// psrlw xmm3, 8 // alpha
3165// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3166// pshuflw xmm3, xmm3,0F5h
3167// with..
3168// pshufb xmm3, kShuffleAlpha // alpha
3169
// Blends a row of 4 byte pixels from src_argb0 over src_argb1 into dst_argb.
// Byte 3 of each src_argb0 pixel is used as alpha. For every byte the result
// is the saturating sum of
//   - the src_argb0 byte (with byte 3 forced to 0xff), and
//   - (src_argb1 byte * (256 - alpha)) >> 8.
// The inverted alpha is replicated with a single pshufb through
// kShuffleAlpha (operand %4).
// Structure: a 1 pixel loop runs until dst_argb is 16 byte aligned; then a
// 4 pixel loop, using aligned loads (label 40) when both sources are 16 byte
// aligned and unaligned loads (label 41) otherwise; then a 1 pixel loop for
// the remaining pixels.
//   src_argb0 - foreground row supplying the alpha.
//   src_argb1 - background row.
//   dst_argb  - destination row.
//   width     - number of pixels; nothing is written when width <= 0.
3170void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3171 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003172 asm volatile (
    // Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
    // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (byte 3 mask).
3173 "pcmpeqb %%xmm7,%%xmm7 \n"
3174 "psrlw $0xf,%%xmm7 \n"
3175 "pcmpeqb %%xmm6,%%xmm6 \n"
3176 "psrlw $0x8,%%xmm6 \n"
3177 "pcmpeqb %%xmm5,%%xmm5 \n"
3178 "psllw $0x8,%%xmm5 \n"
3179 "pcmpeqb %%xmm4,%%xmm4 \n"
3180 "pslld $0x18,%%xmm4 \n"
    // From here the counter holds (remaining pixels - 1): exactly one pixel
    // goes straight to the tail loop, zero or fewer pixels exits.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003181 "sub $0x1,%3 \n"
3182 "je 91f \n"
3183 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003184
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003185 // 1 pixel loop until destination pointer is aligned.
3186 "10: \n"
3187 "test $0xf,%2 \n"
3188 "je 19f \n"
3189 "movd (%0),%%xmm3 \n"
3190 "lea 0x4(%0),%0 \n"
3191 "movdqa %%xmm3,%%xmm0 \n"
    // Invert byte 3 (255 - alpha) and replicate it into both words of the
    // pixel; adding xmm7 below makes the multiplier (256 - alpha).
3192 "pxor %%xmm4,%%xmm3 \n"
3193 "movd (%1),%%xmm2 \n"
3194 "pshufb %4,%%xmm3 \n"
3195 "pand %%xmm6,%%xmm2 \n"
3196 "paddw %%xmm7,%%xmm3 \n"
3197 "pmullw %%xmm3,%%xmm2 \n"
3198 "movd (%1),%%xmm1 \n"
3199 "lea 0x4(%1),%1 \n"
3200 "psrlw $0x8,%%xmm1 \n"
3201 "por %%xmm4,%%xmm0 \n"
3202 "pmullw %%xmm3,%%xmm1 \n"
3203 "psrlw $0x8,%%xmm2 \n"
3204 "paddusb %%xmm2,%%xmm0 \n"
3205 "pand %%xmm5,%%xmm1 \n"
3206 "paddusb %%xmm1,%%xmm0 \n"
3207 "sub $0x1,%3 \n"
3208 "movd %%xmm0,(%2) \n"
3209 "lea 0x4(%2),%2 \n"
3210 "jge 10b \n"
3211
3212 "19: \n"
    // Rebias the counter to (remaining pixels - 4) for the 4 pixel loops.
fbarchard@google.comee220882012-06-14 00:07:56 +00003213 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003214 "jl 49f \n"
    // Use the unaligned loop unless both sources are 16 byte aligned.
fbarchard@google.comf877e712012-08-15 00:51:24 +00003215 "test $0xf,%0 \n"
3216 "jne 41f \n"
3217 "test $0xf,%1 \n"
3218 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003219
fbarchard@google.com794fe122012-06-15 01:05:01 +00003220 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003221 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003222 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003223 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003224 "lea 0x10(%0),%0 \n"
3225 "movdqa %%xmm3,%%xmm0 \n"
3226 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003227 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003228 "pshufb %4,%%xmm3 \n"
3229 "pand %%xmm6,%%xmm2 \n"
3230 "paddw %%xmm7,%%xmm3 \n"
3231 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003232 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003233 "lea 0x10(%1),%1 \n"
3234 "psrlw $0x8,%%xmm1 \n"
3235 "por %%xmm4,%%xmm0 \n"
3236 "pmullw %%xmm3,%%xmm1 \n"
3237 "psrlw $0x8,%%xmm2 \n"
3238 "paddusb %%xmm2,%%xmm0 \n"
3239 "pand %%xmm5,%%xmm1 \n"
3240 "paddusb %%xmm1,%%xmm0 \n"
3241 "sub $0x4,%3 \n"
3242 "movdqa %%xmm0,(%2) \n"
3243 "lea 0x10(%2),%2 \n"
3244 "jge 40b \n"
3245 "jmp 49f \n"
3246
3247 // 4 pixel unaligned loop.
3248 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003249 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003250 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003251 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003252 "movdqa %%xmm3,%%xmm0 \n"
3253 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003254 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003255 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003256 "pand %%xmm6,%%xmm2 \n"
3257 "paddw %%xmm7,%%xmm3 \n"
3258 "pmullw %%xmm3,%%xmm2 \n"
3259 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003260 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003261 "psrlw $0x8,%%xmm1 \n"
3262 "por %%xmm4,%%xmm0 \n"
3263 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003264 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003265 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003266 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003267 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003268 "sub $0x4,%3 \n"
3269 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003270 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003271 "jge 41b \n"
3272
3273 "49: \n"
    // Rebias the counter back to (remaining pixels - 1) for the tail loop.
3274 "add $0x3,%3 \n"
3275 "jl 99f \n"
3276
fbarchard@google.com794fe122012-06-15 01:05:01 +00003277 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003278 "91: \n"
3279 "movd (%0),%%xmm3 \n"
3280 "lea 0x4(%0),%0 \n"
3281 "movdqa %%xmm3,%%xmm0 \n"
3282 "pxor %%xmm4,%%xmm3 \n"
3283 "movd (%1),%%xmm2 \n"
3284 "pshufb %4,%%xmm3 \n"
3285 "pand %%xmm6,%%xmm2 \n"
3286 "paddw %%xmm7,%%xmm3 \n"
3287 "pmullw %%xmm3,%%xmm2 \n"
3288 "movd (%1),%%xmm1 \n"
3289 "lea 0x4(%1),%1 \n"
3290 "psrlw $0x8,%%xmm1 \n"
3291 "por %%xmm4,%%xmm0 \n"
3292 "pmullw %%xmm3,%%xmm1 \n"
3293 "psrlw $0x8,%%xmm2 \n"
3294 "paddusb %%xmm2,%%xmm0 \n"
3295 "pand %%xmm5,%%xmm1 \n"
3296 "paddusb %%xmm1,%%xmm0 \n"
3297 "sub $0x1,%3 \n"
3298 "movd %%xmm0,(%2) \n"
3299 "lea 0x4(%2),%2 \n"
3300 "jge 91b \n"
3301 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003302 : "+r"(src_argb0), // %0
3303 "+r"(src_argb1), // %1
3304 "+r"(dst_argb), // %2
3305 "+r"(width) // %3
3306 : "m"(kShuffleAlpha) // %4
3307 : "memory", "cc"
3308#if defined(__SSE2__)
3309 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3310#endif
3311 );
3312}
3313#endif // HAS_ARGBBLENDROW_SSSE3
3314
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003315#ifdef HAS_ARGBATTENUATE_SSE2
3316// Attenuate 4 pixels at a time.
// Scales B, G and R of every ARGB pixel by that pixel's alpha, computing
// ((c * 0x101) * (a * 0x101)) >> 24 per channel; the alpha byte is copied
// through unchanged.
3317// aligned to 16 bytes
// Both src_argb and dst_argb are accessed with movdqa. width is a pixel
// count; the loop handles 4 pixels per pass and repeats while the remaining
// count is positive, so a width that is not a multiple of 4 is rounded up.
3318void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3319 asm volatile (
 // Turn dst_argb into an offset from src_argb so %0 indexes both rows.
3320 "sub %0,%1 \n"
 // xmm4 = 0xff000000 per pixel (alpha mask).
3321 "pcmpeqb %%xmm4,%%xmm4 \n"
3322 "pslld $0x18,%%xmm4 \n"
 // xmm5 = 0x00ffffff per pixel (color mask).
3323 "pcmpeqb %%xmm5,%%xmm5 \n"
3324 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003325
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003326 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003327 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003328 "1: \n"
 // Pixels 0-1: duplicate each byte into a word, broadcast the alpha word
 // across its pixel, and keep the high half of the 16x16 product.
3329 "movdqa (%0),%%xmm0 \n"
3330 "punpcklbw %%xmm0,%%xmm0 \n"
3331 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3332 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3333 "pmulhuw %%xmm2,%%xmm0 \n"
 // Pixels 2-3: same sequence on the upper 8 bytes.
3334 "movdqa (%0),%%xmm1 \n"
3335 "punpckhbw %%xmm1,%%xmm1 \n"
3336 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3337 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3338 "pmulhuw %%xmm2,%%xmm1 \n"
 // Reload the source to recover the original alpha bytes.
fbarchard@google.com810cd912012-04-20 20:15:27 +00003339 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003340 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003341 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003342 "psrlw $0x8,%%xmm1 \n"
3343 "packuswb %%xmm1,%%xmm0 \n"
 // Drop the computed alpha lane and merge in the source alpha.
3344 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003345 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003346 "sub $0x4,%2 \n"
 // (%0,%1,1) addresses dst_argb; movdqa and lea leave the flags from sub
 // intact for the jg below.
3347 "movdqa %%xmm0,(%0,%1,1) \n"
3348 "lea 0x10(%0),%0 \n"
3349 "jg 1b \n"
3350 : "+r"(src_argb), // %0
3351 "+r"(dst_argb), // %1
3352 "+r"(width) // %2
3353 :
3354 : "memory", "cc"
3355#if defined(__SSE2__)
3356 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3357#endif
3358 );
3359}
3360#endif // HAS_ARGBATTENUATE_SSE2
3361
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003362#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003363// Shuffle table duplicating alpha
// pshufb control masks used by ARGBAttenuateRow_SSSE3. kShuffleAlpha0 copies
// the alpha bytes of pixels 0 and 1 (byte offsets 3 and 7) into both bytes of
// the B, G and R word lanes; 128 has the high bit set, which makes pshufb
// write zero, so the alpha word lane becomes 0.
3364CONST uvec8 kShuffleAlpha0 = {
3365 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3366};
// Same layout for pixels 2 and 3 (alpha at byte offsets 11 and 15).
3367CONST uvec8 kShuffleAlpha1 = {
3368 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3369 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3370};
3371// Attenuate 4 pixels at a time.
// SSSE3 variant: B, G and R of each ARGB pixel are scaled by the pixel's
// alpha as ((c * 0x101) * (a * 0x101)) >> 24; the alpha byte is taken
// unchanged from the source.
3372// aligned to 16 bytes
// src_argb and dst_argb are accessed with movdqa. width is a pixel count;
// 4 pixels are processed per pass while the remaining count is positive.
3373void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3374 asm volatile (
 // Turn dst_argb into an offset from src_argb so %0 indexes both rows.
3375 "sub %0,%1 \n"
 // xmm3 = 0xff000000 per pixel (alpha mask).
3376 "pcmpeqb %%xmm3,%%xmm3 \n"
3377 "pslld $0x18,%%xmm3 \n"
 // xmm4 / xmm5 = alpha-duplicating shuffle masks for pixels 0-1 / 2-3.
3378 "movdqa %3,%%xmm4 \n"
3379 "movdqa %4,%%xmm5 \n"
3380
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003381 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003382 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003383 "1: \n"
 // xmm0 = alpha words for pixels 0-1 (alpha lane zeroed by the mask).
3384 "movdqa (%0),%%xmm0 \n"
3385 "pshufb %%xmm4,%%xmm0 \n"
 // xmm1 = channel bytes of pixels 0-1 duplicated into words.
3386 "movdqa (%0),%%xmm1 \n"
3387 "punpcklbw %%xmm1,%%xmm1 \n"
3388 "pmulhuw %%xmm1,%%xmm0 \n"
 // Same for pixels 2-3, result in xmm1.
3389 "movdqa (%0),%%xmm1 \n"
3390 "pshufb %%xmm5,%%xmm1 \n"
3391 "movdqa (%0),%%xmm2 \n"
3392 "punpckhbw %%xmm2,%%xmm2 \n"
3393 "pmulhuw %%xmm2,%%xmm1 \n"
 // xmm2 = original alpha bytes only.
3394 "movdqa (%0),%%xmm2 \n"
3395 "pand %%xmm3,%%xmm2 \n"
3396 "psrlw $0x8,%%xmm0 \n"
3397 "psrlw $0x8,%%xmm1 \n"
3398 "packuswb %%xmm1,%%xmm0 \n"
 // The computed alpha lane is already zero, so a plain OR restores alpha.
3399 "por %%xmm2,%%xmm0 \n"
3400 "sub $0x4,%2 \n"
 // (%0,%1,1) addresses dst_argb; the flags from sub survive until jg.
3401 "movdqa %%xmm0,(%0,%1,1) \n"
3402 "lea 0x10(%0),%0 \n"
3403 "jg 1b \n"
3404 : "+r"(src_argb), // %0
3405 "+r"(dst_argb), // %1
3406 "+r"(width) // %2
3407 : "m"(kShuffleAlpha0), // %3
3408 "m"(kShuffleAlpha1) // %4
3409 : "memory", "cc"
3410#if defined(__SSE2__)
3411 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3412#endif
3413 );
3414}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003415#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003416
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003417#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003418// Unattenuate 4 pixels at a time.
// For each ARGB pixel the alpha byte indexes fixed_invtbl8 (4 bytes per
// entry); the low 16 bits of the entry multiply B, G and R as
// ((c * 0x101) * entry) >> 16, saturated to 8 bits by packuswb. The alpha
// byte is copied from the source.
// NOTE(review): fixed_invtbl8 is defined outside this chunk; presumably it
// holds fixed-point reciprocals of alpha - confirm against its definition.
3419// aligned to 16 bytes
// src_argb and dst_argb are accessed with movdqa. width is a pixel count;
// 4 pixels are processed per pass while the remaining count is positive.
3420void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3421 int width) {
 // Scratch general-purpose register holding the current alpha table index.
3422 uintptr_t alpha = 0;
3423 asm volatile (
 // Turn dst_argb into an offset from src_argb so %0 indexes both rows.
3424 "sub %0,%1 \n"
 // xmm4 = 0xff000000 per pixel (alpha mask).
3425 "pcmpeqb %%xmm4,%%xmm4 \n"
3426 "pslld $0x18,%%xmm4 \n"
3427
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003428 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003429 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003430 "1: \n"
 // Pixels 0-1: look up the table entry for each alpha (bytes 3 and 7).
3431 "movdqa (%0),%%xmm0 \n"
3432 "movzb 0x3(%0),%3 \n"
3433 "punpcklbw %%xmm0,%%xmm0 \n"
3434 "movd 0x0(%4,%3,4),%%xmm2 \n"
3435 "movzb 0x7(%0),%3 \n"
3436 "movd 0x0(%4,%3,4),%%xmm3 \n"
 // $0xc0 copies word 0 into words 0-2 and leaves word 3 (alpha lane) zero.
3437 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3438 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3439 "movlhps %%xmm3,%%xmm2 \n"
3440 "pmulhuw %%xmm2,%%xmm0 \n"
 // Pixels 2-3: alpha at bytes 11 and 15.
3441 "movdqa (%0),%%xmm1 \n"
3442 "movzb 0xb(%0),%3 \n"
3443 "punpckhbw %%xmm1,%%xmm1 \n"
3444 "movd 0x0(%4,%3,4),%%xmm2 \n"
3445 "movzb 0xf(%0),%3 \n"
3446 "movd 0x0(%4,%3,4),%%xmm3 \n"
3447 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3448 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3449 "movlhps %%xmm3,%%xmm2 \n"
3450 "pmulhuw %%xmm2,%%xmm1 \n"
 // Restore the original alpha bytes on top of the zeroed alpha lane.
3451 "movdqa (%0),%%xmm2 \n"
3452 "pand %%xmm4,%%xmm2 \n"
3453 "packuswb %%xmm1,%%xmm0 \n"
3454 "por %%xmm2,%%xmm0 \n"
3455 "sub $0x4,%2 \n"
 // (%0,%1,1) addresses dst_argb; the flags from sub survive until jg.
3456 "movdqa %%xmm0,(%0,%1,1) \n"
3457 "lea 0x10(%0),%0 \n"
3458 "jg 1b \n"
3459 : "+r"(src_argb), // %0
3460 "+r"(dst_argb), // %1
3461 "+r"(width), // %2
3462 "+r"(alpha) // %3
3463 : "r"(fixed_invtbl8) // %4
3464 : "memory", "cc"
3465#if defined(__SSE2__)
3466 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3467#endif
3468 );
3469}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003470#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003471
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003472#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003473// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// Weights are in B, G, R, A byte order, scaled by 128 (14 + 76 + 38 = 128),
// with a zero weight for alpha; ARGBGrayRow_SSSE3 shifts the sum right by 7.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003474CONST vec8 kARGBToGray = {
3475 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3476};
3477
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003478// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Each output pixel has B = G = R = (14 * B + 76 * G + 38 * R) >> 7 and the
// source alpha. src_argb and dst_argb are accessed with movdqa (16-byte
// alignment required). width is a pixel count; 8 pixels are processed per
// pass while the remaining count is positive.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003479void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003480 asm volatile (
 // xmm4 = gray weights.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003481 "movdqa %3,%%xmm4 \n"
 // Turn dst_argb into an offset from src_argb so %0 indexes both rows.
3482 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003483
3484 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003485 ".p2align 4 \n"
3486 "1: \n"
 // Weighted sum per pixel: pmaddubsw forms B*14+G*76 and R*38+A*0, phaddw
 // adds the two halves; after >>7 and pack, xmm0 low 8 bytes = gray values.
3487 "movdqa (%0),%%xmm0 \n"
3488 "movdqa 0x10(%0),%%xmm1 \n"
3489 "pmaddubsw %%xmm4,%%xmm0 \n"
3490 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003491 "phaddw %%xmm1,%%xmm0 \n"
3492 "psrlw $0x7,%%xmm0 \n"
3493 "packuswb %%xmm0,%%xmm0 \n"
 // Extract the 8 source alpha bytes into the low 8 bytes of xmm2.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003494 "movdqa (%0),%%xmm2 \n"
3495 "movdqa 0x10(%0),%%xmm3 \n"
3496 "psrld $0x18,%%xmm2 \n"
3497 "psrld $0x18,%%xmm3 \n"
3498 "packuswb %%xmm3,%%xmm2 \n"
3499 "packuswb %%xmm2,%%xmm2 \n"
 // Interleave to (gray, gray) and (gray, alpha) byte pairs, then to pixels.
3500 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003501 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003502 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003503 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003504 "punpcklwd %%xmm3,%%xmm0 \n"
3505 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003506 "sub $0x8,%2 \n"
 // (%0,%1,1) addresses dst_argb; the flags from sub survive until jg.
3507 "movdqa %%xmm0,(%0,%1,1) \n"
3508 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003509 "lea 0x20(%0),%0 \n"
3510 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003511 : "+r"(src_argb), // %0
3512 "+r"(dst_argb), // %1
3513 "+r"(width) // %2
3514 : "m"(kARGBToGray) // %3
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003515 : "memory", "cc"
3516#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00003517 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003518#endif
3519 );
3520}
3521#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003522
3523#ifdef HAS_ARGBSEPIAROW_SSSE3
3524// b = (r * 35 + g * 68 + b * 17) >> 7
3525// g = (r * 45 + g * 88 + b * 22) >> 7
3526// r = (r * 50 + g * 98 + b * 24) >> 7
3527// Constant for ARGB color to sepia tone
// Each table lists its weights in B, G, R, A byte order (alpha weight 0),
// repeated for 4 pixels.
3528CONST vec8 kARGBToSepiaB = {
3529 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3530};
3531
3532CONST vec8 kARGBToSepiaG = {
3533 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3534};
3535
3536CONST vec8 kARGBToSepiaR = {
3537 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3538};
3539
fbarchard@google.come442dc42012-06-18 17:37:09 +00003540// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Operates in place on dst_argb (read and written with movdqa, so 16-byte
// alignment is required). New B, G and R are weighted sums of the old
// B, G, R shifted right by 7 and saturated to 255 by packuswb; alpha is
// preserved. width is a pixel count; 8 pixels are processed per pass while
// the remaining count is positive.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003541void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3542 asm volatile (
 // xmm2 / xmm3 / xmm4 = weights producing the new B / G / R.
3543 "movdqa %2,%%xmm2 \n"
3544 "movdqa %3,%%xmm3 \n"
3545 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003546
3547 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003548 ".p2align 4 \n"
3549 "1: \n"
 // New B for 8 pixels -> low 8 bytes of xmm0.
3550 "movdqa (%0),%%xmm0 \n"
3551 "movdqa 0x10(%0),%%xmm6 \n"
3552 "pmaddubsw %%xmm2,%%xmm0 \n"
3553 "pmaddubsw %%xmm2,%%xmm6 \n"
3554 "phaddw %%xmm6,%%xmm0 \n"
3555 "psrlw $0x7,%%xmm0 \n"
3556 "packuswb %%xmm0,%%xmm0 \n"
 // New G for 8 pixels -> xmm5, then interleave into (B, G) byte pairs.
3557 "movdqa (%0),%%xmm5 \n"
3558 "movdqa 0x10(%0),%%xmm1 \n"
3559 "pmaddubsw %%xmm3,%%xmm5 \n"
3560 "pmaddubsw %%xmm3,%%xmm1 \n"
3561 "phaddw %%xmm1,%%xmm5 \n"
3562 "psrlw $0x7,%%xmm5 \n"
3563 "packuswb %%xmm5,%%xmm5 \n"
3564 "punpcklbw %%xmm5,%%xmm0 \n"
 // New R for 8 pixels -> xmm5.
3565 "movdqa (%0),%%xmm5 \n"
3566 "movdqa 0x10(%0),%%xmm1 \n"
3567 "pmaddubsw %%xmm4,%%xmm5 \n"
3568 "pmaddubsw %%xmm4,%%xmm1 \n"
3569 "phaddw %%xmm1,%%xmm5 \n"
3570 "psrlw $0x7,%%xmm5 \n"
3571 "packuswb %%xmm5,%%xmm5 \n"
 // Original alpha for 8 pixels -> xmm6, interleaved into (R, A) pairs.
3572 "movdqa (%0),%%xmm6 \n"
3573 "movdqa 0x10(%0),%%xmm1 \n"
3574 "psrld $0x18,%%xmm6 \n"
3575 "psrld $0x18,%%xmm1 \n"
3576 "packuswb %%xmm1,%%xmm6 \n"
3577 "packuswb %%xmm6,%%xmm6 \n"
3578 "punpcklbw %%xmm6,%%xmm5 \n"
 // Combine (B, G) and (R, A) pairs into full pixels and store in place.
3579 "movdqa %%xmm0,%%xmm1 \n"
3580 "punpcklwd %%xmm5,%%xmm0 \n"
3581 "punpckhwd %%xmm5,%%xmm1 \n"
3582 "sub $0x8,%1 \n"
3583 "movdqa %%xmm0,(%0) \n"
3584 "movdqa %%xmm1,0x10(%0) \n"
3585 "lea 0x20(%0),%0 \n"
3586 "jg 1b \n"
3587 : "+r"(dst_argb), // %0
3588 "+r"(width) // %1
3589 : "m"(kARGBToSepiaB), // %2
3590 "m"(kARGBToSepiaG), // %3
3591 "m"(kARGBToSepiaR) // %4
3592 : "memory", "cc"
3593#if defined(__SSE2__)
3594 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3595#endif
3596 );
3597}
3598#endif // HAS_ARGBSEPIAROW_SSSE3
3599
fbarchard@google.come442dc42012-06-18 17:37:09 +00003600#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3601// Transform 8 ARGB pixels (32 bytes) with color matrix.
3602// Same as Sepia except matrix is provided.
// matrix_argb supplies 12 signed bytes: bytes 0-3 are the weights for the
// new B, bytes 4-7 for the new G and bytes 8-11 for the new R. Sums are
// added with signed saturation, shifted right arithmetically by 7 and
// clamped to 0..255 by packuswb. Alpha is copied from the source pixel.
// Operates in place on dst_argb using movdqa (16-byte alignment required);
// width is a pixel count, 8 pixels per pass while the count is positive.
3603void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3604 int width) {
3605 asm volatile (
 // Load the three 4-byte weight rows and broadcast each to all 4 dwords.
3606 "movd (%2),%%xmm2 \n"
3607 "movd 0x4(%2),%%xmm3 \n"
3608 "movd 0x8(%2),%%xmm4 \n"
3609 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3610 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3611 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003612
3613 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003614 ".p2align 4 \n"
3615 "1: \n"
 // New B (xmm0) and new G (xmm5) for 8 pixels.
3616 "movdqa (%0),%%xmm0 \n"
3617 "movdqa 0x10(%0),%%xmm6 \n"
3618 "pmaddubsw %%xmm2,%%xmm0 \n"
3619 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003620 "movdqa (%0),%%xmm5 \n"
3621 "movdqa 0x10(%0),%%xmm1 \n"
3622 "pmaddubsw %%xmm3,%%xmm5 \n"
3623 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003624 "phaddsw %%xmm6,%%xmm0 \n"
3625 "phaddsw %%xmm1,%%xmm5 \n"
3626 "psraw $0x7,%%xmm0 \n"
3627 "psraw $0x7,%%xmm5 \n"
3628 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003629 "packuswb %%xmm5,%%xmm5 \n"
 // Interleave into (B, G) byte pairs.
3630 "punpcklbw %%xmm5,%%xmm0 \n"
 // New R for 8 pixels -> xmm5.
3631 "movdqa (%0),%%xmm5 \n"
3632 "movdqa 0x10(%0),%%xmm1 \n"
3633 "pmaddubsw %%xmm4,%%xmm5 \n"
3634 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003635 "phaddsw %%xmm1,%%xmm5 \n"
3636 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003637 "packuswb %%xmm5,%%xmm5 \n"
 // Original alpha for 8 pixels -> xmm6.
3638 "movdqa (%0),%%xmm6 \n"
3639 "movdqa 0x10(%0),%%xmm1 \n"
3640 "psrld $0x18,%%xmm6 \n"
3641 "psrld $0x18,%%xmm1 \n"
3642 "packuswb %%xmm1,%%xmm6 \n"
3643 "packuswb %%xmm6,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003644 "movdqa %%xmm0,%%xmm1 \n"
 // (R, A) byte pairs, then merge with (B, G) into full pixels.
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003645 "punpcklbw %%xmm6,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003646 "punpcklwd %%xmm5,%%xmm0 \n"
3647 "punpckhwd %%xmm5,%%xmm1 \n"
3648 "sub $0x8,%1 \n"
3649 "movdqa %%xmm0,(%0) \n"
3650 "movdqa %%xmm1,0x10(%0) \n"
3651 "lea 0x20(%0),%0 \n"
3652 "jg 1b \n"
3653 : "+r"(dst_argb), // %0
3654 "+r"(width) // %1
3655 : "r"(matrix_argb) // %2
3656 : "memory", "cc"
3657#if defined(__SSE2__)
3658 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3659#endif
3660 );
3661}
3662#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3663
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003664#ifdef HAS_ARGBQUANTIZEROW_SSE2
3665// Quantize 4 ARGB pixels (16 bytes).
// In place on dst_argb: each of B, G and R becomes
// ((c * scale) >> 16) * interval_size + interval_offset, saturated to 8 bits
// by packuswb; alpha is copied from the source pixel.
// Only the low 16 bits of scale, interval_size and interval_offset reach the
// color lanes (the shuffles place bits 16-31 in the alpha lane, which is
// overwritten by the original alpha).
3666// aligned to 16 bytes
// dst_argb is accessed with movdqa. width is a pixel count; 4 pixels are
// processed per pass while the remaining count is positive.
3667void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3668 int interval_offset, int width) {
3669 asm volatile (
3670 "movd %2,%%xmm2 \n"
3671 "movd %3,%%xmm3 \n"
3672 "movd %4,%%xmm4 \n"
 // pshuflw $0x40 copies word 0 into words 0-2 and word 1 into word 3;
 // pshufd $0x44 then duplicates that qword for the second pixel.
3673 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3674 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3675 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3676 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3677 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3678 "pshufd $0x44,%%xmm4,%%xmm4 \n"
 // xmm5 = 0 (for zero-extending bytes), xmm6 = 0xff000000 alpha mask.
3679 "pxor %%xmm5,%%xmm5 \n"
3680 "pcmpeqb %%xmm6,%%xmm6 \n"
3681 "pslld $0x18,%%xmm6 \n"
3682
3683 // 4 pixel loop.
3684 ".p2align 2 \n"
3685 "1: \n"
 // Zero-extend bytes to words and multiply by scale, keeping the high half.
3686 "movdqa (%0),%%xmm0 \n"
3687 "punpcklbw %%xmm5,%%xmm0 \n"
3688 "pmulhuw %%xmm2,%%xmm0 \n"
3689 "movdqa (%0),%%xmm1 \n"
3690 "punpckhbw %%xmm5,%%xmm1 \n"
3691 "pmulhuw %%xmm2,%%xmm1 \n"
3692 "pmullw %%xmm3,%%xmm0 \n"
 // xmm7 = original alpha bytes.
3693 "movdqa (%0),%%xmm7 \n"
3694 "pmullw %%xmm3,%%xmm1 \n"
3695 "pand %%xmm6,%%xmm7 \n"
3696 "paddw %%xmm4,%%xmm0 \n"
3697 "paddw %%xmm4,%%xmm1 \n"
3698 "packuswb %%xmm1,%%xmm0 \n"
3699 "por %%xmm7,%%xmm0 \n"
3700 "sub $0x4,%1 \n"
3701 "movdqa %%xmm0,(%0) \n"
3702 "lea 0x10(%0),%0 \n"
3703 "jg 1b \n"
3704 : "+r"(dst_argb), // %0
3705 "+r"(width) // %1
3706 : "r"(scale), // %2
3707 "r"(interval_size), // %3
3708 "r"(interval_offset) // %4
3709 : "memory", "cc"
3710#if defined(__SSE2__)
3711 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3712#endif
3713 );
3714}
3715#endif // HAS_ARGBQUANTIZEROW_SSE2
3716
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003717#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
3718// Creates a table of cumulative sums where each value is a sum of all values
3719// above and to the left of the value, inclusive of the value.
// row holds 4 bytes per pixel; cumsum and previous_cumsum hold 4 int32 per
// pixel. For each pixel the four bytes are added to a running per-channel
// sum for this row, and cumsum[i] = running_sum + previous_cumsum[i].
// width is a pixel count. The 4-pixel path is used only when width >= 4 and
// cumsum is 16-byte aligned; otherwise, and for leftover pixels, the 1-pixel
// path with unaligned accesses is used.
// NOTE(review): the 4-pixel path also reads previous_cumsum with movdqa, but
// only cumsum's alignment is tested - confirm callers keep both aligned.
3720void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
fbarchard@google.com133adc42012-06-12 05:46:49 +00003721 const int32* previous_cumsum, int width) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003722 asm volatile (
 // Turn previous_cumsum into an offset from cumsum: (%1,%2,1) addresses it.
3723 "sub %1,%2 \n"
 // xmm0 = running sum (4 x int32), xmm1 = 0 for zero-extension.
3724 "pxor %%xmm0,%%xmm0 \n"
3725 "pxor %%xmm1,%%xmm1 \n"
3726 "sub $0x4,%3 \n"
3727 "jl 49f \n"
3728 "test $0xf,%1 \n"
3729 "jne 49f \n"
3730
3731 // 4 pixel loop.
3732 ".p2align 2 \n"
3733 "40: \n"
 // Widen 16 bytes to four vectors of 4 x int32: xmm2, xmm3, xmm4, xmm5.
3734 "movdqu (%0),%%xmm2 \n"
3735 "lea 0x10(%0),%0 \n"
3736 "movdqa %%xmm2,%%xmm4 \n"
3737 "punpcklbw %%xmm1,%%xmm2 \n"
3738 "movdqa %%xmm2,%%xmm3 \n"
3739 "punpcklwd %%xmm1,%%xmm2 \n"
3740 "punpckhwd %%xmm1,%%xmm3 \n"
3741 "punpckhbw %%xmm1,%%xmm4 \n"
3742 "movdqa %%xmm4,%%xmm5 \n"
3743 "punpcklwd %%xmm1,%%xmm4 \n"
3744 "punpckhwd %%xmm1,%%xmm5 \n"
 // Accumulate pixel by pixel; each output adds the previous row's entry.
3745 "paddd %%xmm2,%%xmm0 \n"
3746 "movdqa (%1,%2,1),%%xmm2 \n"
3747 "paddd %%xmm0,%%xmm2 \n"
3748 "paddd %%xmm3,%%xmm0 \n"
3749 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
3750 "paddd %%xmm0,%%xmm3 \n"
3751 "paddd %%xmm4,%%xmm0 \n"
3752 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
3753 "paddd %%xmm0,%%xmm4 \n"
3754 "paddd %%xmm5,%%xmm0 \n"
3755 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
3756 "paddd %%xmm0,%%xmm5 \n"
3757 "movdqa %%xmm2,(%1) \n"
3758 "movdqa %%xmm3,0x10(%1) \n"
3759 "movdqa %%xmm4,0x20(%1) \n"
3760 "movdqa %%xmm5,0x30(%1) \n"
3761 "lea 0x40(%1),%1 \n"
3762 "sub $0x4,%3 \n"
3763 "jge 40b \n"
3764
3765 "49: \n"
 // Undo the bias: %3 is now (remaining pixels - 1); negative means done.
3766 "add $0x3,%3 \n"
3767 "jl 19f \n"
3768
3769 // 1 pixel loop.
3770 ".p2align 2 \n"
3771 "10: \n"
3772 "movd (%0),%%xmm2 \n"
3773 "lea 0x4(%0),%0 \n"
fbarchard@google.comf38aefe2012-06-12 00:29:29 +00003774 "punpcklbw %%xmm1,%%xmm2 \n"
3775 "punpcklwd %%xmm1,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003776 "paddd %%xmm2,%%xmm0 \n"
3777 "movdqu (%1,%2,1),%%xmm2 \n"
3778 "paddd %%xmm0,%%xmm2 \n"
3779 "movdqu %%xmm2,(%1) \n"
3780 "lea 0x10(%1),%1 \n"
3781 "sub $0x1,%3 \n"
3782 "jge 10b \n"
3783
3784 "19: \n"
3785 : "+r"(row), // %0
3786 "+r"(cumsum), // %1
3787 "+r"(previous_cumsum), // %2
3788 "+r"(width) // %3
3789 :
3790 : "memory", "cc"
3791#if defined(__SSE2__)
3792 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3793#endif
3794 );
3795}
3796#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
3797
3798#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts box sums taken from a cumulative-sum table into averaged pixels.
// The table holds 4 int32 per pixel. For each of count output pixels the
// per-channel box sum is
//   topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// where width is an offset in int32 elements. The sum is multiplied by an
// approximate reciprocal of area (rcpss), converted back to integer and
// packed with saturation into 4 bytes written to dst.
// topleft, botleft and the width-offset addresses are used as aligned SSE
// memory operands (movdqa / psubd / paddd), so they must be 16-byte aligned;
// dst is written with movdqu / movd.
3799void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3800 int width, int area, uint8* dst, int count) {
3801 asm volatile (
 // xmm4 = approximate 1.0f / area in all four lanes.
3802 "movd %5,%%xmm4 \n"
3803 "cvtdq2ps %%xmm4,%%xmm4 \n"
3804 "rcpss %%xmm4,%%xmm4 \n"
3805 "pshufd $0x0,%%xmm4,%%xmm4 \n"
3806 "sub $0x4,%3 \n"
3807 "jl 49f \n"
3808
3809 // 4 pixel loop.
3810 ".p2align 2 \n"
3811 "40: \n"
3812 "movdqa (%0),%%xmm0 \n"
3813 "movdqa 0x10(%0),%%xmm1 \n"
3814 "movdqa 0x20(%0),%%xmm2 \n"
3815 "movdqa 0x30(%0),%%xmm3 \n"
 // (%0,%4,4) = topleft + width int32 elements.
3816 "psubd (%0,%4,4),%%xmm0 \n"
3817 "psubd 0x10(%0,%4,4),%%xmm1 \n"
3818 "psubd 0x20(%0,%4,4),%%xmm2 \n"
3819 "psubd 0x30(%0,%4,4),%%xmm3 \n"
3820 "lea 0x40(%0),%0 \n"
3821 "psubd (%1),%%xmm0 \n"
3822 "psubd 0x10(%1),%%xmm1 \n"
3823 "psubd 0x20(%1),%%xmm2 \n"
3824 "psubd 0x30(%1),%%xmm3 \n"
3825 "paddd (%1,%4,4),%%xmm0 \n"
3826 "paddd 0x10(%1,%4,4),%%xmm1 \n"
3827 "paddd 0x20(%1,%4,4),%%xmm2 \n"
3828 "paddd 0x30(%1,%4,4),%%xmm3 \n"
3829 "lea 0x40(%1),%1 \n"
 // Scale the sums by 1/area in float, then convert back to int32.
3830 "cvtdq2ps %%xmm0,%%xmm0 \n"
3831 "cvtdq2ps %%xmm1,%%xmm1 \n"
3832 "mulps %%xmm4,%%xmm0 \n"
3833 "mulps %%xmm4,%%xmm1 \n"
3834 "cvtdq2ps %%xmm2,%%xmm2 \n"
3835 "cvtdq2ps %%xmm3,%%xmm3 \n"
3836 "mulps %%xmm4,%%xmm2 \n"
3837 "mulps %%xmm4,%%xmm3 \n"
3838 "cvtps2dq %%xmm0,%%xmm0 \n"
3839 "cvtps2dq %%xmm1,%%xmm1 \n"
3840 "cvtps2dq %%xmm2,%%xmm2 \n"
3841 "cvtps2dq %%xmm3,%%xmm3 \n"
 // Narrow 16 x int32 to 16 bytes with saturation.
3842 "packssdw %%xmm1,%%xmm0 \n"
3843 "packssdw %%xmm3,%%xmm2 \n"
3844 "packuswb %%xmm2,%%xmm0 \n"
3845 "movdqu %%xmm0,(%2) \n"
3846 "lea 0x10(%2),%2 \n"
3847 "sub $0x4,%3 \n"
3848 "jge 40b \n"
3849
3850 "49: \n"
 // Undo the bias: %3 is now (remaining pixels - 1); negative means done.
3851 "add $0x3,%3 \n"
3852 "jl 19f \n"
3853
3854 // 1 pixel loop.
3855 ".p2align 2 \n"
3856 "10: \n"
3857 "movdqa (%0),%%xmm0 \n"
3858 "psubd (%0,%4,4),%%xmm0 \n"
3859 "lea 0x10(%0),%0 \n"
3860 "psubd (%1),%%xmm0 \n"
3861 "paddd (%1,%4,4),%%xmm0 \n"
3862 "lea 0x10(%1),%1 \n"
3863 "cvtdq2ps %%xmm0,%%xmm0 \n"
3864 "mulps %%xmm4,%%xmm0 \n"
3865 "cvtps2dq %%xmm0,%%xmm0 \n"
3866 "packssdw %%xmm0,%%xmm0 \n"
3867 "packuswb %%xmm0,%%xmm0 \n"
3868 "movd %%xmm0,(%2) \n"
3869 "lea 0x4(%2),%2 \n"
3870 "sub $0x1,%3 \n"
3871 "jge 10b \n"
3872 "19: \n"
3873 : "+r"(topleft), // %0
3874 "+r"(botleft), // %1
3875 "+r"(dst), // %2
3876 "+rm"(count) // %3
3877 : "r"(static_cast<intptr_t>(width)), // %4
3878 "rm"(area) // %5
3879 : "memory", "cc"
3880#if defined(__SSE2__)
3881 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3882#endif
3883 );
3884}
3885#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
fbarchard@google.com6398e1d2012-07-11 19:12:32 +00003886#ifdef HAS_ARGBSHADE_SSE2
3887// Shade 4 pixels at a time by specified value.
// value is treated as four bytes, one multiplier per byte position of a
// pixel. Every channel, alpha included, becomes
// ((c * 0x101) * (v * 0x101)) >> 24 where v is the matching byte of value.
3888// Aligned to 16 bytes.
// src_argb and dst_argb are accessed with movdqa. width is a pixel count;
// 4 pixels are processed per pass while the remaining count is positive.
3889void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3890 uint32 value) {
3891 asm volatile (
3892 "movd %3,%%xmm2 \n"
 // Turn dst_argb into an offset from src_argb so %0 indexes both rows.
3893 "sub %0,%1 \n"
 // Duplicate each byte of value into a word, then repeat for 2 pixels.
3894 "punpcklbw %%xmm2,%%xmm2 \n"
3895 "punpcklqdq %%xmm2,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003896
fbarchard@google.com6398e1d2012-07-11 19:12:32 +00003897 // 4 pixel loop.
3898 ".p2align 2 \n"
3899 "1: \n"
3900 "movdqa (%0),%%xmm0 \n"
3901 "movdqa %%xmm0,%%xmm1 \n"
3902 "punpcklbw %%xmm0,%%xmm0 \n"
3903 "punpckhbw %%xmm1,%%xmm1 \n"
3904 "pmulhuw %%xmm2,%%xmm0 \n"
3905 "pmulhuw %%xmm2,%%xmm1 \n"
3906 "psrlw $0x8,%%xmm0 \n"
3907 "psrlw $0x8,%%xmm1 \n"
3908 "packuswb %%xmm1,%%xmm0 \n"
3909 "sub $0x4,%2 \n"
 // (%0,%1,1) addresses dst_argb; the flags from sub survive until jg.
3910 "movdqa %%xmm0,(%0,%1,1) \n"
3911 "lea 0x10(%0),%0 \n"
3912 "jg 1b \n"
3913 : "+r"(src_argb), // %0
3914 "+r"(dst_argb), // %1
3915 "+r"(width) // %2
3916 : "r"(value) // %3
3917 : "memory", "cc"
3918#if defined(__SSE2__)
3919 , "xmm0", "xmm1", "xmm2"
3920#endif
3921 );
3922}
3923#endif // HAS_ARGBSHADE_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003924
fbarchard@google.com73444402012-08-09 17:33:29 +00003925#ifdef HAS_ARGBAFFINEROW_SSE2
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003926// TODO(fbarchard): Find 64 bit way to avoid masking.
3927// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
fbarchard@google.com73444402012-08-09 17:33:29 +00003928// Copy ARGB pixels from source image with slope to a row of destination.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00003929// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003930// an error if movq is used. movd %%xmm0,%1
//
// uv_dudv points at 4 floats: the starting source coordinate (u, v) followed
// by the per-pixel step (du, dv). For each of width destination pixels the
// coordinate is truncated to integers (x, y), the 4-byte pixel at
// src_argb + x * 4 + y * src_argb_stride is copied to dst_argb, and the
// coordinate advances by (du, dv).
// x, y are packed to signed 16-bit and combined with pmaddwd against the
// word pair (4, stride), so coordinates and stride must fit in 16 bits.
// NOTE(review): on x86_64 each byte offset is masked with 0x0fffffff, i.e.
// only the low 28 bits are kept - confirm larger offsets cannot occur.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00003931
fbarchard@google.comfc7314e2012-09-27 02:17:51 +00003932LIBYUV_API
fbarchard@google.com73444402012-08-09 17:33:29 +00003933void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
3934 uint8* dst_argb, const float* uv_dudv, int width) {
 // Pointer-sized copy of the stride; the asm reuses this register as a
 // scratch offset after building the (4, stride) multiplier.
3935 intptr_t src_argb_stride_temp = src_argb_stride;
 // Second scratch offset register.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003936 intptr_t temp = 0;
fbarchard@google.com73444402012-08-09 17:33:29 +00003937 asm volatile (
 // xmm2 = (u, v), xmm7 = (du, dv).
3938 "movq (%3),%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003939 "movq 0x8(%3),%%xmm7 \n"
 // xmm5 low dword = (stride << 16) | 4, i.e. the word pair (4, stride).
fbarchard@google.com73444402012-08-09 17:33:29 +00003940 "shl $0x10,%1 \n"
3941 "add $0x4,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003942 "movd %1,%%xmm5 \n"
3943 "sub $0x4,%4 \n"
3944 "jl 49f \n"
3945
 // Set up for 4 pixels: xmm2 = coords of pixels 0-1, xmm3 = pixels 2-3,
 // xmm4 = 4 * (du, dv) repeated, xmm7 = (du, dv) repeated.
3946 "pshufd $0x44,%%xmm7,%%xmm7 \n"
3947 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003948 "movdqa %%xmm2,%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003949 "addps %%xmm7,%%xmm0 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003950 "movlhps %%xmm0,%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003951 "movdqa %%xmm7,%%xmm4 \n"
3952 "addps %%xmm4,%%xmm4 \n"
3953 "movdqa %%xmm2,%%xmm3 \n"
3954 "addps %%xmm4,%%xmm3 \n"
3955 "addps %%xmm4,%%xmm4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003956
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003957 // 4 pixel loop.
3958 ".p2align 4 \n"
3959 "40: \n"
 // Truncate to integer coords and compute 4 byte offsets in xmm0.
3960 "cvttps2dq %%xmm2,%%xmm0 \n"
3961 "cvttps2dq %%xmm3,%%xmm1 \n"
3962 "packssdw %%xmm1,%%xmm0 \n"
3963 "pmaddwd %%xmm5,%%xmm0 \n"
3964#if defined(__x86_64__)
 // One 64-bit move fetches two offsets; split into %1 (low) and %5 (high).
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00003965 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003966 "mov %1,%5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003967 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003968 "shr $32,%5 \n"
3969 "pshufd $0xEE,%%xmm0,%%xmm0 \n"
3970#else
3971 "movd %%xmm0,%1 \n"
3972 "pshufd $0x39,%%xmm0,%%xmm0 \n"
3973 "movd %%xmm0,%5 \n"
3974 "pshufd $0x39,%%xmm0,%%xmm0 \n"
3975#endif
 // Gather pixels 0 and 1 and store them as 8 bytes.
3976 "movd (%0,%1,1),%%xmm1 \n"
3977 "movd (%0,%5,1),%%xmm6 \n"
3978 "punpckldq %%xmm6,%%xmm1 \n"
3979 "addps %%xmm4,%%xmm2 \n"
3980 "movq %%xmm1,(%2) \n"
3981#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00003982 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003983 "mov %1,%5 \n"
3984 "and $0x0fffffff,%1 \n"
3985 "shr $32,%5 \n"
3986#else
3987 "movd %%xmm0,%1 \n"
3988 "pshufd $0x39,%%xmm0,%%xmm0 \n"
3989 "movd %%xmm0,%5 \n"
3990#endif
 // Gather pixels 2 and 3 and store them as the next 8 bytes.
fbarchard@google.com73444402012-08-09 17:33:29 +00003991 "movd (%0,%1,1),%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00003992 "movd (%0,%5,1),%%xmm6 \n"
3993 "punpckldq %%xmm6,%%xmm0 \n"
3994 "addps %%xmm4,%%xmm3 \n"
3995 "sub $0x4,%4 \n"
3996 "movq %%xmm0,0x08(%2) \n"
3997 "lea 0x10(%2),%2 \n"
3998 "jge 40b \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00003999
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004000 "49: \n"
 // Undo the bias: %4 is now (remaining pixels - 1); negative means done.
4001 "add $0x3,%4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004002 "jl 19f \n"
4003
4004 // 1 pixel loop.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004005 ".p2align 4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004006 "10: \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004007 "cvttps2dq %%xmm2,%%xmm0 \n"
4008 "packssdw %%xmm0,%%xmm0 \n"
4009 "pmaddwd %%xmm5,%%xmm0 \n"
4010 "addps %%xmm7,%%xmm2 \n"
4011 "movd %%xmm0,%1 \n"
4012#if defined(__x86_64__)
fbarchard@google.com73444402012-08-09 17:33:29 +00004013 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004014#endif
fbarchard@google.com73444402012-08-09 17:33:29 +00004015 "movd (%0,%1,1),%%xmm0 \n"
4016 "sub $0x1,%4 \n"
4017 "movd %%xmm0,(%2) \n"
4018 "lea 0x4(%2),%2 \n"
4019 "jge 10b \n"
4020 "19: \n"
4021 : "+r"(src_argb), // %0
4022 "+r"(src_argb_stride_temp), // %1
4023 "+r"(dst_argb), // %2
4024 "+r"(uv_dudv), // %3
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004025 "+rm"(width), // %4
4026 "+r"(temp) // %5
fbarchard@google.com73444402012-08-09 17:33:29 +00004027 :
4028 : "memory", "cc"
4029#if defined(__SSE2__)
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004030 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
fbarchard@google.com73444402012-08-09 17:33:29 +00004031#endif
4032 );
4033}
4034#endif // HAS_ARGBAFFINEROW_SSE2
4035
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004036// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
// Blends the row at src_ptr with the row at src_ptr + src_stride into
// dst_ptr. source_y_fraction is halved to f; three paths follow:
//   f == 0    : copy the first row.
//   f == 0x40 : pavgb (rounded average) of the two rows.
//   otherwise : (row0 * (128 - f) + row1 * f) >> 7 per byte.
// All row accesses use movdqa / aligned memory operands (16-byte alignment
// required). dst_width is a pixel count; 4 pixels (16 bytes) are processed
// per pass while the remaining count is positive.
4037void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4038 ptrdiff_t src_stride, int dst_width,
4039 int source_y_fraction) {
4040 asm volatile (
 // Turn dst_ptr into an offset from src_ptr: (%1,%0,1) addresses dst.
4041 "sub %1,%0 \n"
4042 "shr %3 \n"
4043 "cmp $0x0,%3 \n"
4044 "je 2f \n"
4045 "cmp $0x40,%3 \n"
4046 "je 3f \n"
 // xmm5 = byte pairs (128 - f, f) repeated across the register.
4047 "movd %3,%%xmm0 \n"
4048 "neg %3 \n"
4049 "add $0x80,%3 \n"
4050 "movd %3,%%xmm5 \n"
4051 "punpcklbw %%xmm0,%%xmm5 \n"
4052 "punpcklwd %%xmm5,%%xmm5 \n"
4053 "pshufd $0x0,%%xmm5,%%xmm5 \n"
 // General blend loop.
4054 ".p2align 4 \n"
4055 "1: \n"
4056 "movdqa (%1),%%xmm0 \n"
4057 "movdqa (%1,%4,1),%%xmm2 \n"
4058 "movdqa %%xmm0,%%xmm1 \n"
 // Interleave row0/row1 bytes so pmaddubsw applies both weights at once.
4059 "punpcklbw %%xmm2,%%xmm0 \n"
4060 "punpckhbw %%xmm2,%%xmm1 \n"
4061 "pmaddubsw %%xmm5,%%xmm0 \n"
4062 "pmaddubsw %%xmm5,%%xmm1 \n"
4063 "psrlw $0x7,%%xmm0 \n"
4064 "psrlw $0x7,%%xmm1 \n"
4065 "packuswb %%xmm1,%%xmm0 \n"
4066 "sub $0x4,%2 \n"
4067 "movdqa %%xmm0,(%1,%0,1) \n"
4068 "lea 0x10(%1),%1 \n"
4069 "jg 1b \n"
4070 "jmp 4f \n"
 // Copy loop (f == 0).
4071 ".p2align 4 \n"
4072 "2: \n"
4073 "movdqa (%1),%%xmm0 \n"
4074 "sub $0x4,%2 \n"
4075 "movdqa %%xmm0,(%1,%0,1) \n"
4076 "lea 0x10(%1),%1 \n"
4077 "jg 2b \n"
4078 "jmp 4f \n"
 // Average loop (f == 0x40).
4079 ".p2align 4 \n"
4080 "3: \n"
4081 "movdqa (%1),%%xmm0 \n"
4082 "pavgb (%1,%4,1),%%xmm0 \n"
4083 "sub $0x4,%2 \n"
4084 "movdqa %%xmm0,(%1,%0,1) \n"
4085 "lea 0x10(%1),%1 \n"
4086 "jg 3b \n"
4087 "4: \n"
4088 ".p2align 4 \n"
4089 : "+r"(dst_ptr), // %0
4090 "+r"(src_ptr), // %1
4091 "+r"(dst_width), // %2
4092 "+r"(source_y_fraction) // %3
4093 : "r"(static_cast<intptr_t>(src_stride)) // %4
4094 : "memory", "cc"
4095#if defined(__SSE2__)
4096 , "xmm0", "xmm1", "xmm2", "xmm5"
4097#endif
4098 );
4099}
4100
// Averages two rows byte-wise: dst_uv[i] = pavgb(src_uv[i],
// src_uv[i + src_uv_stride]), i.e. (a + b + 1) >> 1.
// pix is a byte count; 16 bytes are processed per pass while the remaining
// count is positive. Both source rows and dst_uv are used as aligned SSE
// operands, so they must be 16-byte aligned.
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004101void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4102 uint8* dst_uv, int pix) {
4103 asm volatile (
 // Turn dst_uv into an offset from src_uv: (%0,%1) addresses dst.
4104 "sub %0,%1 \n"
4105 ".p2align 4 \n"
4106 "1: \n"
4107 "movdqa (%0),%%xmm0 \n"
4108 "pavgb (%0,%3),%%xmm0 \n"
4109 "sub $0x10,%2 \n"
4110 "movdqa %%xmm0,(%0,%1) \n"
4111 "lea 0x10(%0),%0 \n"
4112 "jg 1b \n"
4113 : "+r"(src_uv), // %0
4114 "+r"(dst_uv), // %1
4115 "+r"(pix) // %2
4116 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4117 : "memory", "cc"
4118#if defined(__SSE2__)
4119 , "xmm0"
4120#endif
4121 );
4122}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004123
4124void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4125 uint32 selector, int pix) {
4126 asm volatile (
4127 "movd %3,%%xmm5 \n"
4128 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4129 ".p2align 4 \n"
4130 "1: \n"
4131 "movdqa (%0),%%xmm0 \n"
4132 "lea 0x10(%0),%0 \n"
4133 "pshufb %%xmm5,%%xmm0 \n"
4134 "sub $0x4,%2 \n"
4135 "movd %%xmm0,(%1) \n"
4136 "lea 0x4(%1),%1 \n"
4137 "jg 1b \n"
4138 : "+r"(src_argb), // %0
4139 "+r"(dst_bayer), // %1
4140 "+r"(pix) // %2
4141 : "g"(selector) // %3
4142 : "memory", "cc"
4143#if defined(__SSE2__)
4144 , "xmm0", "xmm5"
4145#endif
4146 );
4147}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004148
// Packs planar Y, U, V rows into interleaved Y0 U0 Y1 V0 byte order.
// Each pass reads 16 Y, 8 U and 8 V bytes and writes 32 bytes to dst_frame.
// width is a count of Y samples; 16 are processed per pass while the
// remaining count is positive. src_y and dst_frame are accessed with movdqa
// (16-byte alignment required); U and V are read with movq.
4149void I422ToYUY2Row_SSE2(const uint8* src_y,
4150 const uint8* src_u,
4151 const uint8* src_v,
4152 uint8* dst_frame, int width) {
4153 asm volatile (
 // Turn src_v into an offset from src_u: (%1,%2,1) addresses V.
4154 "sub %1,%2 \n"
4155 ".p2align 4 \n"
4156 "1: \n"
 // xmm2 = U0 V0 U1 V1 ... (8 pairs).
4157 "movq (%1),%%xmm2 \n"
4158 "movq (%1,%2,1),%%xmm3 \n"
4159 "lea 0x8(%1),%1 \n"
4160 "punpcklbw %%xmm3,%%xmm2 \n"
4161 "movdqa (%0),%%xmm0 \n"
4162 "lea 0x10(%0),%0 \n"
4163 "movdqa %%xmm0,%%xmm1 \n"
 // Interleave Y (first) with the UV pairs.
4164 "punpcklbw %%xmm2,%%xmm0 \n"
4165 "punpckhbw %%xmm2,%%xmm1 \n"
4166 "movdqa %%xmm0,(%3) \n"
4167 "movdqa %%xmm1,0x10(%3) \n"
4168 "lea 0x20(%3),%3 \n"
4169 "sub $0x10,%4 \n"
4170 "jg 1b \n"
4171 : "+r"(src_y), // %0
4172 "+r"(src_u), // %1
4173 "+r"(src_v), // %2
4174 "+r"(dst_frame), // %3
4175 "+rm"(width) // %4
4176 :
4177 : "memory", "cc"
4178#if defined(__SSE2__)
4179 , "xmm0", "xmm1", "xmm2", "xmm3"
4180#endif
4181 );
4182}
4183
// Packs planar Y, U, V rows into interleaved U0 Y0 V0 Y1 byte order.
// Each pass reads 16 Y, 8 U and 8 V bytes and writes 32 bytes to dst_frame.
// width is a count of Y samples; 16 are processed per pass while the
// remaining count is positive. src_y and dst_frame are accessed with movdqa
// (16-byte alignment required); U and V are read with movq.
4184void I422ToUYVYRow_SSE2(const uint8* src_y,
4185 const uint8* src_u,
4186 const uint8* src_v,
4187 uint8* dst_frame, int width) {
4188 asm volatile (
 // Turn src_v into an offset from src_u: (%1,%2,1) addresses V.
4189 "sub %1,%2 \n"
4190 ".p2align 4 \n"
4191 "1: \n"
 // xmm2 = U0 V0 U1 V1 ... (8 pairs).
4192 "movq (%1),%%xmm2 \n"
4193 "movq (%1,%2,1),%%xmm3 \n"
4194 "lea 0x8(%1),%1 \n"
4195 "punpcklbw %%xmm3,%%xmm2 \n"
4196 "movdqa (%0),%%xmm0 \n"
4197 "movdqa %%xmm2,%%xmm1 \n"
4198 "lea 0x10(%0),%0 \n"
 // Interleave the UV pairs (first) with Y.
4199 "punpcklbw %%xmm0,%%xmm1 \n"
4200 "punpckhbw %%xmm0,%%xmm2 \n"
4201 "movdqa %%xmm1,(%3) \n"
4202 "movdqa %%xmm2,0x10(%3) \n"
4203 "lea 0x20(%3),%3 \n"
4204 "sub $0x10,%4 \n"
4205 "jg 1b \n"
4206 : "+r"(src_y), // %0
4207 "+r"(src_u), // %1
4208 "+r"(src_v), // %2
4209 "+r"(dst_frame), // %3
4210 "+rm"(width) // %4
4211 :
4212 : "memory", "cc"
4213#if defined(__SSE2__)
4214 , "xmm0", "xmm1", "xmm2", "xmm3"
4215#endif
4216 );
4217}
4218
fbarchard@google.com2d11d432012-02-16 02:50:39 +00004219#endif // defined(__x86_64__) || defined(__i386__)
4220
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004221#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00004222} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004223} // namespace libyuv
4224#endif