blob: 4c11d4fc21ce97ec6ece968dafeb77ff510d095f [file] [log] [blame]
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// CONST is the storage qualifier used for every constant table in this file.
// GCC 4.2 on OSX has link error when passing static or const to inline.
// On Apple targets the tables are therefore plain (non-static, non-const)
// globals; everywhere else they are static const.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
fbarchard@google.com4de0c432012-10-11 01:25:46 +000072// Constants for RGBA.
73CONST vec8 kRGBAToY = {
74 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
75};
76
77CONST vec8 kRGBAToU = {
78 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
79};
80
81CONST vec8 kRGBAToV = {
82 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
83};
84
fbarchard@google.com714050a2012-02-17 22:59:56 +000085CONST uvec8 kAddY16 = {
86 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000087};
fbarchard@google.com2430e042011-11-11 21:57:06 +000088
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000089CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
92};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000093
fbarchard@google.comba1f5262012-01-12 19:22:41 +000094// Shuffle table for converting RGB24 to ARGB.
95CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000096 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
97};
98
99// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000100CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000101 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
102};
103
fbarchard@google.comb6149762011-11-07 21:58:52 +0000104// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000105CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000106 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
107};
108
109// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000110CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
112};
113
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000114// Shuffle table for converting RGBA to ARGB.
115CONST uvec8 kShuffleMaskRGBAToARGB = {
116 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
117};
118
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000119// Shuffle table for converting ARGB to RGBA.
120CONST uvec8 kShuffleMaskARGBToRGBA = {
121 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
122};
123
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000124// Shuffle table for converting ARGB to RGB24.
125CONST uvec8 kShuffleMaskARGBToRGB24 = {
126 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
127};
128
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000129// Shuffle table for converting ARGB to RAW.
130CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000131 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000132};
133
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000134// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000135CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
137};
138
139// Shuffle table for converting ARGB to RAW.
140CONST uvec8 kShuffleMaskARGBToRAW_0 = {
141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142};
143
fbarchard@google.comb6149762011-11-07 21:58:52 +0000144void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000145 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pcmpeqb %%xmm5,%%xmm5 \n"
147 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000148 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "1: \n"
150 "movq (%0),%%xmm0 \n"
151 "lea 0x8(%0),%0 \n"
152 "punpcklbw %%xmm0,%%xmm0 \n"
153 "movdqa %%xmm0,%%xmm1 \n"
154 "punpcklwd %%xmm0,%%xmm0 \n"
155 "punpckhwd %%xmm1,%%xmm1 \n"
156 "por %%xmm5,%%xmm0 \n"
157 "por %%xmm5,%%xmm1 \n"
158 "movdqa %%xmm0,(%1) \n"
159 "movdqa %%xmm1,0x10(%1) \n"
160 "lea 0x20(%1),%1 \n"
161 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000162 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163 : "+r"(src_y), // %0
164 "+r"(dst_argb), // %1
165 "+r"(pix) // %2
166 :
167 : "memory", "cc"
168#if defined(__SSE2__)
169 , "xmm0", "xmm1", "xmm5"
170#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000171 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000172}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173
174void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000175 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000176 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000177 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000178 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000179 "1: \n"
180 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000181 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000182 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000183 "movdqa %%xmm0,(%0,%1,1) \n"
184 "lea 0x10(%0),%0 \n"
185 "jg 1b \n"
186
fbarchard@google.comb6149762011-11-07 21:58:52 +0000187 : "+r"(src_abgr), // %0
188 "+r"(dst_argb), // %1
189 "+r"(pix) // %2
190 : "m"(kShuffleMaskABGRToARGB) // %3
191 : "memory", "cc"
192#if defined(__SSE2__)
193 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000194#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000195 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000196}
197
198void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000199 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000200 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000201 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000202 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000203 "1: \n"
204 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000205 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000206 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000207 "movdqa %%xmm0,(%0,%1,1) \n"
208 "lea 0x10(%0),%0 \n"
209 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000210 : "+r"(src_bgra), // %0
211 "+r"(dst_argb), // %1
212 "+r"(pix) // %2
213 : "m"(kShuffleMaskBGRAToARGB) // %3
214 : "memory", "cc"
215#if defined(__SSE2__)
216 , "xmm0", "xmm5"
217#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000218 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000219}
220
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000221void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
222 asm volatile (
223 "movdqa %3,%%xmm5 \n"
224 "sub %0,%1 \n"
225 ".p2align 4 \n"
226 "1: \n"
227 "movdqa (%0),%%xmm0 \n"
228 "pshufb %%xmm5,%%xmm0 \n"
229 "sub $0x4,%2 \n"
230 "movdqa %%xmm0,(%0,%1,1) \n"
231 "lea 0x10(%0),%0 \n"
232 "jg 1b \n"
233
234 : "+r"(src_rgba), // %0
235 "+r"(dst_argb), // %1
236 "+r"(pix) // %2
237 : "m"(kShuffleMaskRGBAToARGB) // %3
238 : "memory", "cc"
239#if defined(__SSE2__)
240 , "xmm0", "xmm5"
241#endif
242 );
243}
244
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000245void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
246 asm volatile (
247 "movdqa %3,%%xmm5 \n"
248 "sub %0,%1 \n"
249 ".p2align 4 \n"
250 "1: \n"
251 "movdqa (%0),%%xmm0 \n"
252 "pshufb %%xmm5,%%xmm0 \n"
253 "sub $0x4,%2 \n"
254 "movdqa %%xmm0,(%0,%1,1) \n"
255 "lea 0x10(%0),%0 \n"
256 "jg 1b \n"
257
258 : "+r"(src_argb), // %0
259 "+r"(dst_rgba), // %1
260 "+r"(pix) // %2
261 : "m"(kShuffleMaskARGBToRGBA) // %3
262 : "memory", "cc"
263#if defined(__SSE2__)
264 , "xmm0", "xmm5"
265#endif
266 );
267}
268
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000269void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000270 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000271 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
272 "pslld $0x18,%%xmm5 \n"
273 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000274 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000275 "1: \n"
276 "movdqu (%0),%%xmm0 \n"
277 "movdqu 0x10(%0),%%xmm1 \n"
278 "movdqu 0x20(%0),%%xmm3 \n"
279 "lea 0x30(%0),%0 \n"
280 "movdqa %%xmm3,%%xmm2 \n"
281 "palignr $0x8,%%xmm1,%%xmm2 \n"
282 "pshufb %%xmm4,%%xmm2 \n"
283 "por %%xmm5,%%xmm2 \n"
284 "palignr $0xc,%%xmm0,%%xmm1 \n"
285 "pshufb %%xmm4,%%xmm0 \n"
286 "movdqa %%xmm2,0x20(%1) \n"
287 "por %%xmm5,%%xmm0 \n"
288 "pshufb %%xmm4,%%xmm1 \n"
289 "movdqa %%xmm0,(%1) \n"
290 "por %%xmm5,%%xmm1 \n"
291 "palignr $0x4,%%xmm3,%%xmm3 \n"
292 "pshufb %%xmm4,%%xmm3 \n"
293 "movdqa %%xmm1,0x10(%1) \n"
294 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000295 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000296 "movdqa %%xmm3,0x30(%1) \n"
297 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000298 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000299 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000300 "+r"(dst_argb), // %1
301 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000302 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000303 : "memory", "cc"
304#if defined(__SSE2__)
305 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
306#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000307 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000308}
309
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000310void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000311 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000312 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
313 "pslld $0x18,%%xmm5 \n"
314 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000315 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000316 "1: \n"
317 "movdqu (%0),%%xmm0 \n"
318 "movdqu 0x10(%0),%%xmm1 \n"
319 "movdqu 0x20(%0),%%xmm3 \n"
320 "lea 0x30(%0),%0 \n"
321 "movdqa %%xmm3,%%xmm2 \n"
322 "palignr $0x8,%%xmm1,%%xmm2 \n"
323 "pshufb %%xmm4,%%xmm2 \n"
324 "por %%xmm5,%%xmm2 \n"
325 "palignr $0xc,%%xmm0,%%xmm1 \n"
326 "pshufb %%xmm4,%%xmm0 \n"
327 "movdqa %%xmm2,0x20(%1) \n"
328 "por %%xmm5,%%xmm0 \n"
329 "pshufb %%xmm4,%%xmm1 \n"
330 "movdqa %%xmm0,(%1) \n"
331 "por %%xmm5,%%xmm1 \n"
332 "palignr $0x4,%%xmm3,%%xmm3 \n"
333 "pshufb %%xmm4,%%xmm3 \n"
334 "movdqa %%xmm1,0x10(%1) \n"
335 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000336 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000337 "movdqa %%xmm3,0x30(%1) \n"
338 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000339 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000340 : "+r"(src_raw), // %0
341 "+r"(dst_argb), // %1
342 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000343 : "m"(kShuffleMaskRAWToARGB) // %3
344 : "memory", "cc"
345#if defined(__SSE2__)
346 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
347#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000348 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000349}
350
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000351void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000352 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000353 "mov $0x1080108,%%eax \n"
354 "movd %%eax,%%xmm5 \n"
355 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000356 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000357 "movd %%eax,%%xmm6 \n"
358 "pshufd $0x0,%%xmm6,%%xmm6 \n"
359 "pcmpeqb %%xmm3,%%xmm3 \n"
360 "psllw $0xb,%%xmm3 \n"
361 "pcmpeqb %%xmm4,%%xmm4 \n"
362 "psllw $0xa,%%xmm4 \n"
363 "psrlw $0x5,%%xmm4 \n"
364 "pcmpeqb %%xmm7,%%xmm7 \n"
365 "psllw $0x8,%%xmm7 \n"
366 "sub %0,%1 \n"
367 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000368 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000369 "1: \n"
370 "movdqu (%0),%%xmm0 \n"
371 "movdqa %%xmm0,%%xmm1 \n"
372 "movdqa %%xmm0,%%xmm2 \n"
373 "pand %%xmm3,%%xmm1 \n"
374 "psllw $0xb,%%xmm2 \n"
375 "pmulhuw %%xmm5,%%xmm1 \n"
376 "pmulhuw %%xmm5,%%xmm2 \n"
377 "psllw $0x8,%%xmm1 \n"
378 "por %%xmm2,%%xmm1 \n"
379 "pand %%xmm4,%%xmm0 \n"
380 "pmulhuw %%xmm6,%%xmm0 \n"
381 "por %%xmm7,%%xmm0 \n"
382 "movdqa %%xmm1,%%xmm2 \n"
383 "punpcklbw %%xmm0,%%xmm1 \n"
384 "punpckhbw %%xmm0,%%xmm2 \n"
385 "movdqa %%xmm1,(%1,%0,2) \n"
386 "movdqa %%xmm2,0x10(%1,%0,2) \n"
387 "lea 0x10(%0),%0 \n"
388 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000389 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000390 : "+r"(src), // %0
391 "+r"(dst), // %1
392 "+r"(pix) // %2
393 :
394 : "memory", "cc", "eax"
395#if defined(__SSE2__)
396 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
397#endif
398 );
399}
400
401void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000402 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000403 "mov $0x1080108,%%eax \n"
404 "movd %%eax,%%xmm5 \n"
405 "pshufd $0x0,%%xmm5,%%xmm5 \n"
406 "mov $0x42004200,%%eax \n"
407 "movd %%eax,%%xmm6 \n"
408 "pshufd $0x0,%%xmm6,%%xmm6 \n"
409 "pcmpeqb %%xmm3,%%xmm3 \n"
410 "psllw $0xb,%%xmm3 \n"
411 "movdqa %%xmm3,%%xmm4 \n"
412 "psrlw $0x6,%%xmm4 \n"
413 "pcmpeqb %%xmm7,%%xmm7 \n"
414 "psllw $0x8,%%xmm7 \n"
415 "sub %0,%1 \n"
416 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000417 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000418 "1: \n"
419 "movdqu (%0),%%xmm0 \n"
420 "movdqa %%xmm0,%%xmm1 \n"
421 "movdqa %%xmm0,%%xmm2 \n"
422 "psllw $0x1,%%xmm1 \n"
423 "psllw $0xb,%%xmm2 \n"
424 "pand %%xmm3,%%xmm1 \n"
425 "pmulhuw %%xmm5,%%xmm2 \n"
426 "pmulhuw %%xmm5,%%xmm1 \n"
427 "psllw $0x8,%%xmm1 \n"
428 "por %%xmm2,%%xmm1 \n"
429 "movdqa %%xmm0,%%xmm2 \n"
430 "pand %%xmm4,%%xmm0 \n"
431 "psraw $0x8,%%xmm2 \n"
432 "pmulhuw %%xmm6,%%xmm0 \n"
433 "pand %%xmm7,%%xmm2 \n"
434 "por %%xmm2,%%xmm0 \n"
435 "movdqa %%xmm1,%%xmm2 \n"
436 "punpcklbw %%xmm0,%%xmm1 \n"
437 "punpckhbw %%xmm0,%%xmm2 \n"
438 "movdqa %%xmm1,(%1,%0,2) \n"
439 "movdqa %%xmm2,0x10(%1,%0,2) \n"
440 "lea 0x10(%0),%0 \n"
441 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000442 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000443 : "+r"(src), // %0
444 "+r"(dst), // %1
445 "+r"(pix) // %2
446 :
447 : "memory", "cc", "eax"
448#if defined(__SSE2__)
449 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
450#endif
451 );
452}
453
454void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000455 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000456 "mov $0xf0f0f0f,%%eax \n"
457 "movd %%eax,%%xmm4 \n"
458 "pshufd $0x0,%%xmm4,%%xmm4 \n"
459 "movdqa %%xmm4,%%xmm5 \n"
460 "pslld $0x4,%%xmm5 \n"
461 "sub %0,%1 \n"
462 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000463 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000464 "1: \n"
465 "movdqu (%0),%%xmm0 \n"
466 "movdqa %%xmm0,%%xmm2 \n"
467 "pand %%xmm4,%%xmm0 \n"
468 "pand %%xmm5,%%xmm2 \n"
469 "movdqa %%xmm0,%%xmm1 \n"
470 "movdqa %%xmm2,%%xmm3 \n"
471 "psllw $0x4,%%xmm1 \n"
472 "psrlw $0x4,%%xmm3 \n"
473 "por %%xmm1,%%xmm0 \n"
474 "por %%xmm3,%%xmm2 \n"
475 "movdqa %%xmm0,%%xmm1 \n"
476 "punpcklbw %%xmm2,%%xmm0 \n"
477 "punpckhbw %%xmm2,%%xmm1 \n"
478 "movdqa %%xmm0,(%1,%0,2) \n"
479 "movdqa %%xmm1,0x10(%1,%0,2) \n"
480 "lea 0x10(%0),%0 \n"
481 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000482 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000483 : "+r"(src), // %0
484 "+r"(dst), // %1
485 "+r"(pix) // %2
486 :
487 : "memory", "cc", "eax"
488#if defined(__SSE2__)
489 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
490#endif
491 );
492}
493
494void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000495 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000496 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000497 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000498 "1: \n"
499 "movdqa (%0),%%xmm0 \n"
500 "movdqa 0x10(%0),%%xmm1 \n"
501 "movdqa 0x20(%0),%%xmm2 \n"
502 "movdqa 0x30(%0),%%xmm3 \n"
503 "lea 0x40(%0),%0 \n"
504 "pshufb %%xmm6,%%xmm0 \n"
505 "pshufb %%xmm6,%%xmm1 \n"
506 "pshufb %%xmm6,%%xmm2 \n"
507 "pshufb %%xmm6,%%xmm3 \n"
508 "movdqa %%xmm1,%%xmm4 \n"
509 "psrldq $0x4,%%xmm1 \n"
510 "pslldq $0xc,%%xmm4 \n"
511 "movdqa %%xmm2,%%xmm5 \n"
512 "por %%xmm4,%%xmm0 \n"
513 "pslldq $0x8,%%xmm5 \n"
514 "movdqa %%xmm0,(%1) \n"
515 "por %%xmm5,%%xmm1 \n"
516 "psrldq $0x8,%%xmm2 \n"
517 "pslldq $0x4,%%xmm3 \n"
518 "por %%xmm3,%%xmm2 \n"
519 "movdqa %%xmm1,0x10(%1) \n"
520 "movdqa %%xmm2,0x20(%1) \n"
521 "lea 0x30(%1),%1 \n"
522 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000523 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000524 : "+r"(src), // %0
525 "+r"(dst), // %1
526 "+r"(pix) // %2
527 : "m"(kShuffleMaskARGBToRGB24) // %3
528 : "memory", "cc"
529#if defined(__SSE2__)
530 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
531#endif
532 );
533}
534
535void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000536 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000537 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000538 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000539 "1: \n"
540 "movdqa (%0),%%xmm0 \n"
541 "movdqa 0x10(%0),%%xmm1 \n"
542 "movdqa 0x20(%0),%%xmm2 \n"
543 "movdqa 0x30(%0),%%xmm3 \n"
544 "lea 0x40(%0),%0 \n"
545 "pshufb %%xmm6,%%xmm0 \n"
546 "pshufb %%xmm6,%%xmm1 \n"
547 "pshufb %%xmm6,%%xmm2 \n"
548 "pshufb %%xmm6,%%xmm3 \n"
549 "movdqa %%xmm1,%%xmm4 \n"
550 "psrldq $0x4,%%xmm1 \n"
551 "pslldq $0xc,%%xmm4 \n"
552 "movdqa %%xmm2,%%xmm5 \n"
553 "por %%xmm4,%%xmm0 \n"
554 "pslldq $0x8,%%xmm5 \n"
555 "movdqa %%xmm0,(%1) \n"
556 "por %%xmm5,%%xmm1 \n"
557 "psrldq $0x8,%%xmm2 \n"
558 "pslldq $0x4,%%xmm3 \n"
559 "por %%xmm3,%%xmm2 \n"
560 "movdqa %%xmm1,0x10(%1) \n"
561 "movdqa %%xmm2,0x20(%1) \n"
562 "lea 0x30(%1),%1 \n"
563 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000564 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000565 : "+r"(src), // %0
566 "+r"(dst), // %1
567 "+r"(pix) // %2
568 : "m"(kShuffleMaskARGBToRAW) // %3
569 : "memory", "cc"
570#if defined(__SSE2__)
571 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
572#endif
573 );
574}
575
576void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000577 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000578 "pcmpeqb %%xmm3,%%xmm3 \n"
579 "psrld $0x1b,%%xmm3 \n"
580 "pcmpeqb %%xmm4,%%xmm4 \n"
581 "psrld $0x1a,%%xmm4 \n"
582 "pslld $0x5,%%xmm4 \n"
583 "pcmpeqb %%xmm5,%%xmm5 \n"
584 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000585 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000586 "1: \n"
587 "movdqa (%0),%%xmm0 \n"
588 "movdqa %%xmm0,%%xmm1 \n"
589 "movdqa %%xmm0,%%xmm2 \n"
590 "pslld $0x8,%%xmm0 \n"
591 "psrld $0x3,%%xmm1 \n"
592 "psrld $0x5,%%xmm2 \n"
593 "psrad $0x10,%%xmm0 \n"
594 "pand %%xmm3,%%xmm1 \n"
595 "pand %%xmm4,%%xmm2 \n"
596 "pand %%xmm5,%%xmm0 \n"
597 "por %%xmm2,%%xmm1 \n"
598 "por %%xmm1,%%xmm0 \n"
599 "packssdw %%xmm0,%%xmm0 \n"
600 "lea 0x10(%0),%0 \n"
601 "movq %%xmm0,(%1) \n"
602 "lea 0x8(%1),%1 \n"
603 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000604 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000605 : "+r"(src), // %0
606 "+r"(dst), // %1
607 "+r"(pix) // %2
608 :
609 : "memory", "cc"
610#if defined(__SSE2__)
611 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
612#endif
613 );
614}
615
616void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000617 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000618 "pcmpeqb %%xmm4,%%xmm4 \n"
619 "psrld $0x1b,%%xmm4 \n"
620 "movdqa %%xmm4,%%xmm5 \n"
621 "pslld $0x5,%%xmm5 \n"
622 "movdqa %%xmm4,%%xmm6 \n"
623 "pslld $0xa,%%xmm6 \n"
624 "pcmpeqb %%xmm7,%%xmm7 \n"
625 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000626 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000627 "1: \n"
628 "movdqa (%0),%%xmm0 \n"
629 "movdqa %%xmm0,%%xmm1 \n"
630 "movdqa %%xmm0,%%xmm2 \n"
631 "movdqa %%xmm0,%%xmm3 \n"
632 "psrad $0x10,%%xmm0 \n"
633 "psrld $0x3,%%xmm1 \n"
634 "psrld $0x6,%%xmm2 \n"
635 "psrld $0x9,%%xmm3 \n"
636 "pand %%xmm7,%%xmm0 \n"
637 "pand %%xmm4,%%xmm1 \n"
638 "pand %%xmm5,%%xmm2 \n"
639 "pand %%xmm6,%%xmm3 \n"
640 "por %%xmm1,%%xmm0 \n"
641 "por %%xmm3,%%xmm2 \n"
642 "por %%xmm2,%%xmm0 \n"
643 "packssdw %%xmm0,%%xmm0 \n"
644 "lea 0x10(%0),%0 \n"
645 "movq %%xmm0,(%1) \n"
646 "lea 0x8(%1),%1 \n"
647 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000648 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000649 : "+r"(src), // %0
650 "+r"(dst), // %1
651 "+r"(pix) // %2
652 :
653 : "memory", "cc"
654#if defined(__SSE2__)
655 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
656#endif
657 );
658}
659
660void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000661 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000662 "pcmpeqb %%xmm4,%%xmm4 \n"
663 "psllw $0xc,%%xmm4 \n"
664 "movdqa %%xmm4,%%xmm3 \n"
665 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000666 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000667 "1: \n"
668 "movdqa (%0),%%xmm0 \n"
669 "movdqa %%xmm0,%%xmm1 \n"
670 "pand %%xmm3,%%xmm0 \n"
671 "pand %%xmm4,%%xmm1 \n"
672 "psrlq $0x4,%%xmm0 \n"
673 "psrlq $0x8,%%xmm1 \n"
674 "por %%xmm1,%%xmm0 \n"
675 "packuswb %%xmm0,%%xmm0 \n"
676 "lea 0x10(%0),%0 \n"
677 "movq %%xmm0,(%1) \n"
678 "lea 0x8(%1),%1 \n"
679 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000680 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000681 : "+r"(src), // %0
682 "+r"(dst), // %1
683 "+r"(pix) // %2
684 :
685 : "memory", "cc"
686#if defined(__SSE2__)
687 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
688#endif
689 );
690}
691
fbarchard@google.comb6149762011-11-07 21:58:52 +0000692void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000693 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000694 "movdqa %4,%%xmm5 \n"
695 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000696 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000697 "1: \n"
698 "movdqa (%0),%%xmm0 \n"
699 "movdqa 0x10(%0),%%xmm1 \n"
700 "movdqa 0x20(%0),%%xmm2 \n"
701 "movdqa 0x30(%0),%%xmm3 \n"
702 "pmaddubsw %%xmm4,%%xmm0 \n"
703 "pmaddubsw %%xmm4,%%xmm1 \n"
704 "pmaddubsw %%xmm4,%%xmm2 \n"
705 "pmaddubsw %%xmm4,%%xmm3 \n"
706 "lea 0x40(%0),%0 \n"
707 "phaddw %%xmm1,%%xmm0 \n"
708 "phaddw %%xmm3,%%xmm2 \n"
709 "psrlw $0x7,%%xmm0 \n"
710 "psrlw $0x7,%%xmm2 \n"
711 "packuswb %%xmm2,%%xmm0 \n"
712 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000713 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000714 "movdqa %%xmm0,(%1) \n"
715 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000716 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000717 : "+r"(src_argb), // %0
718 "+r"(dst_y), // %1
719 "+r"(pix) // %2
720 : "m"(kARGBToY), // %3
721 "m"(kAddY16) // %4
722 : "memory", "cc"
723#if defined(__SSE2__)
724 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
725#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000726 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000727}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000728
729void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000730 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000731 "movdqa %4,%%xmm5 \n"
732 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000733 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000734 "1: \n"
735 "movdqu (%0),%%xmm0 \n"
736 "movdqu 0x10(%0),%%xmm1 \n"
737 "movdqu 0x20(%0),%%xmm2 \n"
738 "movdqu 0x30(%0),%%xmm3 \n"
739 "pmaddubsw %%xmm4,%%xmm0 \n"
740 "pmaddubsw %%xmm4,%%xmm1 \n"
741 "pmaddubsw %%xmm4,%%xmm2 \n"
742 "pmaddubsw %%xmm4,%%xmm3 \n"
743 "lea 0x40(%0),%0 \n"
744 "phaddw %%xmm1,%%xmm0 \n"
745 "phaddw %%xmm3,%%xmm2 \n"
746 "psrlw $0x7,%%xmm0 \n"
747 "psrlw $0x7,%%xmm2 \n"
748 "packuswb %%xmm2,%%xmm0 \n"
749 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000750 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000751 "movdqu %%xmm0,(%1) \n"
752 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000753 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000754 : "+r"(src_argb), // %0
755 "+r"(dst_y), // %1
756 "+r"(pix) // %2
757 : "m"(kARGBToY), // %3
758 "m"(kAddY16) // %4
759 : "memory", "cc"
760#if defined(__SSE2__)
761 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
762#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000763 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000764}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000765
fbarchard@google.com714050a2012-02-17 22:59:56 +0000766// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000767// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
768// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
769// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000770// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000771void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
772 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000773 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000774 "movdqa %0,%%xmm4 \n"
775 "movdqa %1,%%xmm3 \n"
776 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000777 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000778 : "m"(kARGBToU), // %0
779 "m"(kARGBToV), // %1
780 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000781 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000782 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000783 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000784 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000785 "1: \n"
786 "movdqa (%0),%%xmm0 \n"
787 "movdqa 0x10(%0),%%xmm1 \n"
788 "movdqa 0x20(%0),%%xmm2 \n"
789 "movdqa 0x30(%0),%%xmm6 \n"
790 "pavgb (%0,%4,1),%%xmm0 \n"
791 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
792 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
793 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
794 "lea 0x40(%0),%0 \n"
795 "movdqa %%xmm0,%%xmm7 \n"
796 "shufps $0x88,%%xmm1,%%xmm0 \n"
797 "shufps $0xdd,%%xmm1,%%xmm7 \n"
798 "pavgb %%xmm7,%%xmm0 \n"
799 "movdqa %%xmm2,%%xmm7 \n"
800 "shufps $0x88,%%xmm6,%%xmm2 \n"
801 "shufps $0xdd,%%xmm6,%%xmm7 \n"
802 "pavgb %%xmm7,%%xmm2 \n"
803 "movdqa %%xmm0,%%xmm1 \n"
804 "movdqa %%xmm2,%%xmm6 \n"
805 "pmaddubsw %%xmm4,%%xmm0 \n"
806 "pmaddubsw %%xmm4,%%xmm2 \n"
807 "pmaddubsw %%xmm3,%%xmm1 \n"
808 "pmaddubsw %%xmm3,%%xmm6 \n"
809 "phaddw %%xmm2,%%xmm0 \n"
810 "phaddw %%xmm6,%%xmm1 \n"
811 "psraw $0x8,%%xmm0 \n"
812 "psraw $0x8,%%xmm1 \n"
813 "packsswb %%xmm1,%%xmm0 \n"
814 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000815 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000816 "movlps %%xmm0,(%1) \n"
817 "movhps %%xmm0,(%1,%2,1) \n"
818 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000819 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000820 : "+r"(src_argb0), // %0
821 "+r"(dst_u), // %1
822 "+r"(dst_v), // %2
823 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000824 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000825 : "memory", "cc"
826#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000827 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000828#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000829 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000830}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000831
832void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
833 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000834 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000835 "movdqa %0,%%xmm4 \n"
836 "movdqa %1,%%xmm3 \n"
837 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000838 :
839 : "m"(kARGBToU), // %0
840 "m"(kARGBToV), // %1
841 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000842 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000843 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000844 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000845 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000846 "1: \n"
847 "movdqu (%0),%%xmm0 \n"
848 "movdqu 0x10(%0),%%xmm1 \n"
849 "movdqu 0x20(%0),%%xmm2 \n"
850 "movdqu 0x30(%0),%%xmm6 \n"
851 "movdqu (%0,%4,1),%%xmm7 \n"
852 "pavgb %%xmm7,%%xmm0 \n"
853 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
854 "pavgb %%xmm7,%%xmm1 \n"
855 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
856 "pavgb %%xmm7,%%xmm2 \n"
857 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
858 "pavgb %%xmm7,%%xmm6 \n"
859 "lea 0x40(%0),%0 \n"
860 "movdqa %%xmm0,%%xmm7 \n"
861 "shufps $0x88,%%xmm1,%%xmm0 \n"
862 "shufps $0xdd,%%xmm1,%%xmm7 \n"
863 "pavgb %%xmm7,%%xmm0 \n"
864 "movdqa %%xmm2,%%xmm7 \n"
865 "shufps $0x88,%%xmm6,%%xmm2 \n"
866 "shufps $0xdd,%%xmm6,%%xmm7 \n"
867 "pavgb %%xmm7,%%xmm2 \n"
868 "movdqa %%xmm0,%%xmm1 \n"
869 "movdqa %%xmm2,%%xmm6 \n"
870 "pmaddubsw %%xmm4,%%xmm0 \n"
871 "pmaddubsw %%xmm4,%%xmm2 \n"
872 "pmaddubsw %%xmm3,%%xmm1 \n"
873 "pmaddubsw %%xmm3,%%xmm6 \n"
874 "phaddw %%xmm2,%%xmm0 \n"
875 "phaddw %%xmm6,%%xmm1 \n"
876 "psraw $0x8,%%xmm0 \n"
877 "psraw $0x8,%%xmm1 \n"
878 "packsswb %%xmm1,%%xmm0 \n"
879 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000880 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000881 "movlps %%xmm0,(%1) \n"
882 "movhps %%xmm0,(%1,%2,1) \n"
883 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000884 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000885 : "+r"(src_argb0), // %0
886 "+r"(dst_u), // %1
887 "+r"(dst_v), // %2
888 "+rm"(width) // %3
889 : "r"(static_cast<intptr_t>(src_stride_argb))
890 : "memory", "cc"
891#if defined(__SSE2__)
892 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
893#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000894 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000895}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000896
fbarchard@google.com714050a2012-02-17 22:59:56 +0000897void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000898 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000899 "movdqa %4,%%xmm5 \n"
900 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000901 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000902 "1: \n"
903 "movdqa (%0),%%xmm0 \n"
904 "movdqa 0x10(%0),%%xmm1 \n"
905 "movdqa 0x20(%0),%%xmm2 \n"
906 "movdqa 0x30(%0),%%xmm3 \n"
907 "pmaddubsw %%xmm4,%%xmm0 \n"
908 "pmaddubsw %%xmm4,%%xmm1 \n"
909 "pmaddubsw %%xmm4,%%xmm2 \n"
910 "pmaddubsw %%xmm4,%%xmm3 \n"
911 "lea 0x40(%0),%0 \n"
912 "phaddw %%xmm1,%%xmm0 \n"
913 "phaddw %%xmm3,%%xmm2 \n"
914 "psrlw $0x7,%%xmm0 \n"
915 "psrlw $0x7,%%xmm2 \n"
916 "packuswb %%xmm2,%%xmm0 \n"
917 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000918 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000919 "movdqa %%xmm0,(%1) \n"
920 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000921 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000922 : "+r"(src_bgra), // %0
923 "+r"(dst_y), // %1
924 "+r"(pix) // %2
925 : "m"(kBGRAToY), // %3
926 "m"(kAddY16) // %4
927 : "memory", "cc"
928#if defined(__SSE2__)
929 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000930#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000931 );
932}
933
934void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000935 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000936 "movdqa %4,%%xmm5 \n"
937 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000938 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000939 "1: \n"
940 "movdqu (%0),%%xmm0 \n"
941 "movdqu 0x10(%0),%%xmm1 \n"
942 "movdqu 0x20(%0),%%xmm2 \n"
943 "movdqu 0x30(%0),%%xmm3 \n"
944 "pmaddubsw %%xmm4,%%xmm0 \n"
945 "pmaddubsw %%xmm4,%%xmm1 \n"
946 "pmaddubsw %%xmm4,%%xmm2 \n"
947 "pmaddubsw %%xmm4,%%xmm3 \n"
948 "lea 0x40(%0),%0 \n"
949 "phaddw %%xmm1,%%xmm0 \n"
950 "phaddw %%xmm3,%%xmm2 \n"
951 "psrlw $0x7,%%xmm0 \n"
952 "psrlw $0x7,%%xmm2 \n"
953 "packuswb %%xmm2,%%xmm0 \n"
954 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000955 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000956 "movdqu %%xmm0,(%1) \n"
957 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000958 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000959 : "+r"(src_bgra), // %0
960 "+r"(dst_y), // %1
961 "+r"(pix) // %2
962 : "m"(kBGRAToY), // %3
963 "m"(kAddY16) // %4
964 : "memory", "cc"
965#if defined(__SSE2__)
966 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
967#endif
968 );
969}
970
971void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
972 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000973 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000974 "movdqa %0,%%xmm4 \n"
975 "movdqa %1,%%xmm3 \n"
976 "movdqa %2,%%xmm5 \n"
977 :
978 : "m"(kBGRAToU), // %0
979 "m"(kBGRAToV), // %1
980 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000981 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000982 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000983 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000984 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000985 "1: \n"
986 "movdqa (%0),%%xmm0 \n"
987 "movdqa 0x10(%0),%%xmm1 \n"
988 "movdqa 0x20(%0),%%xmm2 \n"
989 "movdqa 0x30(%0),%%xmm6 \n"
990 "pavgb (%0,%4,1),%%xmm0 \n"
991 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
992 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
993 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
994 "lea 0x40(%0),%0 \n"
995 "movdqa %%xmm0,%%xmm7 \n"
996 "shufps $0x88,%%xmm1,%%xmm0 \n"
997 "shufps $0xdd,%%xmm1,%%xmm7 \n"
998 "pavgb %%xmm7,%%xmm0 \n"
999 "movdqa %%xmm2,%%xmm7 \n"
1000 "shufps $0x88,%%xmm6,%%xmm2 \n"
1001 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1002 "pavgb %%xmm7,%%xmm2 \n"
1003 "movdqa %%xmm0,%%xmm1 \n"
1004 "movdqa %%xmm2,%%xmm6 \n"
1005 "pmaddubsw %%xmm4,%%xmm0 \n"
1006 "pmaddubsw %%xmm4,%%xmm2 \n"
1007 "pmaddubsw %%xmm3,%%xmm1 \n"
1008 "pmaddubsw %%xmm3,%%xmm6 \n"
1009 "phaddw %%xmm2,%%xmm0 \n"
1010 "phaddw %%xmm6,%%xmm1 \n"
1011 "psraw $0x8,%%xmm0 \n"
1012 "psraw $0x8,%%xmm1 \n"
1013 "packsswb %%xmm1,%%xmm0 \n"
1014 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001015 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001016 "movlps %%xmm0,(%1) \n"
1017 "movhps %%xmm0,(%1,%2,1) \n"
1018 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001019 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001020 : "+r"(src_bgra0), // %0
1021 "+r"(dst_u), // %1
1022 "+r"(dst_v), // %2
1023 "+rm"(width) // %3
1024 : "r"(static_cast<intptr_t>(src_stride_bgra))
1025 : "memory", "cc"
1026#if defined(__SSE2__)
1027 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1028#endif
1029 );
1030}
1031
1032void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1033 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001034 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001035 "movdqa %0,%%xmm4 \n"
1036 "movdqa %1,%%xmm3 \n"
1037 "movdqa %2,%%xmm5 \n"
1038 :
1039 : "m"(kBGRAToU), // %0
1040 "m"(kBGRAToV), // %1
1041 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001042 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001043 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001044 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001045 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001046 "1: \n"
1047 "movdqu (%0),%%xmm0 \n"
1048 "movdqu 0x10(%0),%%xmm1 \n"
1049 "movdqu 0x20(%0),%%xmm2 \n"
1050 "movdqu 0x30(%0),%%xmm6 \n"
1051 "movdqu (%0,%4,1),%%xmm7 \n"
1052 "pavgb %%xmm7,%%xmm0 \n"
1053 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1054 "pavgb %%xmm7,%%xmm1 \n"
1055 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1056 "pavgb %%xmm7,%%xmm2 \n"
1057 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1058 "pavgb %%xmm7,%%xmm6 \n"
1059 "lea 0x40(%0),%0 \n"
1060 "movdqa %%xmm0,%%xmm7 \n"
1061 "shufps $0x88,%%xmm1,%%xmm0 \n"
1062 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1063 "pavgb %%xmm7,%%xmm0 \n"
1064 "movdqa %%xmm2,%%xmm7 \n"
1065 "shufps $0x88,%%xmm6,%%xmm2 \n"
1066 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1067 "pavgb %%xmm7,%%xmm2 \n"
1068 "movdqa %%xmm0,%%xmm1 \n"
1069 "movdqa %%xmm2,%%xmm6 \n"
1070 "pmaddubsw %%xmm4,%%xmm0 \n"
1071 "pmaddubsw %%xmm4,%%xmm2 \n"
1072 "pmaddubsw %%xmm3,%%xmm1 \n"
1073 "pmaddubsw %%xmm3,%%xmm6 \n"
1074 "phaddw %%xmm2,%%xmm0 \n"
1075 "phaddw %%xmm6,%%xmm1 \n"
1076 "psraw $0x8,%%xmm0 \n"
1077 "psraw $0x8,%%xmm1 \n"
1078 "packsswb %%xmm1,%%xmm0 \n"
1079 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001080 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001081 "movlps %%xmm0,(%1) \n"
1082 "movhps %%xmm0,(%1,%2,1) \n"
1083 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001084 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001085 : "+r"(src_bgra0), // %0
1086 "+r"(dst_u), // %1
1087 "+r"(dst_v), // %2
1088 "+rm"(width) // %3
1089 : "r"(static_cast<intptr_t>(src_stride_bgra))
1090 : "memory", "cc"
1091#if defined(__SSE2__)
1092 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1093#endif
1094 );
1095}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001096
1097void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001098 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001099 "movdqa %4,%%xmm5 \n"
1100 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001101 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001102 "1: \n"
1103 "movdqa (%0),%%xmm0 \n"
1104 "movdqa 0x10(%0),%%xmm1 \n"
1105 "movdqa 0x20(%0),%%xmm2 \n"
1106 "movdqa 0x30(%0),%%xmm3 \n"
1107 "pmaddubsw %%xmm4,%%xmm0 \n"
1108 "pmaddubsw %%xmm4,%%xmm1 \n"
1109 "pmaddubsw %%xmm4,%%xmm2 \n"
1110 "pmaddubsw %%xmm4,%%xmm3 \n"
1111 "lea 0x40(%0),%0 \n"
1112 "phaddw %%xmm1,%%xmm0 \n"
1113 "phaddw %%xmm3,%%xmm2 \n"
1114 "psrlw $0x7,%%xmm0 \n"
1115 "psrlw $0x7,%%xmm2 \n"
1116 "packuswb %%xmm2,%%xmm0 \n"
1117 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001118 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001119 "movdqa %%xmm0,(%1) \n"
1120 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001121 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001122 : "+r"(src_abgr), // %0
1123 "+r"(dst_y), // %1
1124 "+r"(pix) // %2
1125 : "m"(kABGRToY), // %3
1126 "m"(kAddY16) // %4
1127 : "memory", "cc"
1128#if defined(__SSE2__)
1129 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1130#endif
1131 );
1132}
1133
1134void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001135 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001136 "movdqa %4,%%xmm5 \n"
1137 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001138 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001139 "1: \n"
1140 "movdqu (%0),%%xmm0 \n"
1141 "movdqu 0x10(%0),%%xmm1 \n"
1142 "movdqu 0x20(%0),%%xmm2 \n"
1143 "movdqu 0x30(%0),%%xmm3 \n"
1144 "pmaddubsw %%xmm4,%%xmm0 \n"
1145 "pmaddubsw %%xmm4,%%xmm1 \n"
1146 "pmaddubsw %%xmm4,%%xmm2 \n"
1147 "pmaddubsw %%xmm4,%%xmm3 \n"
1148 "lea 0x40(%0),%0 \n"
1149 "phaddw %%xmm1,%%xmm0 \n"
1150 "phaddw %%xmm3,%%xmm2 \n"
1151 "psrlw $0x7,%%xmm0 \n"
1152 "psrlw $0x7,%%xmm2 \n"
1153 "packuswb %%xmm2,%%xmm0 \n"
1154 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001155 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001156 "movdqu %%xmm0,(%1) \n"
1157 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001158 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001159 : "+r"(src_abgr), // %0
1160 "+r"(dst_y), // %1
1161 "+r"(pix) // %2
1162 : "m"(kABGRToY), // %3
1163 "m"(kAddY16) // %4
1164 : "memory", "cc"
1165#if defined(__SSE2__)
1166 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1167#endif
1168 );
1169}
1170
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001171void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1172 asm volatile (
1173 "movdqa %4,%%xmm5 \n"
1174 "movdqa %3,%%xmm4 \n"
1175 ".p2align 4 \n"
1176 "1: \n"
1177 "movdqa (%0),%%xmm0 \n"
1178 "movdqa 0x10(%0),%%xmm1 \n"
1179 "movdqa 0x20(%0),%%xmm2 \n"
1180 "movdqa 0x30(%0),%%xmm3 \n"
1181 "pmaddubsw %%xmm4,%%xmm0 \n"
1182 "pmaddubsw %%xmm4,%%xmm1 \n"
1183 "pmaddubsw %%xmm4,%%xmm2 \n"
1184 "pmaddubsw %%xmm4,%%xmm3 \n"
1185 "lea 0x40(%0),%0 \n"
1186 "phaddw %%xmm1,%%xmm0 \n"
1187 "phaddw %%xmm3,%%xmm2 \n"
1188 "psrlw $0x7,%%xmm0 \n"
1189 "psrlw $0x7,%%xmm2 \n"
1190 "packuswb %%xmm2,%%xmm0 \n"
1191 "paddb %%xmm5,%%xmm0 \n"
1192 "sub $0x10,%2 \n"
1193 "movdqa %%xmm0,(%1) \n"
1194 "lea 0x10(%1),%1 \n"
1195 "jg 1b \n"
1196 : "+r"(src_rgba), // %0
1197 "+r"(dst_y), // %1
1198 "+r"(pix) // %2
1199 : "m"(kRGBAToY), // %3
1200 "m"(kAddY16) // %4
1201 : "memory", "cc"
1202#if defined(__SSE2__)
1203 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1204#endif
1205 );
1206}
1207
1208void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1209 asm volatile (
1210 "movdqa %4,%%xmm5 \n"
1211 "movdqa %3,%%xmm4 \n"
1212 ".p2align 4 \n"
1213 "1: \n"
1214 "movdqu (%0),%%xmm0 \n"
1215 "movdqu 0x10(%0),%%xmm1 \n"
1216 "movdqu 0x20(%0),%%xmm2 \n"
1217 "movdqu 0x30(%0),%%xmm3 \n"
1218 "pmaddubsw %%xmm4,%%xmm0 \n"
1219 "pmaddubsw %%xmm4,%%xmm1 \n"
1220 "pmaddubsw %%xmm4,%%xmm2 \n"
1221 "pmaddubsw %%xmm4,%%xmm3 \n"
1222 "lea 0x40(%0),%0 \n"
1223 "phaddw %%xmm1,%%xmm0 \n"
1224 "phaddw %%xmm3,%%xmm2 \n"
1225 "psrlw $0x7,%%xmm0 \n"
1226 "psrlw $0x7,%%xmm2 \n"
1227 "packuswb %%xmm2,%%xmm0 \n"
1228 "paddb %%xmm5,%%xmm0 \n"
1229 "sub $0x10,%2 \n"
1230 "movdqu %%xmm0,(%1) \n"
1231 "lea 0x10(%1),%1 \n"
1232 "jg 1b \n"
1233 : "+r"(src_rgba), // %0
1234 "+r"(dst_y), // %1
1235 "+r"(pix) // %2
1236 : "m"(kRGBAToY), // %3
1237 "m"(kAddY16) // %4
1238 : "memory", "cc"
1239#if defined(__SSE2__)
1240 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1241#endif
1242 );
1243}
1244
fbarchard@google.com714050a2012-02-17 22:59:56 +00001245void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1246 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001247 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001248 "movdqa %0,%%xmm4 \n"
1249 "movdqa %1,%%xmm3 \n"
1250 "movdqa %2,%%xmm5 \n"
1251 :
1252 : "m"(kABGRToU), // %0
1253 "m"(kABGRToV), // %1
1254 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001255 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001256 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001257 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001258 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001259 "1: \n"
1260 "movdqa (%0),%%xmm0 \n"
1261 "movdqa 0x10(%0),%%xmm1 \n"
1262 "movdqa 0x20(%0),%%xmm2 \n"
1263 "movdqa 0x30(%0),%%xmm6 \n"
1264 "pavgb (%0,%4,1),%%xmm0 \n"
1265 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1266 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1267 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1268 "lea 0x40(%0),%0 \n"
1269 "movdqa %%xmm0,%%xmm7 \n"
1270 "shufps $0x88,%%xmm1,%%xmm0 \n"
1271 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1272 "pavgb %%xmm7,%%xmm0 \n"
1273 "movdqa %%xmm2,%%xmm7 \n"
1274 "shufps $0x88,%%xmm6,%%xmm2 \n"
1275 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1276 "pavgb %%xmm7,%%xmm2 \n"
1277 "movdqa %%xmm0,%%xmm1 \n"
1278 "movdqa %%xmm2,%%xmm6 \n"
1279 "pmaddubsw %%xmm4,%%xmm0 \n"
1280 "pmaddubsw %%xmm4,%%xmm2 \n"
1281 "pmaddubsw %%xmm3,%%xmm1 \n"
1282 "pmaddubsw %%xmm3,%%xmm6 \n"
1283 "phaddw %%xmm2,%%xmm0 \n"
1284 "phaddw %%xmm6,%%xmm1 \n"
1285 "psraw $0x8,%%xmm0 \n"
1286 "psraw $0x8,%%xmm1 \n"
1287 "packsswb %%xmm1,%%xmm0 \n"
1288 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001289 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001290 "movlps %%xmm0,(%1) \n"
1291 "movhps %%xmm0,(%1,%2,1) \n"
1292 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001293 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001294 : "+r"(src_abgr0), // %0
1295 "+r"(dst_u), // %1
1296 "+r"(dst_v), // %2
1297 "+rm"(width) // %3
1298 : "r"(static_cast<intptr_t>(src_stride_abgr))
1299 : "memory", "cc"
1300#if defined(__SSE2__)
1301 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1302#endif
1303 );
1304}
1305
1306void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1307 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001308 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001309 "movdqa %0,%%xmm4 \n"
1310 "movdqa %1,%%xmm3 \n"
1311 "movdqa %2,%%xmm5 \n"
1312 :
1313 : "m"(kABGRToU), // %0
1314 "m"(kABGRToV), // %1
1315 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001316 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001317 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001318 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001319 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001320 "1: \n"
1321 "movdqu (%0),%%xmm0 \n"
1322 "movdqu 0x10(%0),%%xmm1 \n"
1323 "movdqu 0x20(%0),%%xmm2 \n"
1324 "movdqu 0x30(%0),%%xmm6 \n"
1325 "movdqu (%0,%4,1),%%xmm7 \n"
1326 "pavgb %%xmm7,%%xmm0 \n"
1327 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1328 "pavgb %%xmm7,%%xmm1 \n"
1329 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1330 "pavgb %%xmm7,%%xmm2 \n"
1331 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1332 "pavgb %%xmm7,%%xmm6 \n"
1333 "lea 0x40(%0),%0 \n"
1334 "movdqa %%xmm0,%%xmm7 \n"
1335 "shufps $0x88,%%xmm1,%%xmm0 \n"
1336 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1337 "pavgb %%xmm7,%%xmm0 \n"
1338 "movdqa %%xmm2,%%xmm7 \n"
1339 "shufps $0x88,%%xmm6,%%xmm2 \n"
1340 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1341 "pavgb %%xmm7,%%xmm2 \n"
1342 "movdqa %%xmm0,%%xmm1 \n"
1343 "movdqa %%xmm2,%%xmm6 \n"
1344 "pmaddubsw %%xmm4,%%xmm0 \n"
1345 "pmaddubsw %%xmm4,%%xmm2 \n"
1346 "pmaddubsw %%xmm3,%%xmm1 \n"
1347 "pmaddubsw %%xmm3,%%xmm6 \n"
1348 "phaddw %%xmm2,%%xmm0 \n"
1349 "phaddw %%xmm6,%%xmm1 \n"
1350 "psraw $0x8,%%xmm0 \n"
1351 "psraw $0x8,%%xmm1 \n"
1352 "packsswb %%xmm1,%%xmm0 \n"
1353 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001354 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001355 "movlps %%xmm0,(%1) \n"
1356 "movhps %%xmm0,(%1,%2,1) \n"
1357 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001358 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001359 : "+r"(src_abgr0), // %0
1360 "+r"(dst_u), // %1
1361 "+r"(dst_v), // %2
1362 "+rm"(width) // %3
1363 : "r"(static_cast<intptr_t>(src_stride_abgr))
1364 : "memory", "cc"
1365#if defined(__SSE2__)
1366 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1367#endif
1368 );
1369}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001370
1371void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1372 uint8* dst_u, uint8* dst_v, int width) {
1373 asm volatile (
1374 "movdqa %0,%%xmm4 \n"
1375 "movdqa %1,%%xmm3 \n"
1376 "movdqa %2,%%xmm5 \n"
1377 :
1378 : "m"(kRGBAToU), // %0
1379 "m"(kRGBAToV), // %1
1380 "m"(kAddUV128) // %2
1381 );
1382 asm volatile (
1383 "sub %1,%2 \n"
1384 ".p2align 4 \n"
1385 "1: \n"
1386 "movdqa (%0),%%xmm0 \n"
1387 "movdqa 0x10(%0),%%xmm1 \n"
1388 "movdqa 0x20(%0),%%xmm2 \n"
1389 "movdqa 0x30(%0),%%xmm6 \n"
1390 "pavgb (%0,%4,1),%%xmm0 \n"
1391 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1392 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1393 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1394 "lea 0x40(%0),%0 \n"
1395 "movdqa %%xmm0,%%xmm7 \n"
1396 "shufps $0x88,%%xmm1,%%xmm0 \n"
1397 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1398 "pavgb %%xmm7,%%xmm0 \n"
1399 "movdqa %%xmm2,%%xmm7 \n"
1400 "shufps $0x88,%%xmm6,%%xmm2 \n"
1401 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1402 "pavgb %%xmm7,%%xmm2 \n"
1403 "movdqa %%xmm0,%%xmm1 \n"
1404 "movdqa %%xmm2,%%xmm6 \n"
1405 "pmaddubsw %%xmm4,%%xmm0 \n"
1406 "pmaddubsw %%xmm4,%%xmm2 \n"
1407 "pmaddubsw %%xmm3,%%xmm1 \n"
1408 "pmaddubsw %%xmm3,%%xmm6 \n"
1409 "phaddw %%xmm2,%%xmm0 \n"
1410 "phaddw %%xmm6,%%xmm1 \n"
1411 "psraw $0x8,%%xmm0 \n"
1412 "psraw $0x8,%%xmm1 \n"
1413 "packsswb %%xmm1,%%xmm0 \n"
1414 "paddb %%xmm5,%%xmm0 \n"
1415 "sub $0x10,%3 \n"
1416 "movlps %%xmm0,(%1) \n"
1417 "movhps %%xmm0,(%1,%2,1) \n"
1418 "lea 0x8(%1),%1 \n"
1419 "jg 1b \n"
1420 : "+r"(src_rgba0), // %0
1421 "+r"(dst_u), // %1
1422 "+r"(dst_v), // %2
1423 "+rm"(width) // %3
1424 : "r"(static_cast<intptr_t>(src_stride_rgba))
1425 : "memory", "cc"
1426#if defined(__SSE2__)
1427 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1428#endif
1429 );
1430}
1431
1432void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1433 uint8* dst_u, uint8* dst_v, int width) {
1434 asm volatile (
1435 "movdqa %0,%%xmm4 \n"
1436 "movdqa %1,%%xmm3 \n"
1437 "movdqa %2,%%xmm5 \n"
1438 :
1439 : "m"(kRGBAToU), // %0
1440 "m"(kRGBAToV), // %1
1441 "m"(kAddUV128) // %2
1442 );
1443 asm volatile (
1444 "sub %1,%2 \n"
1445 ".p2align 4 \n"
1446 "1: \n"
1447 "movdqu (%0),%%xmm0 \n"
1448 "movdqu 0x10(%0),%%xmm1 \n"
1449 "movdqu 0x20(%0),%%xmm2 \n"
1450 "movdqu 0x30(%0),%%xmm6 \n"
1451 "movdqu (%0,%4,1),%%xmm7 \n"
1452 "pavgb %%xmm7,%%xmm0 \n"
1453 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1454 "pavgb %%xmm7,%%xmm1 \n"
1455 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1456 "pavgb %%xmm7,%%xmm2 \n"
1457 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1458 "pavgb %%xmm7,%%xmm6 \n"
1459 "lea 0x40(%0),%0 \n"
1460 "movdqa %%xmm0,%%xmm7 \n"
1461 "shufps $0x88,%%xmm1,%%xmm0 \n"
1462 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1463 "pavgb %%xmm7,%%xmm0 \n"
1464 "movdqa %%xmm2,%%xmm7 \n"
1465 "shufps $0x88,%%xmm6,%%xmm2 \n"
1466 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1467 "pavgb %%xmm7,%%xmm2 \n"
1468 "movdqa %%xmm0,%%xmm1 \n"
1469 "movdqa %%xmm2,%%xmm6 \n"
1470 "pmaddubsw %%xmm4,%%xmm0 \n"
1471 "pmaddubsw %%xmm4,%%xmm2 \n"
1472 "pmaddubsw %%xmm3,%%xmm1 \n"
1473 "pmaddubsw %%xmm3,%%xmm6 \n"
1474 "phaddw %%xmm2,%%xmm0 \n"
1475 "phaddw %%xmm6,%%xmm1 \n"
1476 "psraw $0x8,%%xmm0 \n"
1477 "psraw $0x8,%%xmm1 \n"
1478 "packsswb %%xmm1,%%xmm0 \n"
1479 "paddb %%xmm5,%%xmm0 \n"
1480 "sub $0x10,%3 \n"
1481 "movlps %%xmm0,(%1) \n"
1482 "movhps %%xmm0,(%1,%2,1) \n"
1483 "lea 0x8(%1),%1 \n"
1484 "jg 1b \n"
1485 : "+r"(src_rgba0), // %0
1486 "+r"(dst_u), // %1
1487 "+r"(dst_v), // %2
1488 "+rm"(width) // %3
1489 : "r"(static_cast<intptr_t>(src_stride_rgba))
1490 : "memory", "cc"
1491#if defined(__SSE2__)
1492 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1493#endif
1494 );
1495}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001496#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001497
fbarchard@google.come214fe32012-06-04 23:47:11 +00001498#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients in 6 bit fixed point (scaled by 64).
// Every expansion is parenthesized so the macros stay correct inside larger
// expressions (e.g. -BG or BB / 2).
#define UB 127 /* 2.018 * 64 = 129, clamped to the int8 maximum of 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001513
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001514struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001515 vec8 kUVToB; // 0
1516 vec8 kUVToG; // 16
1517 vec8 kUVToR; // 32
1518 vec16 kUVBiasB; // 48
1519 vec16 kUVBiasG; // 64
1520 vec16 kUVBiasR; // 80
1521 vec16 kYSub16; // 96
1522 vec16 kYToRgb; // 112
1523 vec8 kVUToB; // 128
1524 vec8 kVUToG; // 144
1525 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001526} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001527 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1528 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1529 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1530 { BB, BB, BB, BB, BB, BB, BB, BB },
1531 { BG, BG, BG, BG, BG, BG, BG, BG },
1532 { BR, BR, BR, BR, BR, BR, BR, BR },
1533 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001534 { YG, YG, YG, YG, YG, YG, YG, YG },
1535 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1536 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1537 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001538};
1539
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001540
// Read 8 UV from 444.
// Loads 8 U bytes and 8 V bytes ([v_buf] holds the V offset relative to
// [u_buf]), advances [u_buf] by 8 and interleaves them into xmm0 as U,V pairs.
// Clobbers xmm1.
#define READYUV444                                                             \
    "movq       (%[u_buf]),%%xmm0                           \n"                \
    "movq       (%[u_buf],%[v_buf],1),%%xmm1                \n"                \
    "lea        0x8(%[u_buf]),%[u_buf]                      \n"                \
    "punpcklbw  %%xmm1,%%xmm0                               \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001547
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 U bytes and 4 V bytes ([v_buf] holds the V offset relative to
// [u_buf]), advances [u_buf] by 4, interleaves them into U,V pairs and
// duplicates each pair.  Result in xmm0; clobbers xmm1.
#define READYUV422                                                             \
    "movd       (%[u_buf]),%%xmm0                           \n"                \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1                \n"                \
    "lea        0x4(%[u_buf]),%[u_buf]                      \n"                \
    "punpcklbw  %%xmm1,%%xmm0                               \n"                \
    "punpcklwd  %%xmm0,%%xmm0                               \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001555
// Read 2 UV from 411, upsample to 8 UV.
// Only 2 U bytes and 2 V bytes are consumed per call ([u_buf] advances by 2),
// so they are loaded with 16 bit pinsrw instead of 32 bit movd to avoid
// reading 2 bytes beyond the data on the last iteration.  pinsrw leaves the
// upper words of xmm0/xmm1 unchanged, which is harmless: the unpack sequence
// below only propagates the low 2 bytes of each register into the result.
// Result in xmm0; clobbers xmm1.
#define READYUV411                                                             \
    "pinsrw     $0x0,(%[u_buf]),%%xmm0                      \n"                \
    "pinsrw     $0x0,(%[u_buf],%[v_buf],1),%%xmm1           \n"                \
    "lea        0x2(%[u_buf]),%[u_buf]                      \n"                \
    "punpcklbw  %%xmm1,%%xmm0                               \n"                \
    "punpcklwd  %%xmm0,%%xmm0                               \n"                \
    "punpckldq  %%xmm0,%%xmm0                               \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001564
// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 bytes (4 already interleaved pairs) from [uv_buf], advances
// [uv_buf] by 8 and duplicates each 16 bit pair.  Result in xmm0.
#define READNV12                                                               \
    "movq       (%[uv_buf]),%%xmm0                          \n"                \
    "lea        0x8(%[uv_buf]),%[uv_buf]                    \n"                \
    "punpcklwd  %%xmm0,%%xmm0                               \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001570
// Convert 8 pixels: 8 UV and 8 Y.
// Input: 8 interleaved U,V pairs in xmm0, 8 Y bytes at [y_buf] (advanced by
// 8).  Output: the three packed 8 bit channels in xmm0, xmm1 and xmm2,
// computed with the kUVToB, kUVToG and kUVToR rows of kYuvConstants.
// Expects xmm4 to be zero (the callers in this file clear it with pxor before
// the loop); clobbers xmm3.
#define YUVTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                               \n"                \
    "movdqa     %%xmm0,%%xmm2                               \n"                \
    /* UV contribution per channel, minus the channel bias. */                 \
    "pmaddubsw  (%[kYuvConstants]),%%xmm0                   \n"                \
    "pmaddubsw  16(%[kYuvConstants]),%%xmm1                 \n"                \
    "pmaddubsw  32(%[kYuvConstants]),%%xmm2                 \n"                \
    "psubw      48(%[kYuvConstants]),%%xmm0                 \n"                \
    "psubw      64(%[kYuvConstants]),%%xmm1                 \n"                \
    "psubw      80(%[kYuvConstants]),%%xmm2                 \n"                \
    /* Y contribution: widen to 16 bit, subtract kYSub16, scale by kYToRgb. */ \
    "movq       (%[y_buf]),%%xmm3                           \n"                \
    "lea        0x8(%[y_buf]),%[y_buf]                      \n"                \
    "punpcklbw  %%xmm4,%%xmm3                               \n"                \
    "psubsw     96(%[kYuvConstants]),%%xmm3                 \n"                \
    "pmullw     112(%[kYuvConstants]),%%xmm3                \n"                \
    /* Combine, drop the 6 fraction bits and saturate to bytes. */             \
    "paddsw     %%xmm3,%%xmm0                               \n"                \
    "paddsw     %%xmm3,%%xmm1                               \n"                \
    "paddsw     %%xmm3,%%xmm2                               \n"                \
    "psraw      $0x6,%%xmm0                                 \n"                \
    "psraw      $0x6,%%xmm1                                 \n"                \
    "psraw      $0x6,%%xmm2                                 \n"                \
    "packuswb   %%xmm0,%%xmm0                               \n"                \
    "packuswb   %%xmm1,%%xmm1                               \n"                \
    "packuswb   %%xmm2,%%xmm2                               \n"
1595
// Convert 8 pixels: 8 VU and 8 Y.
// Same as YUVTORGB except that xmm0 holds V,U ordered pairs, so the kVUToB,
// kVUToG and kVUToR rows of kYuvConstants (displacements 128, 144, 160) are
// used for the chroma weights.  Output in xmm0, xmm1 and xmm2.
// Expects xmm4 to be zero; clobbers xmm3.
#define YVUTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                               \n"                \
    "movdqa     %%xmm0,%%xmm2                               \n"                \
    /* VU contribution per channel, minus the channel bias. */                 \
    "pmaddubsw  128(%[kYuvConstants]),%%xmm0                \n"                \
    "pmaddubsw  144(%[kYuvConstants]),%%xmm1                \n"                \
    "pmaddubsw  160(%[kYuvConstants]),%%xmm2                \n"                \
    "psubw      48(%[kYuvConstants]),%%xmm0                 \n"                \
    "psubw      64(%[kYuvConstants]),%%xmm1                 \n"                \
    "psubw      80(%[kYuvConstants]),%%xmm2                 \n"                \
    /* Y contribution: widen to 16 bit, subtract kYSub16, scale by kYToRgb. */ \
    "movq       (%[y_buf]),%%xmm3                           \n"                \
    "lea        0x8(%[y_buf]),%[y_buf]                      \n"                \
    "punpcklbw  %%xmm4,%%xmm3                               \n"                \
    "psubsw     96(%[kYuvConstants]),%%xmm3                 \n"                \
    "pmullw     112(%[kYuvConstants]),%%xmm3                \n"                \
    /* Combine, drop the 6 fraction bits and saturate to bytes. */             \
    "paddsw     %%xmm3,%%xmm0                               \n"                \
    "paddsw     %%xmm3,%%xmm1                               \n"                \
    "paddsw     %%xmm3,%%xmm2                               \n"                \
    "psraw      $0x6,%%xmm0                                 \n"                \
    "psraw      $0x6,%%xmm1                                 \n"                \
    "psraw      $0x6,%%xmm2                                 \n"                \
    "packuswb   %%xmm0,%%xmm0                               \n"                \
    "packuswb   %%xmm1,%%xmm1                               \n"                \
    "packuswb   %%xmm2,%%xmm2                               \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001620
// I444ToARGBRow_SSSE3: converts a row of planar Y/U/V samples to 4-byte
// pixels with GCC inline assembly.
//   y_buf, u_buf, v_buf - source planes. The leading "sub" rewrites v_buf as
//                         an offset from u_buf (presumably so READYUV444 can
//                         address V relative to U; that macro is defined
//                         elsewhere - confirm there).
//   argb_buf            - destination, stored with movdqa, so it must be
//                         16-byte aligned.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Each pass stores 32 bytes. Per pixel the byte order in memory is
// xmm0, xmm1, xmm2, 0xff, where xmm0..xmm2 are the packed outputs of
// YUVTORGB and xmm5 is set to all ones before the loop. xmm4 is zeroed for
// use by YUVTORGB.
1621void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001622 const uint8* u_buf,
1623 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001624 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001625 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001626 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001627 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001628 "pcmpeqb %%xmm5,%%xmm5 \n"
1629 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001630 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001631 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001632 READYUV444
1633 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001634 "punpcklbw %%xmm1,%%xmm0 \n"
1635 "punpcklbw %%xmm5,%%xmm2 \n"
1636 "movdqa %%xmm0,%%xmm1 \n"
1637 "punpcklwd %%xmm2,%%xmm0 \n"
1638 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001639 "movdqa %%xmm0,(%[argb_buf]) \n"
1640 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1641 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1642 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001643 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001644 : [y_buf]"+r"(y_buf), // %[y_buf]
1645 [u_buf]"+r"(u_buf), // %[u_buf]
1646 [v_buf]"+r"(v_buf), // %[v_buf]
1647 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1648 [width]"+rm"(width) // %[width]
1649 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001650 : "memory", "cc"
1651#if defined(__SSE2__)
1652 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1653#endif
1654 );
1655}
1656
// I422ToRGB24Row_SSSE3: converts a row of planar Y/U/V samples to 3-byte
// pixels with GCC inline assembly.
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   rgb24_buf           - destination. Each pass stores 24 bytes (movq for
//                         the first 8, movdqu for the next 16); neither
//                         store requires an aligned address.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// After YUVTORGB the pixels are woven as xmm0, xmm1, xmm2, xmm2 (the fourth
// byte is a duplicate rather than alpha). pshufb with the two masks held in
// xmm5/xmm6 followed by palignr squeezes the 32 woven bytes down to 24
// (mask contents are defined elsewhere in the file).
// On __APPLE__ the masks are loaded by a separate asm statement placed
// before the main one; elsewhere they are loaded at the top of the main
// statement and passed as "m" operands.
// NOTE(review): the Apple-only preload statement declares no clobbers, so
// the main loop relies on the compiler leaving xmm5/xmm6 untouched between
// the two asm statements - confirm this is acceptable for supported
// toolchains.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001657void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1658 const uint8* u_buf,
1659 const uint8* v_buf,
1660 uint8* rgb24_buf,
1661 int width) {
1662// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
1663#ifdef __APPLE__
1664 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001665 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1666 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1667 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1668 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001669#endif
1670
1671 asm volatile (
1672#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001673 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1674 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001675#endif
1676 "sub %[u_buf],%[v_buf] \n"
1677 "pxor %%xmm4,%%xmm4 \n"
1678 ".p2align 4 \n"
1679 "1: \n"
1680 READYUV422
1681 YUVTORGB
1682 "punpcklbw %%xmm1,%%xmm0 \n"
1683 "punpcklbw %%xmm2,%%xmm2 \n"
1684 "movdqa %%xmm0,%%xmm1 \n"
1685 "punpcklwd %%xmm2,%%xmm0 \n"
1686 "punpckhwd %%xmm2,%%xmm1 \n"
1687 "pshufb %%xmm5,%%xmm0 \n"
1688 "pshufb %%xmm6,%%xmm1 \n"
1689 "palignr $0xc,%%xmm0,%%xmm1 \n"
1690 "movq %%xmm0,(%[rgb24_buf]) \n"
1691 "movdqu %%xmm1,0x8(%[rgb24_buf]) \n"
1692 "lea 0x18(%[rgb24_buf]),%[rgb24_buf] \n"
1693 "sub $0x8,%[width] \n"
1694 "jg 1b \n"
1695 : [y_buf]"+r"(y_buf), // %[y_buf]
1696 [u_buf]"+r"(u_buf), // %[u_buf]
1697 [v_buf]"+r"(v_buf), // %[v_buf]
1698 [rgb24_buf]"+r"(rgb24_buf), // %[rgb24_buf]
1699 [width]"+rm"(width) // %[width]
1700 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1701#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001702 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1703 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001704#endif
1705 : "memory", "cc"
1706#if defined(__SSE2__)
1707 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1708#endif
1709 );
1710}
1711
// I422ToRAWRow_SSSE3: converts a row of planar Y/U/V samples to 3-byte
// pixels with GCC inline assembly. The instruction sequence is the same as
// I422ToRGB24Row_SSSE3; only the two shuffle masks differ
// (kShuffleMaskARGBToRAW_0 / kShuffleMaskARGBToRAW, defined elsewhere).
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   raw_buf             - destination. Each pass stores 24 bytes (movq for
//                         the first 8, movdqu for the next 16); neither
//                         store requires an aligned address.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// NOTE(review): the Apple-only preload statement declares no clobbers, so
// the main loop relies on the compiler leaving xmm5/xmm6 untouched between
// the two asm statements - confirm this is acceptable for supported
// toolchains.
1712void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1713 const uint8* u_buf,
1714 const uint8* v_buf,
1715 uint8* raw_buf,
1716 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001717// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001718#ifdef __APPLE__
1719 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001720 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1721 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1722 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1723 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001724#endif
1725
1726 asm volatile (
1727#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001728 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1729 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001730#endif
1731 "sub %[u_buf],%[v_buf] \n"
1732 "pxor %%xmm4,%%xmm4 \n"
1733 ".p2align 4 \n"
1734 "1: \n"
1735 READYUV422
1736 YUVTORGB
1737 "punpcklbw %%xmm1,%%xmm0 \n"
1738 "punpcklbw %%xmm2,%%xmm2 \n"
1739 "movdqa %%xmm0,%%xmm1 \n"
1740 "punpcklwd %%xmm2,%%xmm0 \n"
1741 "punpckhwd %%xmm2,%%xmm1 \n"
1742 "pshufb %%xmm5,%%xmm0 \n"
1743 "pshufb %%xmm6,%%xmm1 \n"
1744 "palignr $0xc,%%xmm0,%%xmm1 \n"
1745 "movq %%xmm0,(%[raw_buf]) \n"
1746 "movdqu %%xmm1,0x8(%[raw_buf]) \n"
1747 "lea 0x18(%[raw_buf]),%[raw_buf] \n"
1748 "sub $0x8,%[width] \n"
1749 "jg 1b \n"
1750 : [y_buf]"+r"(y_buf), // %[y_buf]
1751 [u_buf]"+r"(u_buf), // %[u_buf]
1752 [v_buf]"+r"(v_buf), // %[v_buf]
1753 [raw_buf]"+r"(raw_buf), // %[raw_buf]
1754 [width]"+rm"(width) // %[width]
1755 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1756#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001757 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1758 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001759#endif
1760 : "memory", "cc"
1761#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001762 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001763#endif
1764 );
1765}
1766
// I422ToARGBRow_SSSE3: converts a row of planar Y/U/V samples to 4-byte
// pixels with GCC inline assembly. Identical to I444ToARGBRow_SSSE3 except
// that the chroma is fetched with READYUV422 (macro defined elsewhere).
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   argb_buf            - destination, stored with movdqa, so it must be
//                         16-byte aligned.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Each pass stores 32 bytes; per pixel the byte order in memory is
// xmm0, xmm1, xmm2, 0xff (xmm5 is all ones).
fbarchard@google.come214fe32012-06-04 23:47:11 +00001767void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001768 const uint8* u_buf,
1769 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001770 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001771 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001772 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001773 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001774 "pcmpeqb %%xmm5,%%xmm5 \n"
1775 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001776 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001777 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001778 READYUV422
1779 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001780 "punpcklbw %%xmm1,%%xmm0 \n"
1781 "punpcklbw %%xmm5,%%xmm2 \n"
1782 "movdqa %%xmm0,%%xmm1 \n"
1783 "punpcklwd %%xmm2,%%xmm0 \n"
1784 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001785 "movdqa %%xmm0,(%[argb_buf]) \n"
1786 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1787 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1788 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001789 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001790 : [y_buf]"+r"(y_buf), // %[y_buf]
1791 [u_buf]"+r"(u_buf), // %[u_buf]
1792 [v_buf]"+r"(v_buf), // %[v_buf]
1793 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1794 [width]"+rm"(width) // %[width]
1795 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001796 : "memory", "cc"
1797#if defined(__SSE2__)
1798 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1799#endif
1800 );
1801}
1802
// I411ToARGBRow_SSSE3: converts a row of planar Y/U/V samples to 4-byte
// pixels with GCC inline assembly. Identical to I444ToARGBRow_SSSE3 except
// that the chroma is fetched with READYUV411 (macro defined elsewhere).
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   argb_buf            - destination, stored with movdqa, so it must be
//                         16-byte aligned.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Each pass stores 32 bytes; per pixel the byte order in memory is
// xmm0, xmm1, xmm2, 0xff (xmm5 is all ones).
1803void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1804 const uint8* u_buf,
1805 const uint8* v_buf,
1806 uint8* argb_buf,
1807 int width) {
1808 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001809 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001810 "pcmpeqb %%xmm5,%%xmm5 \n"
1811 "pxor %%xmm4,%%xmm4 \n"
1812 ".p2align 4 \n"
1813 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001814 READYUV411
1815 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001816 "punpcklbw %%xmm1,%%xmm0 \n"
1817 "punpcklbw %%xmm5,%%xmm2 \n"
1818 "movdqa %%xmm0,%%xmm1 \n"
1819 "punpcklwd %%xmm2,%%xmm0 \n"
1820 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001821 "movdqa %%xmm0,(%[argb_buf]) \n"
1822 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1823 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1824 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001825 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001826 : [y_buf]"+r"(y_buf), // %[y_buf]
1827 [u_buf]"+r"(u_buf), // %[u_buf]
1828 [v_buf]"+r"(v_buf), // %[v_buf]
1829 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1830 [width]"+rm"(width) // %[width]
1831 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1832 : "memory", "cc"
1833#if defined(__SSE2__)
1834 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1835#endif
1836 );
1837}
1838
// NV12ToARGBRow_SSSE3: converts a row made of a Y plane plus a single
// combined chroma plane to 4-byte pixels with GCC inline assembly. Chroma
// is fetched with READNV12 and converted with YUVTORGB (READNV12 is defined
// elsewhere).
//   y_buf    - luma source.
//   uv_buf   - chroma source, bound to the %[uv_buf] operand.
//   argb_buf - destination, stored with movdqa, so it must be 16-byte
//              aligned.
//   width    - pixel count. Each pass handles 8 pixels; the body always
//              runs once and repeats while the count is still positive
//              after subtracting 8.
// Each pass stores 32 bytes; per pixel the byte order in memory is
// xmm0, xmm1, xmm2, 0xff (xmm5 is all ones).
1839void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1840 const uint8* uv_buf,
1841 uint8* argb_buf,
1842 int width) {
1843 asm volatile (
1844 "pcmpeqb %%xmm5,%%xmm5 \n"
1845 "pxor %%xmm4,%%xmm4 \n"
1846 ".p2align 4 \n"
1847 "1: \n"
1848 READNV12
1849 YUVTORGB
1850 "punpcklbw %%xmm1,%%xmm0 \n"
1851 "punpcklbw %%xmm5,%%xmm2 \n"
1852 "movdqa %%xmm0,%%xmm1 \n"
1853 "punpcklwd %%xmm2,%%xmm0 \n"
1854 "punpckhwd %%xmm2,%%xmm1 \n"
1855 "movdqa %%xmm0,(%[argb_buf]) \n"
1856 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1857 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1858 "sub $0x8,%[width] \n"
1859 "jg 1b \n"
1860 : [y_buf]"+r"(y_buf), // %[y_buf]
1861 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1862 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1863 [width]"+rm"(width) // %[width]
1864 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1865 : "memory", "cc"
1866#if defined(__SSE2__)
1867 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1868#endif
1869 );
1870}
1871
// NV21ToARGBRow_SSSE3: same structure as NV12ToARGBRow_SSSE3, but the
// conversion step is YVUTORGB instead of YUVTORGB (presumably to account
// for the swapped chroma byte order - the macro is defined elsewhere).
//   y_buf    - luma source.
//   vu_buf   - chroma source; it is bound to the asm operand named
//              %[uv_buf] so that the shared READNV12 macro can be reused.
//   argb_buf - destination, stored with movdqa, so it must be 16-byte
//              aligned.
//   width    - pixel count. Each pass handles 8 pixels; the body always
//              runs once and repeats while the count is still positive
//              after subtracting 8.
// Each pass stores 32 bytes; per pixel the byte order in memory is
// xmm0, xmm1, xmm2, 0xff (xmm5 is all ones).
1872void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1873 const uint8* vu_buf,
1874 uint8* argb_buf,
1875 int width) {
1876 asm volatile (
1877 "pcmpeqb %%xmm5,%%xmm5 \n"
1878 "pxor %%xmm4,%%xmm4 \n"
1879 ".p2align 4 \n"
1880 "1: \n"
1881 READNV12
1882 YVUTORGB
1883 "punpcklbw %%xmm1,%%xmm0 \n"
1884 "punpcklbw %%xmm5,%%xmm2 \n"
1885 "movdqa %%xmm0,%%xmm1 \n"
1886 "punpcklwd %%xmm2,%%xmm0 \n"
1887 "punpckhwd %%xmm2,%%xmm1 \n"
1888 "movdqa %%xmm0,(%[argb_buf]) \n"
1889 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1890 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1891 "sub $0x8,%[width] \n"
1892 "jg 1b \n"
1893 : [y_buf]"+r"(y_buf), // %[y_buf]
1894 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1895 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1896 [width]"+rm"(width) // %[width]
1897 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001898 : "memory", "cc"
1899#if defined(__SSE2__)
1900 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1901#endif
1902 );
1903}
1904
// I444ToARGBRow_Unaligned_SSSE3: same conversion as I444ToARGBRow_SSSE3,
// but the two 16-byte stores use movdqu, so argb_buf does not have to be
// 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   argb_buf            - destination, 32 bytes written per pass.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Per pixel the byte order in memory is xmm0, xmm1, xmm2, 0xff.
1905void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1906 const uint8* u_buf,
1907 const uint8* v_buf,
1908 uint8* argb_buf,
1909 int width) {
1910 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001911 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001912 "pcmpeqb %%xmm5,%%xmm5 \n"
1913 "pxor %%xmm4,%%xmm4 \n"
1914 ".p2align 4 \n"
1915 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001916 READYUV444
1917 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001918 "punpcklbw %%xmm1,%%xmm0 \n"
1919 "punpcklbw %%xmm5,%%xmm2 \n"
1920 "movdqa %%xmm0,%%xmm1 \n"
1921 "punpcklwd %%xmm2,%%xmm0 \n"
1922 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001923 "movdqu %%xmm0,(%[argb_buf]) \n"
1924 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1925 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1926 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001927 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001928 : [y_buf]"+r"(y_buf), // %[y_buf]
1929 [u_buf]"+r"(u_buf), // %[u_buf]
1930 [v_buf]"+r"(v_buf), // %[v_buf]
1931 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1932 [width]"+rm"(width) // %[width]
1933 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001934 : "memory", "cc"
1935#if defined(__SSE2__)
1936 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1937#endif
1938 );
1939}
1940
// I422ToARGBRow_Unaligned_SSSE3: same conversion as I422ToARGBRow_SSSE3,
// but the two 16-byte stores use movdqu, so argb_buf does not have to be
// 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   argb_buf            - destination, 32 bytes written per pass.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Per pixel the byte order in memory is xmm0, xmm1, xmm2, 0xff.
1941void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1942 const uint8* u_buf,
1943 const uint8* v_buf,
1944 uint8* argb_buf,
1945 int width) {
1946 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001947 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001948 "pcmpeqb %%xmm5,%%xmm5 \n"
1949 "pxor %%xmm4,%%xmm4 \n"
1950 ".p2align 4 \n"
1951 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001952 READYUV422
1953 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001954 "punpcklbw %%xmm1,%%xmm0 \n"
1955 "punpcklbw %%xmm5,%%xmm2 \n"
1956 "movdqa %%xmm0,%%xmm1 \n"
1957 "punpcklwd %%xmm2,%%xmm0 \n"
1958 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001959 "movdqu %%xmm0,(%[argb_buf]) \n"
1960 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1961 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1962 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001963 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001964 : [y_buf]"+r"(y_buf), // %[y_buf]
1965 [u_buf]"+r"(u_buf), // %[u_buf]
1966 [v_buf]"+r"(v_buf), // %[v_buf]
1967 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1968 [width]"+rm"(width) // %[width]
1969 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001970 : "memory", "cc"
1971#if defined(__SSE2__)
1972 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1973#endif
1974 );
1975}
1976
// I411ToARGBRow_Unaligned_SSSE3: same conversion as I411ToARGBRow_SSSE3,
// but the two 16-byte stores use movdqu, so argb_buf does not have to be
// 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   argb_buf            - destination, 32 bytes written per pass.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Per pixel the byte order in memory is xmm0, xmm1, xmm2, 0xff.
1977void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1978 const uint8* u_buf,
1979 const uint8* v_buf,
1980 uint8* argb_buf,
1981 int width) {
1982 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001983 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001984 "pcmpeqb %%xmm5,%%xmm5 \n"
1985 "pxor %%xmm4,%%xmm4 \n"
1986 ".p2align 4 \n"
1987 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001988 READYUV411
1989 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001990 "punpcklbw %%xmm1,%%xmm0 \n"
1991 "punpcklbw %%xmm5,%%xmm2 \n"
1992 "movdqa %%xmm0,%%xmm1 \n"
1993 "punpcklwd %%xmm2,%%xmm0 \n"
1994 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001995 "movdqu %%xmm0,(%[argb_buf]) \n"
1996 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1997 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1998 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001999 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002000 : [y_buf]"+r"(y_buf), // %[y_buf]
2001 [u_buf]"+r"(u_buf), // %[u_buf]
2002 [v_buf]"+r"(v_buf), // %[v_buf]
2003 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2004 [width]"+rm"(width) // %[width]
2005 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2006 : "memory", "cc"
2007#if defined(__SSE2__)
2008 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2009#endif
2010 );
2011}
2012
// NV12ToARGBRow_Unaligned_SSSE3: same conversion as NV12ToARGBRow_SSSE3,
// but the two 16-byte stores use movdqu, so argb_buf does not have to be
// 16-byte aligned.
//   y_buf    - luma source.
//   uv_buf   - chroma source, bound to the %[uv_buf] operand used by
//              READNV12.
//   argb_buf - destination, 32 bytes written per pass.
//   width    - pixel count. Each pass handles 8 pixels; the body always
//              runs once and repeats while the count is still positive
//              after subtracting 8.
// Per pixel the byte order in memory is xmm0, xmm1, xmm2, 0xff.
2013void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2014 const uint8* uv_buf,
2015 uint8* argb_buf,
2016 int width) {
2017 asm volatile (
2018 "pcmpeqb %%xmm5,%%xmm5 \n"
2019 "pxor %%xmm4,%%xmm4 \n"
2020 ".p2align 4 \n"
2021 "1: \n"
2022 READNV12
2023 YUVTORGB
2024 "punpcklbw %%xmm1,%%xmm0 \n"
2025 "punpcklbw %%xmm5,%%xmm2 \n"
2026 "movdqa %%xmm0,%%xmm1 \n"
2027 "punpcklwd %%xmm2,%%xmm0 \n"
2028 "punpckhwd %%xmm2,%%xmm1 \n"
2029 "movdqu %%xmm0,(%[argb_buf]) \n"
2030 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2031 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2032 "sub $0x8,%[width] \n"
2033 "jg 1b \n"
2034 : [y_buf]"+r"(y_buf), // %[y_buf]
2035 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2036 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2037 [width]"+rm"(width) // %[width]
2038 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2039 : "memory", "cc"
2040#if defined(__SSE2__)
2041 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2042#endif
2043 );
2044}
2045
// NV21ToARGBRow_Unaligned_SSSE3: same conversion as NV21ToARGBRow_SSSE3
// (READNV12 followed by YVUTORGB), but the two 16-byte stores use movdqu,
// so argb_buf does not have to be 16-byte aligned.
//   y_buf    - luma source.
//   vu_buf   - chroma source; bound to the asm operand named %[uv_buf] so
//              the shared READNV12 macro can be reused.
//   argb_buf - destination, 32 bytes written per pass.
//   width    - pixel count. Each pass handles 8 pixels; the body always
//              runs once and repeats while the count is still positive
//              after subtracting 8.
// Per pixel the byte order in memory is xmm0, xmm1, xmm2, 0xff.
2046void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2047 const uint8* vu_buf,
2048 uint8* argb_buf,
2049 int width) {
2050 asm volatile (
2051 "pcmpeqb %%xmm5,%%xmm5 \n"
2052 "pxor %%xmm4,%%xmm4 \n"
2053 ".p2align 4 \n"
2054 "1: \n"
2055 READNV12
2056 YVUTORGB
2057 "punpcklbw %%xmm1,%%xmm0 \n"
2058 "punpcklbw %%xmm5,%%xmm2 \n"
2059 "movdqa %%xmm0,%%xmm1 \n"
2060 "punpcklwd %%xmm2,%%xmm0 \n"
2061 "punpckhwd %%xmm2,%%xmm1 \n"
2062 "movdqu %%xmm0,(%[argb_buf]) \n"
2063 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2064 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2065 "sub $0x8,%[width] \n"
2066 "jg 1b \n"
2067 : [y_buf]"+r"(y_buf), // %[y_buf]
2068 [uv_buf]"+r"(vu_buf), // %[uv_buf]
2069 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2070 [width]"+rm"(width) // %[width]
2071 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002072 : "memory", "cc"
2073#if defined(__SSE2__)
2074 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2075#endif
2076 );
2077}
2078
// I422ToBGRARow_SSSE3: converts a row of planar Y/U/V samples to 4-byte
// pixels, weaving the channels in the reverse order of I422ToARGBRow_SSSE3.
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   bgra_buf            - destination (bound to the operand named
//                         %[argb_buf]); stored with movdqa, so it must be
//                         16-byte aligned.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Per pixel the byte order in memory is 0xff, xmm2, xmm1, xmm0, where
// xmm0..xmm2 are the YUVTORGB outputs. xmm5 is used as the weave
// destination and therefore has to be refilled with all ones on every pass
// (the pcmpeqb inside the loop).
2079void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2080 const uint8* u_buf,
2081 const uint8* v_buf,
2082 uint8* bgra_buf,
2083 int width) {
2084 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002085 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002086 "pcmpeqb %%xmm5,%%xmm5 \n"
2087 "pxor %%xmm4,%%xmm4 \n"
2088 ".p2align 4 \n"
2089 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002090 READYUV422
2091 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002092 "pcmpeqb %%xmm5,%%xmm5 \n"
2093 "punpcklbw %%xmm0,%%xmm1 \n"
2094 "punpcklbw %%xmm2,%%xmm5 \n"
2095 "movdqa %%xmm5,%%xmm0 \n"
2096 "punpcklwd %%xmm1,%%xmm5 \n"
2097 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002098 "movdqa %%xmm5,(%[argb_buf]) \n"
2099 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2100 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2101 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002102 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002103 : [y_buf]"+r"(y_buf), // %[y_buf]
2104 [u_buf]"+r"(u_buf), // %[u_buf]
2105 [v_buf]"+r"(v_buf), // %[v_buf]
2106 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2107 [width]"+rm"(width) // %[width]
2108 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002109 : "memory", "cc"
2110#if defined(__SSE2__)
2111 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2112#endif
2113 );
2114}
2115
// I422ToABGRRow_SSSE3: converts a row of planar Y/U/V samples to 4-byte
// pixels with the first and third colour bytes swapped relative to
// I422ToARGBRow_SSSE3.
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   abgr_buf            - destination (bound to the operand named
//                         %[argb_buf]); stored with movdqa, so it must be
//                         16-byte aligned.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Per pixel the byte order in memory is xmm2, xmm1, xmm0, 0xff, where
// xmm0..xmm2 are the YUVTORGB outputs and xmm5 is all ones.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002116void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002117 const uint8* u_buf,
2118 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002119 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002120 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002121 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002122 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002123 "pcmpeqb %%xmm5,%%xmm5 \n"
2124 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002125 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002126 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002127 READYUV422
2128 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002129 "punpcklbw %%xmm1,%%xmm2 \n"
2130 "punpcklbw %%xmm5,%%xmm0 \n"
2131 "movdqa %%xmm2,%%xmm1 \n"
2132 "punpcklwd %%xmm0,%%xmm2 \n"
2133 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002134 "movdqa %%xmm2,(%[argb_buf]) \n"
2135 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
2136 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2137 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002138 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002139 : [y_buf]"+r"(y_buf), // %[y_buf]
2140 [u_buf]"+r"(u_buf), // %[u_buf]
2141 [v_buf]"+r"(v_buf), // %[v_buf]
2142 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2143 [width]"+rm"(width) // %[width]
2144 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002145 : "memory", "cc"
2146#if defined(__SSE2__)
2147 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2148#endif
2149 );
2150}
2151
// I422ToRGBARow_SSSE3: converts a row of planar Y/U/V samples to 4-byte
// pixels with the constant 0xff byte placed first.
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   rgba_buf            - destination (bound to the operand named
//                         %[argb_buf]); stored with movdqa, so it must be
//                         16-byte aligned.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Per pixel the byte order in memory is 0xff, xmm0, xmm1, xmm2, where
// xmm0..xmm2 are the YUVTORGB outputs. xmm5 is used as the weave
// destination and therefore has to be refilled with all ones on every pass
// (the pcmpeqb inside the loop).
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002152void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2153 const uint8* u_buf,
2154 const uint8* v_buf,
2155 uint8* rgba_buf,
2156 int width) {
2157 asm volatile (
2158 "sub %[u_buf],%[v_buf] \n"
2159 "pcmpeqb %%xmm5,%%xmm5 \n"
2160 "pxor %%xmm4,%%xmm4 \n"
2161 ".p2align 4 \n"
2162 "1: \n"
2163 READYUV422
2164 YUVTORGB
2165 "pcmpeqb %%xmm5,%%xmm5 \n"
2166 "punpcklbw %%xmm2,%%xmm1 \n"
2167 "punpcklbw %%xmm0,%%xmm5 \n"
2168 "movdqa %%xmm5,%%xmm0 \n"
2169 "punpcklwd %%xmm1,%%xmm5 \n"
2170 "punpckhwd %%xmm1,%%xmm0 \n"
2171 "movdqa %%xmm5,(%[argb_buf]) \n"
2172 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2173 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2174 "sub $0x8,%[width] \n"
2175 "jg 1b \n"
2176 : [y_buf]"+r"(y_buf), // %[y_buf]
2177 [u_buf]"+r"(u_buf), // %[u_buf]
2178 [v_buf]"+r"(v_buf), // %[v_buf]
2179 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2180 [width]"+rm"(width) // %[width]
2181 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2182 : "memory", "cc"
2183#if defined(__SSE2__)
2184 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2185#endif
2186 );
2187}
2188
// I422ToBGRARow_Unaligned_SSSE3: same conversion as I422ToBGRARow_SSSE3,
// but the two 16-byte stores use movdqu, so bgra_buf does not have to be
// 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   bgra_buf            - destination (bound to the operand named
//                         %[argb_buf]); 32 bytes written per pass.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Per pixel the byte order in memory is 0xff, xmm2, xmm1, xmm0. xmm5 is
// refilled with all ones on every pass because it is the weave destination.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002189void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002190 const uint8* u_buf,
2191 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002192 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002193 int width) {
2194 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002195 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002196 "pcmpeqb %%xmm5,%%xmm5 \n"
2197 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002198 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002199 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002200 READYUV422
2201 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002202 "pcmpeqb %%xmm5,%%xmm5 \n"
2203 "punpcklbw %%xmm0,%%xmm1 \n"
2204 "punpcklbw %%xmm2,%%xmm5 \n"
2205 "movdqa %%xmm5,%%xmm0 \n"
2206 "punpcklwd %%xmm1,%%xmm5 \n"
2207 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002208 "movdqu %%xmm5,(%[argb_buf]) \n"
2209 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
2210 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2211 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002212 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002213 : [y_buf]"+r"(y_buf), // %[y_buf]
2214 [u_buf]"+r"(u_buf), // %[u_buf]
2215 [v_buf]"+r"(v_buf), // %[v_buf]
2216 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2217 [width]"+rm"(width) // %[width]
2218 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002219 : "memory", "cc"
2220#if defined(__SSE2__)
2221 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2222#endif
2223 );
2224}
2225
// I422ToABGRRow_Unaligned_SSSE3: same conversion as I422ToABGRRow_SSSE3,
// but the two 16-byte stores use movdqu, so abgr_buf does not have to be
// 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes; v_buf is rewritten as an offset
//                         from u_buf by the leading "sub".
//   abgr_buf            - destination (bound to the operand named
//                         %[argb_buf]); 32 bytes written per pass.
//   width               - pixel count. Each pass handles 8 pixels; the body
//                         always runs once and repeats while the count is
//                         still positive after subtracting 8.
// Per pixel the byte order in memory is xmm2, xmm1, xmm0, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002226void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002227 const uint8* u_buf,
2228 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002229 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002230 int width) {
2231 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002232 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002233 "pcmpeqb %%xmm5,%%xmm5 \n"
2234 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002235 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002236 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002237 READYUV422
2238 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002239 "punpcklbw %%xmm1,%%xmm2 \n"
2240 "punpcklbw %%xmm5,%%xmm0 \n"
2241 "movdqa %%xmm2,%%xmm1 \n"
2242 "punpcklwd %%xmm0,%%xmm2 \n"
2243 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002244 "movdqu %%xmm2,(%[argb_buf]) \n"
2245 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2246 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2247 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002248 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002249 : [y_buf]"+r"(y_buf), // %[y_buf]
2250 [u_buf]"+r"(u_buf), // %[u_buf]
2251 [v_buf]"+r"(v_buf), // %[v_buf]
2252 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2253 [width]"+rm"(width) // %[width]
2254 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002255 : "memory", "cc"
2256#if defined(__SSE2__)
2257 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2258#endif
2259 );
2260}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002261
2262void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2263 const uint8* u_buf,
2264 const uint8* v_buf,
2265 uint8* rgba_buf,
2266 int width) {
2267 asm volatile (
2268 "sub %[u_buf],%[v_buf] \n"
2269 "pcmpeqb %%xmm5,%%xmm5 \n"
2270 "pxor %%xmm4,%%xmm4 \n"
2271 ".p2align 4 \n"
2272 "1: \n"
2273 READYUV422
2274 YUVTORGB
2275 "pcmpeqb %%xmm5,%%xmm5 \n"
2276 "punpcklbw %%xmm2,%%xmm1 \n"
2277 "punpcklbw %%xmm0,%%xmm5 \n"
2278 "movdqa %%xmm5,%%xmm0 \n"
2279 "punpcklwd %%xmm1,%%xmm5 \n"
2280 "punpckhwd %%xmm1,%%xmm0 \n"
2281 "movdqa %%xmm5,(%[argb_buf]) \n"
2282 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2283 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2284 "sub $0x8,%[width] \n"
2285 "jg 1b \n"
2286 : [y_buf]"+r"(y_buf), // %[y_buf]
2287 [u_buf]"+r"(u_buf), // %[u_buf]
2288 [v_buf]"+r"(v_buf), // %[v_buf]
2289 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2290 [width]"+rm"(width) // %[width]
2291 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2292 : "memory", "cc"
2293#if defined(__SSE2__)
2294 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2295#endif
2296 );
2297}
2298
fbarchard@google.come214fe32012-06-04 23:47:11 +00002299#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002300
2301#ifdef HAS_YTOARGBROW_SSE2
// YToARGBRow_SSE2: expands a row of 8-bit Y samples into 4-byte pixels
// whose first three bytes all hold the same scaled value and whose fourth
// byte is 0xff.
//   y_buf   - source; 8 bytes are read per pass.
//   rgb_buf - destination; 32 bytes are written per pass with movdqa, so it
//             must be 16-byte aligned.
//   width   - pixel count. Each pass handles 8 pixels; the body always runs
//             once and repeats while the count is still positive after
//             subtracting 8.
// Constants built in registers before the loop:
//   xmm4 = 0xff000000 in every dword (all ones shifted left by 24)
//   xmm3 = 0x1000 in every word, i.e. 16 << 8
//   xmm2 = 0x012a in every word, i.e. 298
// In the loop punpcklbw duplicates each Y byte into both halves of a word,
// psubusw removes 0x1000 with unsigned saturation and pmulhuw keeps the
// high 16 bits of the product with 0x012a. eax is used as scratch and is
// listed in the clobbers.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002302void YToARGBRow_SSE2(const uint8* y_buf,
2303 uint8* rgb_buf,
2304 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002305 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002306 "pcmpeqb %%xmm4,%%xmm4 \n"
2307 "pslld $0x18,%%xmm4 \n"
2308 "mov $0x10001000,%%eax \n"
2309 "movd %%eax,%%xmm3 \n"
2310 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2311 "mov $0x012a012a,%%eax \n"
2312 "movd %%eax,%%xmm2 \n"
2313 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002314 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002315 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002316 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002317 "movq (%0),%%xmm0 \n"
2318 "lea 0x8(%0),%0 \n"
2319 "punpcklbw %%xmm0,%%xmm0 \n"
2320 "psubusw %%xmm3,%%xmm0 \n"
2321 "pmulhuw %%xmm2,%%xmm0 \n"
2322 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002323
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002324 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002325 "punpcklbw %%xmm0,%%xmm0 \n"
2326 "movdqa %%xmm0,%%xmm1 \n"
2327 "punpcklwd %%xmm0,%%xmm0 \n"
2328 "punpckhwd %%xmm1,%%xmm1 \n"
2329 "por %%xmm4,%%xmm0 \n"
2330 "por %%xmm4,%%xmm1 \n"
2331 "movdqa %%xmm0,(%1) \n"
2332 "movdqa %%xmm1,16(%1) \n"
2333 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002334
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002335 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002336 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00002337 : "+r"(y_buf), // %0
2338 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00002339 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002340 :
2341 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002342#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002343 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002344#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002345 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002346}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002347#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002348
fbarchard@google.com42831e02012-01-21 02:54:17 +00002349#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002350// Shuffle table for reversing the bytes.
// Used as a pshufb control: output byte i is taken from input byte 15 - i.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002351CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002352 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2353};
2354
// MirrorRow_SSSE3: writes the bytes of src to dst in reverse order,
// 16 bytes per pass.
//   src   - source row. It is first moved back by 16 so that
//           (src + remaining width) addresses the last unread 16-byte
//           chunk; chunks are therefore read from the end of the row
//           towards the start.
//   dst   - destination row, written front to back.
//   width - byte count, widened to intptr_t so it can be used as an index
//           register.
// Both the load and the store use movdqa, so (src + width) and dst must be
// 16-byte aligned. The "sub" sets the flags tested by "jg"; the movdqa and
// lea between them leave the flags untouched.
// NOTE(review): if width is not a multiple of 16 the final pass loads from
// an address below src - callers presumably guarantee a multiple of 16.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002355void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002356 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002357 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002358 "movdqa %3,%%xmm5 \n"
2359 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002360 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002361 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002362 "movdqa (%0,%2),%%xmm0 \n"
2363 "pshufb %%xmm5,%%xmm0 \n"
2364 "sub $0x10,%2 \n"
2365 "movdqa %%xmm0,(%1) \n"
2366 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002367 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002368 : "+r"(src), // %0
2369 "+r"(dst), // %1
2370 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002371 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002372 : "memory", "cc"
2373#if defined(__SSE2__)
2374 , "xmm0", "xmm5"
2375#endif
2376 );
2377}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002378#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002379
fbarchard@google.com42831e02012-01-21 02:54:17 +00002380#ifdef HAS_MIRRORROW_SSE2
// MirrorRow_SSE2: writes the bytes of src to dst in reverse order,
// 16 bytes per pass, without SSSE3's pshufb.
//   src   - source row. It is first moved back by 16 so that
//           (src + remaining width) addresses the last unread 16-byte
//           chunk; chunks are read from the end of the row towards the
//           start.
//   dst   - destination row, written front to back.
//   width - byte count, widened to intptr_t so it can be used as an index
//           register.
// The 16-byte reversal is done in three steps: the psllw/psrlw/por trio
// swaps the two bytes inside every word, pshuflw/pshufhw with 0x1b reverse
// the four words of each 64-bit half, and pshufd 0x4e swaps the two halves.
// Loads and stores use movdqu, so no alignment is required.
// NOTE(review): if width is not a multiple of 16 the final pass loads from
// an address below src - callers presumably guarantee a multiple of 16.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002381void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002382 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002383 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002384 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002385 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002386 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002387 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002388 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002389 "psllw $0x8,%%xmm0 \n"
2390 "psrlw $0x8,%%xmm1 \n"
2391 "por %%xmm1,%%xmm0 \n"
2392 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2393 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2394 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2395 "sub $0x10,%2 \n"
2396 "movdqu %%xmm0,(%1) \n"
2397 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002398 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002399 : "+r"(src), // %0
2400 "+r"(dst), // %1
2401 "+r"(temp_width) // %2
2402 :
2403 : "memory", "cc"
2404#if defined(__SSE2__)
2405 , "xmm0", "xmm1"
2406#endif
2407 );
2408}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002409#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002410
fbarchard@google.com16a96642012-03-02 22:38:09 +00002411#ifdef HAS_MIRRORROW_UV_SSSE3
2412// Shuffle table for reversing the bytes of UV channels.
// Used as a pshufb control: the low 8 output bytes are the even-indexed
// input bytes in reverse order, the high 8 output bytes are the
// odd-indexed input bytes in reverse order.
2413CONST uvec8 kShuffleMirrorUV = {
2414 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2415};
// MirrorRowUV_SSSE3: reads a row of interleaved byte pairs back to front
// and writes the two de-interleaved, reversed halves to dst_u and dst_v.
//   src   - interleaved source. It is first advanced to
//           src + 2 * width - 16, i.e. the last 16-byte chunk, and then
//           stepped backwards by 16 per pass. Loaded with movdqa, so that
//           address must be 16-byte aligned.
//   dst_u - receives the low 8 shuffled bytes (movlpd) each pass.
//   dst_v - receives the high 8 shuffled bytes (movhpd) each pass; it is
//           rewritten as an offset from dst_u by the leading "sub" so one
//           pointer increment advances both outputs.
//   width - number of pairs; 8 pairs are handled per pass, and the body
//           repeats while the count is still positive after subtracting 8.
// The "sub $8" sets the flags tested by "jg"; the stores and lea between
// them leave the flags untouched.
2416void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2417 int width) {
2418 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002419 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002420 "movdqa %4,%%xmm1 \n"
2421 "lea -16(%0,%3,2),%0 \n"
2422 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002423 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002424 "1: \n"
2425 "movdqa (%0),%%xmm0 \n"
2426 "lea -16(%0),%0 \n"
2427 "pshufb %%xmm1,%%xmm0 \n"
2428 "sub $8,%3 \n"
2429 "movlpd %%xmm0,(%1) \n"
2430 "movhpd %%xmm0,(%1,%2) \n"
2431 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002432 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002433 : "+r"(src), // %0
2434 "+r"(dst_u), // %1
2435 "+r"(dst_v), // %2
2436 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002437 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002438 : "memory", "cc"
2439#if defined(__SSE2__)
2440 , "xmm0", "xmm1"
2441#endif
2442 );
2443}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002444#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002445
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002446#ifdef HAS_ARGBMIRRORROW_SSSE3
2447// Shuffle table for reversing the order of the four 4-byte groups in a
// 16-byte vector; the byte order inside each group is kept (pshufb control).
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002448CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002449 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2450};
2451
// Writes the 4-byte pixels of |src| to |dst| in reverse pixel order; bytes
// inside a pixel keep their order.
// |width| is counted in 4-byte pixels (the load address scales it by 4).
// Each pass handles 4 pixels (16 bytes), starting with the 16 bytes that end
// at src + 4 * width; width drops by 4 per pass and the loop repeats while it
// is > 0.
// Both the load and the store are movdqa, so the addresses accessed must be
// 16-byte aligned.
2452void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2453 intptr_t temp_width = static_cast<intptr_t>(width);
2454 asm volatile (
2455 "movdqa %3,%%xmm5 \n"
2456 "lea -0x10(%0),%0 \n"
2457 ".p2align 4 \n"
2458 "1: \n"
2459 "movdqa (%0,%2,4),%%xmm0 \n"
2460 "pshufb %%xmm5,%%xmm0 \n"
2461 "sub $0x4,%2 \n"
2462 "movdqa %%xmm0,(%1) \n"
2463 "lea 0x10(%1),%1 \n"
2464 "jg 1b \n"
2465 : "+r"(src), // %0
2466 "+r"(dst), // %1
2467 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002468 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002469 : "memory", "cc"
2470#if defined(__SSE2__)
2471 , "xmm0", "xmm5"
2472#endif
2473 );
2474}
2475#endif // HAS_ARGBMIRRORROW_SSSE3
2476
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002477#ifdef HAS_SPLITUV_SSE2
// De-interleaves |src_uv|: even-indexed bytes go to |dst_u|, odd-indexed
// bytes go to |dst_v|.
// Each pass reads 32 source bytes and writes 16 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// All loads and stores are movdqa, so the three pointers must be 16-byte
// aligned.
2478void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002479 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask for the even-indexed bytes).
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002480 "pcmpeqb %%xmm5,%%xmm5 \n"
2481 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2482 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002483 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002484 "1: \n"
2485 "movdqa (%0),%%xmm0 \n"
2486 "movdqa 0x10(%0),%%xmm1 \n"
2487 "lea 0x20(%0),%0 \n"
2488 "movdqa %%xmm0,%%xmm2 \n"
2489 "movdqa %%xmm1,%%xmm3 \n"
// Even-indexed bytes -> xmm0.
2490 "pand %%xmm5,%%xmm0 \n"
2491 "pand %%xmm5,%%xmm1 \n"
2492 "packuswb %%xmm1,%%xmm0 \n"
// Odd-indexed bytes -> xmm2.
2493 "psrlw $0x8,%%xmm2 \n"
2494 "psrlw $0x8,%%xmm3 \n"
2495 "packuswb %%xmm3,%%xmm2 \n"
2496 "movdqa %%xmm0,(%1) \n"
2497 "movdqa %%xmm2,(%1,%2) \n"
2498 "lea 0x10(%1),%1 \n"
2499 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002500 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002501 : "+r"(src_uv), // %0
2502 "+r"(dst_u), // %1
2503 "+r"(dst_v), // %2
2504 "+r"(pix) // %3
2505 :
2506 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002507#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002508 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002509#endif
2510 );
2511}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002512
// De-interleaves |src_uv|: even-indexed bytes go to |dst_u|, odd-indexed
// bytes go to |dst_v|.
// Each pass reads 32 source bytes and writes 16 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// Memory accesses use movdqu, so no pointer alignment is required.
2513void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2514 int pix) {
2515 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask for the even-indexed bytes).
2516 "pcmpeqb %%xmm5,%%xmm5 \n"
2517 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2518 "sub %1,%2 \n"
2519 ".p2align 4 \n"
2520 "1: \n"
2521 "movdqu (%0),%%xmm0 \n"
2522 "movdqu 0x10(%0),%%xmm1 \n"
2523 "lea 0x20(%0),%0 \n"
2524 "movdqa %%xmm0,%%xmm2 \n"
2525 "movdqa %%xmm1,%%xmm3 \n"
// Even-indexed bytes -> xmm0.
2526 "pand %%xmm5,%%xmm0 \n"
2527 "pand %%xmm5,%%xmm1 \n"
2528 "packuswb %%xmm1,%%xmm0 \n"
// Odd-indexed bytes -> xmm2.
2529 "psrlw $0x8,%%xmm2 \n"
2530 "psrlw $0x8,%%xmm3 \n"
2531 "packuswb %%xmm3,%%xmm2 \n"
2532 "movdqu %%xmm0,(%1) \n"
2533 "movdqu %%xmm2,(%1,%2) \n"
2534 "lea 0x10(%1),%1 \n"
2535 "sub $0x10,%3 \n"
2536 "jg 1b \n"
2537 : "+r"(src_uv), // %0
2538 "+r"(dst_u), // %1
2539 "+r"(dst_v), // %2
2540 "+r"(pix) // %3
2541 :
2542 : "memory", "cc"
2543#if defined(__SSE2__)
2544 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2545#endif
2546 );
2547}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002548#endif // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002549
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002550#ifdef HAS_MERGEUV_SSE2
// Interleaves two byte rows: dst_uv receives src_u[0], src_v[0], src_u[1],
// src_v[1], ...
// Each pass reads 16 bytes from each source and writes 32 bytes; width drops
// by 16 per pass and the loop repeats while it is > 0.
// All loads and stores are movdqa, so the three pointers must be 16-byte
// aligned.
2551void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2552 int width) {
2553 asm volatile (
// Turn src_v into an offset from src_u so only one source pointer advances.
2554 "sub %0,%1 \n"
2555 ".p2align 4 \n"
2556 "1: \n"
2557 "movdqa (%0),%%xmm0 \n"
2558 "movdqa (%0,%1,1),%%xmm1 \n"
2559 "lea 0x10(%0),%0 \n"
2560 "movdqa %%xmm0,%%xmm2 \n"
// Interleave the low 8 byte pairs into xmm0 and the high 8 into xmm2.
2561 "punpcklbw %%xmm1,%%xmm0 \n"
2562 "punpckhbw %%xmm1,%%xmm2 \n"
2563 "movdqa %%xmm0,(%2) \n"
2564 "movdqa %%xmm2,0x10(%2) \n"
2565 "lea 0x20(%2),%2 \n"
2566 "sub $0x10,%3 \n"
2567 "jg 1b \n"
2568 : "+r"(src_u), // %0
2569 "+r"(src_v), // %1
2570 "+r"(dst_uv), // %2
2571 "+r"(width) // %3
2572 :
2573 : "memory", "cc"
2574#if defined(__SSE2__)
2575 , "xmm0", "xmm1", "xmm2"
2576#endif
2577 );
2578}
2579#endif // HAS_MERGEUV_SSE2
2580
fbarchard@google.com19932f82012-02-16 22:19:14 +00002581#ifdef HAS_COPYROW_SSE2
// Copies |count| bytes from |src| to |dst|, 32 bytes per pass.
// count drops by 32 per pass and the loop repeats while it is > 0, so the
// amount copied is count rounded up to a multiple of 32.
// All loads and stores are movdqa, so both pointers must be 16-byte aligned.
2582void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002583 asm volatile (
// Turn dst into an offset from src so only one pointer is advanced.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002584 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002585 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002586 "1: \n"
2587 "movdqa (%0),%%xmm0 \n"
2588 "movdqa 0x10(%0),%%xmm1 \n"
2589 "movdqa %%xmm0,(%0,%1) \n"
2590 "movdqa %%xmm1,0x10(%0,%1) \n"
2591 "lea 0x20(%0),%0 \n"
2592 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002593 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002594 : "+r"(src), // %0
2595 "+r"(dst), // %1
2596 "+r"(count) // %2
2597 :
2598 : "memory", "cc"
2599#if defined(__SSE2__)
2600 , "xmm0", "xmm1"
2601#endif
2602 );
2603}
2604#endif // HAS_COPYROW_SSE2
2605
2606#ifdef HAS_COPYROW_X86
2607void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2608 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002609 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002610 "shr $0x2,%2 \n"
2611 "rep movsl \n"
2612 : "+S"(src), // %0
2613 "+D"(dst), // %1
2614 "+c"(width_tmp) // %2
2615 :
2616 : "memory", "cc"
2617 );
2618}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002619#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002620
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002621#ifdef HAS_SETROW_X86
2622void SetRow8_X86(uint8* dst, uint32 v32, int width) {
2623 size_t width_tmp = static_cast<size_t>(width);
2624 asm volatile (
2625 "shr $0x2,%1 \n"
2626 "rep stosl \n"
2627 : "+D"(dst), // %0
2628 "+c"(width_tmp) // %1
2629 : "a"(v32) // %2
2630 : "memory", "cc");
2631}
2632
2633void SetRows32_X86(uint8* dst, uint32 v32, int width,
2634 int dst_stride, int height) {
2635 for (int y = 0; y < height; ++y) {
2636 size_t width_tmp = static_cast<size_t>(width);
2637 uint32* d = reinterpret_cast<uint32*>(dst);
2638 asm volatile (
2639 "rep stosl \n"
2640 : "+D"(d), // %0
2641 "+c"(width_tmp) // %1
2642 : "a"(v32) // %2
2643 : "memory", "cc");
2644 dst += dst_stride;
2645 }
2646}
2647#endif // HAS_SETROW_X86
2648
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002649#ifdef HAS_YUY2TOYROW_SSE2
// Copies the even-indexed bytes of |src_yuy2| to |dst_y|.
// Each pass reads 32 source bytes and writes 16 bytes; pix drops by 16 per
// pass and the loop repeats while it is > 0.
// All loads and stores are movdqa, so both pointers must be 16-byte aligned.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002650void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002651 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask for the even-indexed bytes).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002652 "pcmpeqb %%xmm5,%%xmm5 \n"
2653 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002654 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002655 "1: \n"
2656 "movdqa (%0),%%xmm0 \n"
2657 "movdqa 0x10(%0),%%xmm1 \n"
2658 "lea 0x20(%0),%0 \n"
2659 "pand %%xmm5,%%xmm0 \n"
2660 "pand %%xmm5,%%xmm1 \n"
2661 "packuswb %%xmm1,%%xmm0 \n"
2662 "movdqa %%xmm0,(%1) \n"
2663 "lea 0x10(%1),%1 \n"
2664 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002665 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002666 : "+r"(src_yuy2), // %0
2667 "+r"(dst_y), // %1
2668 "+r"(pix) // %2
2669 :
2670 : "memory", "cc"
2671#if defined(__SSE2__)
2672 , "xmm0", "xmm1", "xmm5"
2673#endif
2674 );
2675}
2676
// Extracts the odd-indexed bytes of two source rows, averaged together, and
// splits them alternately between |dst_u| and |dst_v|.
// The second row is read at src_yuy2 + stride_yuy2 and combined with the
// first by pavgb (rounded byte average). Of the odd-indexed bytes, source
// bytes 1, 5, 9, ... go to dst_u and source bytes 3, 7, 11, ... go to dst_v.
// Each pass reads 32 bytes per row and writes 8 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// Source loads are movdqa, so both source rows must be 16-byte aligned.
2677void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002678 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002679 asm volatile (
// xmm5 = 0x00ff in every 16-bit word.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002680 "pcmpeqb %%xmm5,%%xmm5 \n"
2681 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2682 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002683 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002684 "1: \n"
2685 "movdqa (%0),%%xmm0 \n"
2686 "movdqa 0x10(%0),%%xmm1 \n"
2687 "movdqa (%0,%4,1),%%xmm2 \n"
2688 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2689 "lea 0x20(%0),%0 \n"
2690 "pavgb %%xmm2,%%xmm0 \n"
2691 "pavgb %%xmm3,%%xmm1 \n"
// Keep the odd-indexed bytes: 16 bytes alternating between the two outputs.
2692 "psrlw $0x8,%%xmm0 \n"
2693 "psrlw $0x8,%%xmm1 \n"
2694 "packuswb %%xmm1,%%xmm0 \n"
// Split the alternating bytes: even positions -> xmm0, odd positions -> xmm1.
2695 "movdqa %%xmm0,%%xmm1 \n"
2696 "pand %%xmm5,%%xmm0 \n"
2697 "packuswb %%xmm0,%%xmm0 \n"
2698 "psrlw $0x8,%%xmm1 \n"
2699 "packuswb %%xmm1,%%xmm1 \n"
2700 "movq %%xmm0,(%1) \n"
2701 "movq %%xmm1,(%1,%2) \n"
2702 "lea 0x8(%1),%1 \n"
2703 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002704 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002705 : "+r"(src_yuy2), // %0
2706 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002707 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002708 "+r"(pix) // %3
2709 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2710 : "memory", "cc"
2711#if defined(__SSE2__)
2712 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2713#endif
2714 );
2715}
2716
// Extracts the odd-indexed bytes of a single source row and splits them
// alternately between |dst_u| and |dst_v|: source bytes 1, 5, 9, ... go to
// dst_u and source bytes 3, 7, 11, ... go to dst_v. No second row is read.
// Each pass reads 32 source bytes and writes 8 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// Source loads are movdqa, so src_yuy2 must be 16-byte aligned.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002717void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2718 uint8* dst_u, uint8* dst_v, int pix) {
2719 asm volatile (
// xmm5 = 0x00ff in every 16-bit word.
2720 "pcmpeqb %%xmm5,%%xmm5 \n"
2721 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2722 "sub %1,%2 \n"
2723 ".p2align 4 \n"
2724 "1: \n"
2725 "movdqa (%0),%%xmm0 \n"
2726 "movdqa 0x10(%0),%%xmm1 \n"
2727 "lea 0x20(%0),%0 \n"
// Keep the odd-indexed bytes.
2728 "psrlw $0x8,%%xmm0 \n"
2729 "psrlw $0x8,%%xmm1 \n"
2730 "packuswb %%xmm1,%%xmm0 \n"
// Split the alternating bytes: even positions -> xmm0, odd positions -> xmm1.
2731 "movdqa %%xmm0,%%xmm1 \n"
2732 "pand %%xmm5,%%xmm0 \n"
2733 "packuswb %%xmm0,%%xmm0 \n"
2734 "psrlw $0x8,%%xmm1 \n"
2735 "packuswb %%xmm1,%%xmm1 \n"
2736 "movq %%xmm0,(%1) \n"
2737 "movq %%xmm1,(%1,%2) \n"
2738 "lea 0x8(%1),%1 \n"
2739 "sub $0x10,%3 \n"
2740 "jg 1b \n"
2741 : "+r"(src_yuy2), // %0
2742 "+r"(dst_u), // %1
2743 "+r"(dst_v), // %2
2744 "+r"(pix) // %3
2745 :
2746 : "memory", "cc"
2747#if defined(__SSE2__)
2748 , "xmm0", "xmm1", "xmm5"
2749#endif
2750 );
2751}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002752
// Copies the even-indexed bytes of |src_yuy2| to |dst_y|.
// Each pass reads 32 source bytes and writes 16 bytes; pix drops by 16 per
// pass and the loop repeats while it is > 0.
// Memory accesses use movdqu, so no pointer alignment is required.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002753void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2754 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002755 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask for the even-indexed bytes).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002756 "pcmpeqb %%xmm5,%%xmm5 \n"
2757 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002758 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002759 "1: \n"
2760 "movdqu (%0),%%xmm0 \n"
2761 "movdqu 0x10(%0),%%xmm1 \n"
2762 "lea 0x20(%0),%0 \n"
2763 "pand %%xmm5,%%xmm0 \n"
2764 "pand %%xmm5,%%xmm1 \n"
2765 "packuswb %%xmm1,%%xmm0 \n"
// The flags set by this sub are the ones tested by jg below; the movdqu and
// lea in between do not modify flags.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002766 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002767 "movdqu %%xmm0,(%1) \n"
2768 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002769 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002770 : "+r"(src_yuy2), // %0
2771 "+r"(dst_y), // %1
2772 "+r"(pix) // %2
2773 :
2774 : "memory", "cc"
2775#if defined(__SSE2__)
2776 , "xmm0", "xmm1", "xmm5"
2777#endif
2778 );
2779}
2780
// Extracts the odd-indexed bytes of two source rows, averaged together, and
// splits them alternately between |dst_u| and |dst_v|.
// The second row is read at src_yuy2 + stride_yuy2 and combined with the
// first by pavgb (rounded byte average). Of the odd-indexed bytes, source
// bytes 1, 5, 9, ... go to dst_u and source bytes 3, 7, 11, ... go to dst_v.
// Each pass reads 32 bytes per row and writes 8 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// Memory accesses use movdqu/movq, so no pointer alignment is required.
2781void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2782 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002783 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002784 asm volatile (
// xmm5 = 0x00ff in every 16-bit word.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002785 "pcmpeqb %%xmm5,%%xmm5 \n"
2786 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2787 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002788 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002789 "1: \n"
2790 "movdqu (%0),%%xmm0 \n"
2791 "movdqu 0x10(%0),%%xmm1 \n"
2792 "movdqu (%0,%4,1),%%xmm2 \n"
2793 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2794 "lea 0x20(%0),%0 \n"
2795 "pavgb %%xmm2,%%xmm0 \n"
2796 "pavgb %%xmm3,%%xmm1 \n"
// Keep the odd-indexed bytes: 16 bytes alternating between the two outputs.
2797 "psrlw $0x8,%%xmm0 \n"
2798 "psrlw $0x8,%%xmm1 \n"
2799 "packuswb %%xmm1,%%xmm0 \n"
// Split the alternating bytes: even positions -> xmm0, odd positions -> xmm1.
2800 "movdqa %%xmm0,%%xmm1 \n"
2801 "pand %%xmm5,%%xmm0 \n"
2802 "packuswb %%xmm0,%%xmm0 \n"
2803 "psrlw $0x8,%%xmm1 \n"
2804 "packuswb %%xmm1,%%xmm1 \n"
2805 "movq %%xmm0,(%1) \n"
2806 "movq %%xmm1,(%1,%2) \n"
2807 "lea 0x8(%1),%1 \n"
2808 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002809 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002810 : "+r"(src_yuy2), // %0
2811 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002812 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002813 "+r"(pix) // %3
2814 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2815 : "memory", "cc"
2816#if defined(__SSE2__)
2817 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2818#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002819 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002820}
2821
// Extracts the odd-indexed bytes of a single source row and splits them
// alternately between |dst_u| and |dst_v|: source bytes 1, 5, 9, ... go to
// dst_u and source bytes 3, 7, 11, ... go to dst_v. No second row is read.
// Each pass reads 32 source bytes and writes 8 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// Memory accesses use movdqu/movq, so no pointer alignment is required.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002822void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2823 uint8* dst_u, uint8* dst_v, int pix) {
2824 asm volatile (
// xmm5 = 0x00ff in every 16-bit word.
2825 "pcmpeqb %%xmm5,%%xmm5 \n"
2826 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2827 "sub %1,%2 \n"
2828 ".p2align 4 \n"
2829 "1: \n"
2830 "movdqu (%0),%%xmm0 \n"
2831 "movdqu 0x10(%0),%%xmm1 \n"
2832 "lea 0x20(%0),%0 \n"
// Keep the odd-indexed bytes.
2833 "psrlw $0x8,%%xmm0 \n"
2834 "psrlw $0x8,%%xmm1 \n"
2835 "packuswb %%xmm1,%%xmm0 \n"
// Split the alternating bytes: even positions -> xmm0, odd positions -> xmm1.
2836 "movdqa %%xmm0,%%xmm1 \n"
2837 "pand %%xmm5,%%xmm0 \n"
2838 "packuswb %%xmm0,%%xmm0 \n"
2839 "psrlw $0x8,%%xmm1 \n"
2840 "packuswb %%xmm1,%%xmm1 \n"
2841 "movq %%xmm0,(%1) \n"
2842 "movq %%xmm1,(%1,%2) \n"
2843 "lea 0x8(%1),%1 \n"
2844 "sub $0x10,%3 \n"
2845 "jg 1b \n"
2846 : "+r"(src_yuy2), // %0
2847 "+r"(dst_u), // %1
2848 "+r"(dst_v), // %2
2849 "+r"(pix) // %3
2850 :
2851 : "memory", "cc"
2852#if defined(__SSE2__)
2853 , "xmm0", "xmm1", "xmm5"
2854#endif
2855 );
2856}
2857
// Copies the odd-indexed bytes of |src_uyvy| to |dst_y|.
// Each pass reads 32 source bytes and writes 16 bytes; pix drops by 16 per
// pass and the loop repeats while it is > 0.
// All loads and stores are movdqa, so both pointers must be 16-byte aligned.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002858void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002859 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002860 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002861 "1: \n"
2862 "movdqa (%0),%%xmm0 \n"
2863 "movdqa 0x10(%0),%%xmm1 \n"
2864 "lea 0x20(%0),%0 \n"
// Shift each 16-bit word right by 8 to keep its high (odd-indexed) byte.
2865 "psrlw $0x8,%%xmm0 \n"
2866 "psrlw $0x8,%%xmm1 \n"
2867 "packuswb %%xmm1,%%xmm0 \n"
// The flags set by this sub are the ones tested by jg below; the movdqa and
// lea in between do not modify flags.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002868 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002869 "movdqa %%xmm0,(%1) \n"
2870 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002871 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002872 : "+r"(src_uyvy), // %0
2873 "+r"(dst_y), // %1
2874 "+r"(pix) // %2
2875 :
2876 : "memory", "cc"
2877#if defined(__SSE2__)
2878 , "xmm0", "xmm1"
2879#endif
2880 );
2881}
2882
// Extracts the even-indexed bytes of two source rows, averaged together, and
// splits them alternately between |dst_u| and |dst_v|.
// The second row is read at src_uyvy + stride_uyvy and combined with the
// first by pavgb (rounded byte average). Of the even-indexed bytes, source
// bytes 0, 4, 8, ... go to dst_u and source bytes 2, 6, 10, ... go to dst_v.
// Each pass reads 32 bytes per row and writes 8 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// Source loads are movdqa, so both source rows must be 16-byte aligned.
2883void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002884 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002885 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask for the even-indexed bytes).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002886 "pcmpeqb %%xmm5,%%xmm5 \n"
2887 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2888 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002889 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002890 "1: \n"
2891 "movdqa (%0),%%xmm0 \n"
2892 "movdqa 0x10(%0),%%xmm1 \n"
2893 "movdqa (%0,%4,1),%%xmm2 \n"
2894 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2895 "lea 0x20(%0),%0 \n"
2896 "pavgb %%xmm2,%%xmm0 \n"
2897 "pavgb %%xmm3,%%xmm1 \n"
// Keep the even-indexed bytes: 16 bytes alternating between the two outputs.
2898 "pand %%xmm5,%%xmm0 \n"
2899 "pand %%xmm5,%%xmm1 \n"
2900 "packuswb %%xmm1,%%xmm0 \n"
// Split the alternating bytes: even positions -> xmm0, odd positions -> xmm1.
2901 "movdqa %%xmm0,%%xmm1 \n"
2902 "pand %%xmm5,%%xmm0 \n"
2903 "packuswb %%xmm0,%%xmm0 \n"
2904 "psrlw $0x8,%%xmm1 \n"
2905 "packuswb %%xmm1,%%xmm1 \n"
2906 "movq %%xmm0,(%1) \n"
2907 "movq %%xmm1,(%1,%2) \n"
2908 "lea 0x8(%1),%1 \n"
2909 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002910 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002911 : "+r"(src_uyvy), // %0
2912 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002913 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002914 "+r"(pix) // %3
2915 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2916 : "memory", "cc"
2917#if defined(__SSE2__)
2918 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2919#endif
2920 );
2921}
2922
// Extracts the even-indexed bytes of a single source row and splits them
// alternately between |dst_u| and |dst_v|: source bytes 0, 4, 8, ... go to
// dst_u and source bytes 2, 6, 10, ... go to dst_v. No second row is read.
// Each pass reads 32 source bytes and writes 8 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// Source loads are movdqa, so src_uyvy must be 16-byte aligned.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002923void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2924 uint8* dst_u, uint8* dst_v, int pix) {
2925 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask for the even-indexed bytes).
2926 "pcmpeqb %%xmm5,%%xmm5 \n"
2927 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2928 "sub %1,%2 \n"
2929 ".p2align 4 \n"
2930 "1: \n"
2931 "movdqa (%0),%%xmm0 \n"
2932 "movdqa 0x10(%0),%%xmm1 \n"
2933 "lea 0x20(%0),%0 \n"
// Keep the even-indexed bytes.
2934 "pand %%xmm5,%%xmm0 \n"
2935 "pand %%xmm5,%%xmm1 \n"
2936 "packuswb %%xmm1,%%xmm0 \n"
// Split the alternating bytes: even positions -> xmm0, odd positions -> xmm1.
2937 "movdqa %%xmm0,%%xmm1 \n"
2938 "pand %%xmm5,%%xmm0 \n"
2939 "packuswb %%xmm0,%%xmm0 \n"
2940 "psrlw $0x8,%%xmm1 \n"
2941 "packuswb %%xmm1,%%xmm1 \n"
2942 "movq %%xmm0,(%1) \n"
2943 "movq %%xmm1,(%1,%2) \n"
2944 "lea 0x8(%1),%1 \n"
2945 "sub $0x10,%3 \n"
2946 "jg 1b \n"
2947 : "+r"(src_uyvy), // %0
2948 "+r"(dst_u), // %1
2949 "+r"(dst_v), // %2
2950 "+r"(pix) // %3
2951 :
2952 : "memory", "cc"
2953#if defined(__SSE2__)
2954 , "xmm0", "xmm1", "xmm5"
2955#endif
2956 );
2957}
2958
// Copies the odd-indexed bytes of |src_uyvy| to |dst_y|.
// Each pass reads 32 source bytes and writes 16 bytes; pix drops by 16 per
// pass and the loop repeats while it is > 0.
// Memory accesses use movdqu, so no pointer alignment is required.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002959void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2960 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002961 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002962 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002963 "1: \n"
2964 "movdqu (%0),%%xmm0 \n"
2965 "movdqu 0x10(%0),%%xmm1 \n"
2966 "lea 0x20(%0),%0 \n"
// Shift each 16-bit word right by 8 to keep its high (odd-indexed) byte.
2967 "psrlw $0x8,%%xmm0 \n"
2968 "psrlw $0x8,%%xmm1 \n"
2969 "packuswb %%xmm1,%%xmm0 \n"
// The flags set by this sub are the ones tested by jg below; the movdqu and
// lea in between do not modify flags.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002970 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002971 "movdqu %%xmm0,(%1) \n"
2972 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002973 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002974 : "+r"(src_uyvy), // %0
2975 "+r"(dst_y), // %1
2976 "+r"(pix) // %2
2977 :
2978 : "memory", "cc"
2979#if defined(__SSE2__)
2980 , "xmm0", "xmm1"
2981#endif
2982 );
2983}
2984
// Extracts the even-indexed bytes of two source rows, averaged together, and
// splits them alternately between |dst_u| and |dst_v|.
// The second row is read at src_uyvy + stride_uyvy and combined with the
// first by pavgb (rounded byte average). Of the even-indexed bytes, source
// bytes 0, 4, 8, ... go to dst_u and source bytes 2, 6, 10, ... go to dst_v.
// Each pass reads 32 bytes per row and writes 8 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// Memory accesses use movdqu/movq, so no pointer alignment is required.
2985void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002986 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002987 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask for the even-indexed bytes).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002988 "pcmpeqb %%xmm5,%%xmm5 \n"
2989 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2990 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002991 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002992 "1: \n"
2993 "movdqu (%0),%%xmm0 \n"
2994 "movdqu 0x10(%0),%%xmm1 \n"
2995 "movdqu (%0,%4,1),%%xmm2 \n"
2996 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2997 "lea 0x20(%0),%0 \n"
2998 "pavgb %%xmm2,%%xmm0 \n"
2999 "pavgb %%xmm3,%%xmm1 \n"
// Keep the even-indexed bytes: 16 bytes alternating between the two outputs.
3000 "pand %%xmm5,%%xmm0 \n"
3001 "pand %%xmm5,%%xmm1 \n"
3002 "packuswb %%xmm1,%%xmm0 \n"
// Split the alternating bytes: even positions -> xmm0, odd positions -> xmm1.
3003 "movdqa %%xmm0,%%xmm1 \n"
3004 "pand %%xmm5,%%xmm0 \n"
3005 "packuswb %%xmm0,%%xmm0 \n"
3006 "psrlw $0x8,%%xmm1 \n"
3007 "packuswb %%xmm1,%%xmm1 \n"
3008 "movq %%xmm0,(%1) \n"
3009 "movq %%xmm1,(%1,%2) \n"
3010 "lea 0x8(%1),%1 \n"
3011 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003012 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003013 : "+r"(src_uyvy), // %0
3014 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003015 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003016 "+r"(pix) // %3
3017 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3018 : "memory", "cc"
3019#if defined(__SSE2__)
3020 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3021#endif
3022 );
3023}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003024
// Extracts the even-indexed bytes of a single source row and splits them
// alternately between |dst_u| and |dst_v|: source bytes 0, 4, 8, ... go to
// dst_u and source bytes 2, 6, 10, ... go to dst_v. No second row is read.
// Each pass reads 32 source bytes and writes 8 bytes to each destination;
// pix drops by 16 per pass and the loop repeats while it is > 0.
// Memory accesses use movdqu/movq, so no pointer alignment is required.
3025void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3026 uint8* dst_u, uint8* dst_v, int pix) {
3027 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask for the even-indexed bytes).
3028 "pcmpeqb %%xmm5,%%xmm5 \n"
3029 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
3030 "sub %1,%2 \n"
3031 ".p2align 4 \n"
3032 "1: \n"
3033 "movdqu (%0),%%xmm0 \n"
3034 "movdqu 0x10(%0),%%xmm1 \n"
3035 "lea 0x20(%0),%0 \n"
// Keep the even-indexed bytes.
3036 "pand %%xmm5,%%xmm0 \n"
3037 "pand %%xmm5,%%xmm1 \n"
3038 "packuswb %%xmm1,%%xmm0 \n"
// Split the alternating bytes: even positions -> xmm0, odd positions -> xmm1.
3039 "movdqa %%xmm0,%%xmm1 \n"
3040 "pand %%xmm5,%%xmm0 \n"
3041 "packuswb %%xmm0,%%xmm0 \n"
3042 "psrlw $0x8,%%xmm1 \n"
3043 "packuswb %%xmm1,%%xmm1 \n"
3044 "movq %%xmm0,(%1) \n"
3045 "movq %%xmm1,(%1,%2) \n"
3046 "lea 0x8(%1),%1 \n"
3047 "sub $0x10,%3 \n"
3048 "jg 1b \n"
3049 : "+r"(src_uyvy), // %0
3050 "+r"(dst_u), // %1
3051 "+r"(dst_v), // %2
3052 "+r"(pix) // %3
3053 :
3054 : "memory", "cc"
3055#if defined(__SSE2__)
3056 , "xmm0", "xmm1", "xmm5"
3057#endif
3058 );
3059}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003060#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003061
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003062#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003063// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
// Blends |width| 4-byte pixels of src_argb0 over src_argb1 into dst_argb.
// With a = byte 3 of the src_argb0 pixel, each of output bytes 0-2 is
//   saturate(src0 + ((src1 * (256 - a)) >> 8))
// and output byte 3 is always 255 (src0's byte 3 is forced to 0xff before the
// saturating add).
// Control flow:
//   - width <= 0 does nothing; width == 1 goes straight to the final loop.
//   - label 10: 1 pixel per pass until dst_argb is 16-byte aligned.
//   - label 41: 4 pixels per pass; sources are loaded with movdqu, the
//     destination is stored with movdqa.
//   - label 91: 1 pixel per pass for whatever is left.
// The counter in %3 is kept biased (width - 1, then width - 4) so that the
// sign flag alone tells whether another pass is possible.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003064void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3065 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003066 asm volatile (
// xmm7 = 0x0001 per word, xmm6 = 0x00ff per word, xmm5 = 0xff00 per word,
// xmm4 = 0xff000000 per dword (byte 3 of each pixel).
3067 "pcmpeqb %%xmm7,%%xmm7 \n"
3068 "psrlw $0xf,%%xmm7 \n"
3069 "pcmpeqb %%xmm6,%%xmm6 \n"
3070 "psrlw $0x8,%%xmm6 \n"
3071 "pcmpeqb %%xmm5,%%xmm5 \n"
3072 "psllw $0x8,%%xmm5 \n"
3073 "pcmpeqb %%xmm4,%%xmm4 \n"
3074 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003075 "sub $0x1,%3 \n"
3076 "je 91f \n"
3077 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003078
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003079 // 1 pixel loop until destination pointer is aligned.
3080 "10: \n"
3081 "test $0xf,%2 \n"
3082 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003083 "movd (%0),%%xmm3 \n"
3084 "lea 0x4(%0),%0 \n"
3085 "movdqa %%xmm3,%%xmm0 \n"
// Invert byte 3 (255 - a), then replicate it into both words of the pixel.
3086 "pxor %%xmm4,%%xmm3 \n"
3087 "movd (%1),%%xmm2 \n"
3088 "psrlw $0x8,%%xmm3 \n"
3089 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3090 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3091 "pand %%xmm6,%%xmm2 \n"
// +1 gives the multiplier 256 - a.
3092 "paddw %%xmm7,%%xmm3 \n"
3093 "pmullw %%xmm3,%%xmm2 \n"
3094 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003095 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003096 "psrlw $0x8,%%xmm1 \n"
3097 "por %%xmm4,%%xmm0 \n"
3098 "pmullw %%xmm3,%%xmm1 \n"
3099 "psrlw $0x8,%%xmm2 \n"
3100 "paddusb %%xmm2,%%xmm0 \n"
3101 "pand %%xmm5,%%xmm1 \n"
3102 "paddusb %%xmm1,%%xmm0 \n"
3103 "sub $0x1,%3 \n"
3104 "movd %%xmm0,(%2) \n"
3105 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003106 "jge 10b \n"
3107
3108 "19: \n"
// Re-bias the counter from (remaining - 1) to (remaining - 4).
fbarchard@google.comee220882012-06-14 00:07:56 +00003109 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003110 "jl 49f \n"
3111
fbarchard@google.com794fe122012-06-15 01:05:01 +00003112 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003113 ".p2align 2 \n"
3114 "41: \n"
3115 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003116 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003117 "movdqa %%xmm3,%%xmm0 \n"
3118 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003119 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003120 "psrlw $0x8,%%xmm3 \n"
3121 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3122 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003123 "pand %%xmm6,%%xmm2 \n"
3124 "paddw %%xmm7,%%xmm3 \n"
3125 "pmullw %%xmm3,%%xmm2 \n"
3126 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003127 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003128 "psrlw $0x8,%%xmm1 \n"
3129 "por %%xmm4,%%xmm0 \n"
3130 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003131 "psrlw $0x8,%%xmm2 \n"
3132 "paddusb %%xmm2,%%xmm0 \n"
3133 "pand %%xmm5,%%xmm1 \n"
3134 "paddusb %%xmm1,%%xmm0 \n"
3135 "sub $0x4,%3 \n"
3136 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003137 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003138 "jge 41b \n"
3139
3140 "49: \n"
// Re-bias the counter from (remaining - 4) back to (remaining - 1).
3141 "add $0x3,%3 \n"
3142 "jl 99f \n"
3143
fbarchard@google.com794fe122012-06-15 01:05:01 +00003144 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003145 "91: \n"
3146 "movd (%0),%%xmm3 \n"
3147 "lea 0x4(%0),%0 \n"
3148 "movdqa %%xmm3,%%xmm0 \n"
3149 "pxor %%xmm4,%%xmm3 \n"
3150 "movd (%1),%%xmm2 \n"
3151 "psrlw $0x8,%%xmm3 \n"
3152 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3153 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3154 "pand %%xmm6,%%xmm2 \n"
3155 "paddw %%xmm7,%%xmm3 \n"
3156 "pmullw %%xmm3,%%xmm2 \n"
3157 "movd (%1),%%xmm1 \n"
3158 "lea 0x4(%1),%1 \n"
3159 "psrlw $0x8,%%xmm1 \n"
3160 "por %%xmm4,%%xmm0 \n"
3161 "pmullw %%xmm3,%%xmm1 \n"
3162 "psrlw $0x8,%%xmm2 \n"
3163 "paddusb %%xmm2,%%xmm0 \n"
3164 "pand %%xmm5,%%xmm1 \n"
3165 "paddusb %%xmm1,%%xmm0 \n"
3166 "sub $0x1,%3 \n"
3167 "movd %%xmm0,(%2) \n"
3168 "lea 0x4(%2),%2 \n"
3169 "jge 91b \n"
3170 "99: \n"
3171 : "+r"(src_argb0), // %0
3172 "+r"(src_argb1), // %1
3173 "+r"(dst_argb), // %2
3174 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003175 :
3176 : "memory", "cc"
3177#if defined(__SSE2__)
3178 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3179#endif
3180 );
3181}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003182#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003183
fbarchard@google.com96af8702012-04-06 18:22:27 +00003184#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003185// Shuffle table for isolating alpha.
// Used as a pshufb control: for each 4-byte pixel, byte 3 is copied into the
// low byte of both 16-bit words of that pixel, and the 0x80 entries zero the
// high byte of each word.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003186CONST uvec8 kShuffleAlpha = {
3187 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3188 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3189};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003190
3191// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
3192// Uses the kShuffleAlpha table to replicate the alpha byte.
3193
3194// Same as SSE2, but replaces
3195// psrlw xmm3, 8 // alpha
3196// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3197// pshuflw xmm3, xmm3,0F5h
3198// with..
3199// pshufb xmm3, kShuffleAlpha // alpha
3200
// Blends |width| 4-byte pixels of src_argb0 over src_argb1 into dst_argb.
// With a = byte 3 of the src_argb0 pixel, each of output bytes 0-2 is
//   saturate(src0 + ((src1 * (256 - a)) >> 8))
// and output byte 3 is always 255.
// Control flow:
//   - width <= 0 does nothing; width == 1 goes straight to the final loop.
//   - label 10: 1 pixel per pass until dst_argb is 16-byte aligned.
//   - label 40: 4 pixels per pass with movdqa loads, taken only when both
//     source pointers are 16-byte aligned.
//   - label 41: 4 pixels per pass with movdqu loads otherwise.
//   - label 91: 1 pixel per pass for whatever is left.
// Both 4 pixel loops store to dst_argb with movdqa.
3201void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3202 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003203 asm volatile (
// xmm7 = 0x0001 per word, xmm6 = 0x00ff per word, xmm5 = 0xff00 per word,
// xmm4 = 0xff000000 per dword (byte 3 of each pixel).
3204 "pcmpeqb %%xmm7,%%xmm7 \n"
3205 "psrlw $0xf,%%xmm7 \n"
3206 "pcmpeqb %%xmm6,%%xmm6 \n"
3207 "psrlw $0x8,%%xmm6 \n"
3208 "pcmpeqb %%xmm5,%%xmm5 \n"
3209 "psllw $0x8,%%xmm5 \n"
3210 "pcmpeqb %%xmm4,%%xmm4 \n"
3211 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003212 "sub $0x1,%3 \n"
3213 "je 91f \n"
3214 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003215
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003216 // 1 pixel loop until destination pointer is aligned.
3217 "10: \n"
3218 "test $0xf,%2 \n"
3219 "je 19f \n"
3220 "movd (%0),%%xmm3 \n"
3221 "lea 0x4(%0),%0 \n"
3222 "movdqa %%xmm3,%%xmm0 \n"
// Invert byte 3 (255 - a), then replicate it into both words of the pixel.
3223 "pxor %%xmm4,%%xmm3 \n"
3224 "movd (%1),%%xmm2 \n"
3225 "pshufb %4,%%xmm3 \n"
3226 "pand %%xmm6,%%xmm2 \n"
// +1 gives the multiplier 256 - a.
3227 "paddw %%xmm7,%%xmm3 \n"
3228 "pmullw %%xmm3,%%xmm2 \n"
3229 "movd (%1),%%xmm1 \n"
3230 "lea 0x4(%1),%1 \n"
3231 "psrlw $0x8,%%xmm1 \n"
3232 "por %%xmm4,%%xmm0 \n"
3233 "pmullw %%xmm3,%%xmm1 \n"
3234 "psrlw $0x8,%%xmm2 \n"
3235 "paddusb %%xmm2,%%xmm0 \n"
3236 "pand %%xmm5,%%xmm1 \n"
3237 "paddusb %%xmm1,%%xmm0 \n"
3238 "sub $0x1,%3 \n"
3239 "movd %%xmm0,(%2) \n"
3240 "lea 0x4(%2),%2 \n"
3241 "jge 10b \n"
3242
3243 "19: \n"
// Re-bias the counter from (remaining - 1) to (remaining - 4).
fbarchard@google.comee220882012-06-14 00:07:56 +00003244 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003245 "jl 49f \n"
// Use the unaligned loop unless both source pointers are 16-byte aligned.
fbarchard@google.comf877e712012-08-15 00:51:24 +00003246 "test $0xf,%0 \n"
3247 "jne 41f \n"
3248 "test $0xf,%1 \n"
3249 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003250
fbarchard@google.com794fe122012-06-15 01:05:01 +00003251 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003252 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003253 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003254 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003255 "lea 0x10(%0),%0 \n"
3256 "movdqa %%xmm3,%%xmm0 \n"
3257 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003258 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003259 "pshufb %4,%%xmm3 \n"
3260 "pand %%xmm6,%%xmm2 \n"
3261 "paddw %%xmm7,%%xmm3 \n"
3262 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003263 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003264 "lea 0x10(%1),%1 \n"
3265 "psrlw $0x8,%%xmm1 \n"
3266 "por %%xmm4,%%xmm0 \n"
3267 "pmullw %%xmm3,%%xmm1 \n"
3268 "psrlw $0x8,%%xmm2 \n"
3269 "paddusb %%xmm2,%%xmm0 \n"
3270 "pand %%xmm5,%%xmm1 \n"
3271 "paddusb %%xmm1,%%xmm0 \n"
3272 "sub $0x4,%3 \n"
3273 "movdqa %%xmm0,(%2) \n"
3274 "lea 0x10(%2),%2 \n"
3275 "jge 40b \n"
3276 "jmp 49f \n"
3277
3278 // 4 pixel unaligned loop.
3279 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003280 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003281 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003282 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003283 "movdqa %%xmm3,%%xmm0 \n"
3284 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003285 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003286 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003287 "pand %%xmm6,%%xmm2 \n"
3288 "paddw %%xmm7,%%xmm3 \n"
3289 "pmullw %%xmm3,%%xmm2 \n"
3290 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003291 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003292 "psrlw $0x8,%%xmm1 \n"
3293 "por %%xmm4,%%xmm0 \n"
3294 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003295 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003296 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003297 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003298 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003299 "sub $0x4,%3 \n"
3300 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003301 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003302 "jge 41b \n"
3303
3304 "49: \n"
// Re-bias the counter from (remaining - 4) back to (remaining - 1).
3305 "add $0x3,%3 \n"
3306 "jl 99f \n"
3307
fbarchard@google.com794fe122012-06-15 01:05:01 +00003308 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003309 "91: \n"
3310 "movd (%0),%%xmm3 \n"
3311 "lea 0x4(%0),%0 \n"
3312 "movdqa %%xmm3,%%xmm0 \n"
3313 "pxor %%xmm4,%%xmm3 \n"
3314 "movd (%1),%%xmm2 \n"
3315 "pshufb %4,%%xmm3 \n"
3316 "pand %%xmm6,%%xmm2 \n"
3317 "paddw %%xmm7,%%xmm3 \n"
3318 "pmullw %%xmm3,%%xmm2 \n"
3319 "movd (%1),%%xmm1 \n"
3320 "lea 0x4(%1),%1 \n"
3321 "psrlw $0x8,%%xmm1 \n"
3322 "por %%xmm4,%%xmm0 \n"
3323 "pmullw %%xmm3,%%xmm1 \n"
3324 "psrlw $0x8,%%xmm2 \n"
3325 "paddusb %%xmm2,%%xmm0 \n"
3326 "pand %%xmm5,%%xmm1 \n"
3327 "paddusb %%xmm1,%%xmm0 \n"
3328 "sub $0x1,%3 \n"
3329 "movd %%xmm0,(%2) \n"
3330 "lea 0x4(%2),%2 \n"
3331 "jge 91b \n"
3332 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003333 : "+r"(src_argb0), // %0
3334 "+r"(src_argb1), // %1
3335 "+r"(dst_argb), // %2
3336 "+r"(width) // %3
3337 : "m"(kShuffleAlpha) // %4
3338 : "memory", "cc"
3339#if defined(__SSE2__)
3340 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3341#endif
3342 );
3343}
3344#endif // HAS_ARGBBLENDROW_SSSE3
3345
#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuate (premultiply by alpha) a row of ARGB pixels, 4 pixels at a time.
//
// src_argb: source pixels, read with movdqa so it must be 16 byte aligned.
// dst_argb: destination pixels, written with movdqa (16 byte aligned).
// width:    pixel count.  The loop body runs before the count is tested and
//           handles 4 pixels per pass, so at least 4 pixels are always
//           processed and the count is effectively rounded up to a multiple
//           of 4.
//
// Every byte is widened to a word by duplicating it (c -> c * 257), the color
// words are multiplied by the pixel's widened alpha and the top 8 bits of the
// 32 bit product are kept.  The alpha byte itself is copied from the source.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Turn dst into an offset from src so one pointer increment serves both.
    "sub       %0,%1                           \n"
    // xmm4 = 0xff000000 per pixel (alpha mask).
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    // xmm5 = 0x00ffffff per pixel (color mask).
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // Low 2 pixels: widen bytes, broadcast alpha word, multiply.
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // High 2 pixels: same again.
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // xmm2 = original alpha bytes only.
    "movdqa    (%0),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // Keep attenuated color, restore original alpha.
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATE_SSE2
3392
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle tables that duplicate each pixel's alpha byte into both bytes of
// the three color word lanes.  Index 128 makes pshufb write zero, so the
// alpha word lane of the multiplier is zero.
// kShuffleAlpha0 covers pixels 0 and 1, kShuffleAlpha1 covers pixels 2 and 3.
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};

// Attenuate (premultiply by alpha) a row of ARGB pixels, 4 pixels at a time.
//
// src_argb: source pixels, read with movdqa so it must be 16 byte aligned.
// dst_argb: destination pixels, written with movdqa (16 byte aligned).
// width:    pixel count.  The loop body runs before the count is tested and
//           handles 4 pixels per pass, so at least 4 pixels are always
//           processed and the count is effectively rounded up to a multiple
//           of 4.
//
// Color bytes are widened to words by duplication, multiplied by the widened
// alpha and reduced to the top 8 bits of the product.  The multiply leaves
// zero in the alpha lane, so the original alpha is simply OR-ed back in.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Turn dst into an offset from src so one pointer increment serves both.
    "sub       %0,%1                           \n"
    // xmm3 = 0xff000000 per pixel (alpha mask).
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // Low 2 pixels: alpha words * widened color words.
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    // High 2 pixels.
    "movdqa    (%0),%%xmm1                     \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // xmm2 = original alpha bytes only.
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003447
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate a row of ARGB pixels, 4 pixels at a time.
//
// src_argb: source pixels, read with movdqa so it must be 16 byte aligned.
// dst_argb: destination pixels, written with movdqa (16 byte aligned).
// width:    pixel count.  The loop body runs before the count is tested and
//           handles 4 pixels per pass, so at least 4 pixels are always
//           processed and the count is effectively rounded up to a multiple
//           of 4.
//
// For each pixel the alpha byte indexes fixed_invtbl8 (4 byte entries, table
// defined outside this block; presumably fixed point reciprocals of alpha -
// confirm against its definition).  The low 16 bits of the entry are
// replicated into the three color lanes (the alpha lane gets the upper 16
// bits), the widened color words are multiplied by it with pmulhuw, and the
// result is packed with unsigned saturation.  The original alpha byte is then
// OR-ed onto the packed alpha lane, so that lane is only a straight copy when
// the table's upper 16 bits are zero.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;  // Scratch register holding the current alpha index.
  asm volatile (
    // Turn dst into an offset from src so one pointer increment serves both.
    "sub       %0,%1                           \n"
    // xmm4 = 0xff000000 per pixel (alpha mask).
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // Pixels 0 and 1.
    "movdqa    (%0),%%xmm0                     \n"
    "movzb     0x3(%0),%3                      \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0x7(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    // Replicate table word 0 into lanes 0..2; lane 3 takes table word 1.
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // Pixels 2 and 3.
    "movdqa    (%0),%%xmm1                     \n"
    "movzb     0xb(%0),%3                      \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0xf(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // xmm2 = original alpha bytes only.
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003502
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale.  0.11 * B + 0.59 * G + 0.30 * R
// Weights are scaled by 128 (14 + 76 + 38 = 128); alpha weight is 0.
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
//
// src_argb: source pixels, read with movdqa so it must be 16 byte aligned.
// dst_argb: destination pixels, written with movdqa (16 byte aligned).
// width:    pixel count.  The loop body runs before the count is tested and
//           handles 8 pixels per pass, so at least 8 pixels are always
//           processed and the count is effectively rounded up to a multiple
//           of 8.
//
// Each output pixel is (gray, gray, gray, alpha) where gray is the weighted
// sum of B, G and R shifted right by 7 and alpha is copied from the source.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    // Turn dst into an offset from src so one pointer increment serves both.
    "sub       %0,%1                           \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // xmm0 = 8 gray bytes (weighted sum >> 7), duplicated in both halves.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // xmm2 = 8 alpha bytes.
    "movdqa    (%0),%%xmm2                     \n"
    "movdqa    0x10(%0),%%xmm3                 \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Interleave to G,G,G,A per pixel.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "movdqa    %%xmm1,0x10(%0,%1,1)            \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToGray)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003553
#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone, stored in B, G, R, A byte order
// with a zero weight for alpha.
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
//
// dst_argb: pixels to convert; read and written with movdqa so it must be
//           16 byte aligned.
// width:    pixel count.  The loop body runs before the count is tested and
//           handles 8 pixels per pass, so at least 8 pixels are always
//           processed and the count is effectively rounded up to a multiple
//           of 8.
//
// Each channel sum is shifted right by 7 and packed with unsigned saturation.
// Alpha is copied unchanged.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // xmm0 = 8 new B bytes.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // xmm5 = 8 new G bytes; interleave to B,G pairs in xmm0.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // xmm5 = 8 new R bytes.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // xmm6 = 8 original alpha bytes; interleave to R,A pairs in xmm5.
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    // Merge B,G with R,A into full pixels.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
3630
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix, in place.
// Same as Sepia except matrix is provided.
//
// dst_argb:    pixels to convert; read and written with movdqa so it must be
//              16 byte aligned.
// matrix_argb: 12 signed bytes are read: three rows of 4 coefficients, used
//              for the new B, G and R channels respectively.  Each row is
//              applied to the source bytes in memory order (B, G, R, A).
// width:       pixel count.  The loop body runs before the count is tested
//              and handles 8 pixels per pass, so at least 8 pixels are always
//              processed and the count is effectively rounded up to a
//              multiple of 8.
//
// Channel sums use saturating signed adds, an arithmetic shift right by 7 and
// an unsigned saturating pack.  Alpha is copied unchanged.
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  asm volatile (
    // Load the three coefficient rows and broadcast each to all 4 dwords.
    "movd      (%2),%%xmm2                     \n"
    "movd      0x4(%2),%%xmm3                  \n"
    "movd      0x8(%2),%%xmm4                  \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // xmm0 = new B, xmm5 = new G; interleave to B,G pairs in xmm0.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm6,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm0                     \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // xmm5 = new R.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // xmm6 = original alpha bytes; interleave to R,A pairs in xmm5.
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    // Merge B,G with R,A into full pixels.
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "r"(matrix_argb)     // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
3694
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes) at a time, in place.
//
// dst_argb:        pixels to quantize; read and written with movdqa so it
//                  must be 16 byte aligned.
// scale:           16.16 style multiplier; each color is computed as
//                  (c * scale) >> 16.
// interval_size:   multiplied onto the scaled value (low 16 bits kept).
// interval_offset: added after the multiply.
// width:           pixel count.  The loop body runs before the count is
//                  tested and handles 4 pixels per pass, so at least 4 pixels
//                  are always processed and the count is effectively rounded
//                  up to a multiple of 4.
//
// Only the low 16 bits of each parameter reach the color lanes.  The alpha
// lane receives the upper 16 bits of each parameter, so for parameters below
// 65536 the computed alpha is zero and OR-ing the original alpha back in
// leaves alpha unchanged.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    // Replicate low word to lanes 0..2, high word to lane 3, then copy the
    // 4 lane pattern to the upper half for the second pixel.
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    // xmm5 = 0 for byte to word widening; xmm6 = 0xff000000 alpha mask.
    "pxor      %%xmm5,%%xmm5                   \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    // xmm7 = original alpha bytes only.
    "movdqa    (%0),%%xmm7                     \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "sub       $0x4,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
3747
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//
// row:             source bytes, 4 per pixel; read unaligned.
// cumsum:          output, 4 int32 sums (one per channel) per pixel.
// previous_cumsum: sums for the row above, same layout as cumsum.
// width:           pixel count; any value, including 0, is handled.
//
// A running per channel sum of the current row is kept in xmm0 and added to
// the matching entry of previous_cumsum for every pixel.
//
// The 4 pixel loop uses aligned loads and stores, so it is only entered when
// both cumsum and previous_cumsum are 16 byte aligned; otherwise every pixel
// goes through the unaligned 1 pixel loop.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    // %2 becomes the byte offset from cumsum to previous_cumsum.
    "sub       %1,%2                           \n"
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"
    // cumsum is aligned here, so previous_cumsum is aligned exactly when the
    // offset between the two is a multiple of 16.
    "test      $0xf,%2                         \n"
    "jne       49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqu    (%0),%%xmm2                     \n"
    "lea       0x10(%0),%0                     \n"
    // Widen 16 bytes to four vectors of 4 int32 (xmm2, xmm3, xmm4, xmm5).
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // Accumulate each pixel into the running sum and add the row above.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqa    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm2,(%1)                     \n"
    "movdqa    %%xmm3,0x10(%1)                 \n"
    "movdqa    %%xmm4,0x20(%1)                 \n"
    "movdqa    %%xmm5,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Undo the bias of 4 minus 1 so the loop below can count down to -1.
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movd      (%0),%%xmm2                     \n"
    "lea       0x4(%0),%0                      \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),  // %0
    "+r"(cumsum),  // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
3828
#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Turn box sums taken from a cumulative sum table into averaged ARGB pixels.
//
// topleft: cumulative sums at the top edge of the box, 4 int32 per pixel.
// botleft: cumulative sums at the bottom edge of the box, same layout.
// width:   offset, in int32 elements, from the left corner to the right
//          corner of the box (it scales the index by 4 bytes).
// area:    divisor; the sum is multiplied by an approximate reciprocal
//          (rcpss) of this value.
// dst:     output pixels, 4 bytes each; written unaligned.
// count:   number of pixels to produce; any value, including 0, is handled.
//
// Per channel: sum = topleft[x] - topleft[x + width]
//                    - botleft[x] + botleft[x + width]
// The product sum * (1 / area) is converted back to integer and packed with
// saturation to bytes.
//
// All four corner reads use aligned SSE memory operands in both loops, so
// topleft, botleft and both pointers offset by width must be 16 byte aligned.
//
// count may be allocated to memory ("+rm"), so the instructions that modify
// it carry an explicit 32 bit size suffix.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count) {
  asm volatile (
    // xmm4 = approximate 1.0f / area, broadcast to all 4 lanes.
    "movd      %5,%%xmm4                       \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "rcpss     %%xmm4,%%xmm4                   \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "subl      $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "psubd     0x10(%0,%4,4),%%xmm1            \n"
    "psubd     0x20(%0,%4,4),%%xmm2            \n"
    "psubd     0x30(%0,%4,4),%%xmm3            \n"
    "lea       0x40(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "psubd     0x10(%1),%%xmm1                 \n"
    "psubd     0x20(%1),%%xmm2                 \n"
    "psubd     0x30(%1),%%xmm3                 \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "paddd     0x10(%1,%4,4),%%xmm1            \n"
    "paddd     0x20(%1,%4,4),%%xmm2            \n"
    "paddd     0x30(%1,%4,4),%%xmm3            \n"
    "lea       0x40(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "subl      $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Undo the bias of 4 minus 1 so the loop below can count down to -1.
    "addl      $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "lea       0x10(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "lea       0x10(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "subl      $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)     // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
//
// src_argb: source pixels, read with movdqa so it must be 16 byte aligned.
// dst_argb: destination pixels, written with movdqa (16 byte aligned).
// width:    pixel count.  The loop body runs before the count is tested and
//           handles 4 pixels per pass, so at least 4 pixels are always
//           processed and the count is effectively rounded up to a multiple
//           of 4.
// value:    4 bytes, one multiplier per channel in the same byte order as a
//           pixel in memory.
//
// Both the pixel bytes and the value bytes are widened to words by
// duplication (c -> c * 257); the words are multiplied and the top 8 bits of
// the 32 bit product are kept.  All four channels, alpha included, are
// scaled.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd      %3,%%xmm2                       \n"
    // Turn dst into an offset from src so one pointer increment serves both.
    "sub       %0,%1                           \n"
    // Widen value bytes to words and copy them for the second pixel.
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "r"(value)            // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADE_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003955
#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd %%xmm0,%1
//
// src_argb:        base of the source image.
// src_argb_stride: bytes per source row.  It is packed into the upper 16 bits
//                  of a dword next to the constant 4 and used as a signed
//                  16 bit pmaddwd operand, so it has to fit in 16 bits.
// dst_argb:        destination row; 4 bytes are written per pixel with
//                  unaligned stores.
// uv_dudv:         4 floats: starting u, v followed by per pixel du, dv.
// width:           pixel count; any value, including 0, is handled.
//
// For each pixel u and v are truncated to integers, saturated to 16 bits and
// the byte offset u * 4 + v * stride is used to fetch one source pixel.
// In the 64 bit build two 32 bit offsets are extracted from one 64 bit move
// and the low one is masked to 28 bits, so offsets must be non-negative and
// below 2^28 there.
//
// width may be allocated to memory ("+rm"), so the instructions that modify
// it carry an explicit 32 bit size suffix.

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;  // Scratch register for the second pixel offset.
  asm volatile (
    // xmm2 = (u, v), xmm7 = (du, dv).
    "movq      (%3),%%xmm2                     \n"
    "movq      0x8(%3),%%xmm7                  \n"
    // xmm5 low dword = words (4, stride) for pmaddwd.
    "shl       $0x10,%1                        \n"
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "subl      $0x4,%4                         \n"
    "jl        49f                             \n"

    // Set up 4 pixel stepping:
    //   xmm2 = (u, v) for pixels 0 and 1, xmm3 = (u, v) for pixels 2 and 3,
    //   xmm4 = 4 * (du, dv), xmm7 = (du, dv) in both halves.
    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm4                   \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "40:                                         \n"
    // xmm0 = 4 byte offsets (u * 4 + v * stride).
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "cvttps2dq %%xmm3,%%xmm1                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
#if defined(__x86_64__)
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
    "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
#endif
    // Fetch and store pixels 0 and 1.
    "movd      (%0,%1,1),%%xmm1                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm1                   \n"
    "addps     %%xmm4,%%xmm2                   \n"
    "movq      %%xmm1,(%2)                     \n"
#if defined(__x86_64__)
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
#endif
    // Fetch and store pixels 2 and 3.
    "movd      (%0,%1,1),%%xmm0                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm0                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "subl      $0x4,%4                         \n"
    "movq      %%xmm0,0x08(%2)                 \n"
    "lea       0x10(%2),%2                     \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Undo the bias of 4 minus 1 so the loop below can count down to -1.
    "addl      $0x3,%4                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  4                               \n"
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"
    "movd      %%xmm0,%1                       \n"
#if defined(__x86_64__)
    "and       $0x0fffffff,%1                  \n"
#endif
    "movd      (%0,%1,1),%%xmm0                \n"
    "subl      $0x1,%4                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(uv_dudv),   // %3
    "+rm"(width),    // %4
    "+r"(temp)   // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
4066
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004067// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
4068void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4069 ptrdiff_t src_stride, int dst_width,
4070 int source_y_fraction) {
4071 asm volatile (
4072 "sub %1,%0 \n"
4073 "shr %3 \n"
4074 "cmp $0x0,%3 \n"
4075 "je 2f \n"
4076 "cmp $0x40,%3 \n"
4077 "je 3f \n"
4078 "movd %3,%%xmm0 \n"
4079 "neg %3 \n"
4080 "add $0x80,%3 \n"
4081 "movd %3,%%xmm5 \n"
4082 "punpcklbw %%xmm0,%%xmm5 \n"
4083 "punpcklwd %%xmm5,%%xmm5 \n"
4084 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4085 ".p2align 4 \n"
4086 "1: \n"
4087 "movdqa (%1),%%xmm0 \n"
4088 "movdqa (%1,%4,1),%%xmm2 \n"
4089 "movdqa %%xmm0,%%xmm1 \n"
4090 "punpcklbw %%xmm2,%%xmm0 \n"
4091 "punpckhbw %%xmm2,%%xmm1 \n"
4092 "pmaddubsw %%xmm5,%%xmm0 \n"
4093 "pmaddubsw %%xmm5,%%xmm1 \n"
4094 "psrlw $0x7,%%xmm0 \n"
4095 "psrlw $0x7,%%xmm1 \n"
4096 "packuswb %%xmm1,%%xmm0 \n"
4097 "sub $0x4,%2 \n"
4098 "movdqa %%xmm0,(%1,%0,1) \n"
4099 "lea 0x10(%1),%1 \n"
4100 "jg 1b \n"
4101 "jmp 4f \n"
4102 ".p2align 4 \n"
4103 "2: \n"
4104 "movdqa (%1),%%xmm0 \n"
4105 "sub $0x4,%2 \n"
4106 "movdqa %%xmm0,(%1,%0,1) \n"
4107 "lea 0x10(%1),%1 \n"
4108 "jg 2b \n"
4109 "jmp 4f \n"
4110 ".p2align 4 \n"
4111 "3: \n"
4112 "movdqa (%1),%%xmm0 \n"
4113 "pavgb (%1,%4,1),%%xmm0 \n"
4114 "sub $0x4,%2 \n"
4115 "movdqa %%xmm0,(%1,%0,1) \n"
4116 "lea 0x10(%1),%1 \n"
4117 "jg 3b \n"
4118 "4: \n"
4119 ".p2align 4 \n"
4120 : "+r"(dst_ptr), // %0
4121 "+r"(src_ptr), // %1
4122 "+r"(dst_width), // %2
4123 "+r"(source_y_fraction) // %3
4124 : "r"(static_cast<intptr_t>(src_stride)) // %4
4125 : "memory", "cc"
4126#if defined(__SSE2__)
4127 , "xmm0", "xmm1", "xmm2", "xmm5"
4128#endif
4129 );
4130}
4131
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004132void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4133 uint8* dst_uv, int pix) {
4134 asm volatile (
4135 "sub %0,%1 \n"
4136 ".p2align 4 \n"
4137 "1: \n"
4138 "movdqa (%0),%%xmm0 \n"
4139 "pavgb (%0,%3),%%xmm0 \n"
4140 "sub $0x10,%2 \n"
4141 "movdqa %%xmm0,(%0,%1) \n"
4142 "lea 0x10(%0),%0 \n"
4143 "jg 1b \n"
4144 : "+r"(src_uv), // %0
4145 "+r"(dst_uv), // %1
4146 "+r"(pix) // %2
4147 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4148 : "memory", "cc"
4149#if defined(__SSE2__)
4150 , "xmm0"
4151#endif
4152 );
4153}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004154
4155void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4156 uint32 selector, int pix) {
4157 asm volatile (
4158 "movd %3,%%xmm5 \n"
4159 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4160 ".p2align 4 \n"
4161 "1: \n"
4162 "movdqa (%0),%%xmm0 \n"
4163 "lea 0x10(%0),%0 \n"
4164 "pshufb %%xmm5,%%xmm0 \n"
4165 "sub $0x4,%2 \n"
4166 "movd %%xmm0,(%1) \n"
4167 "lea 0x4(%1),%1 \n"
4168 "jg 1b \n"
4169 : "+r"(src_argb), // %0
4170 "+r"(dst_bayer), // %1
4171 "+r"(pix) // %2
4172 : "g"(selector) // %3
4173 : "memory", "cc"
4174#if defined(__SSE2__)
4175 , "xmm0", "xmm5"
4176#endif
4177 );
4178}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004179
4180void I422ToYUY2Row_SSE2(const uint8* src_y,
4181 const uint8* src_u,
4182 const uint8* src_v,
4183 uint8* dst_frame, int width) {
4184 asm volatile (
4185 "sub %1,%2 \n"
4186 ".p2align 4 \n"
4187 "1: \n"
4188 "movq (%1),%%xmm2 \n"
4189 "movq (%1,%2,1),%%xmm3 \n"
4190 "lea 0x8(%1),%1 \n"
4191 "punpcklbw %%xmm3,%%xmm2 \n"
4192 "movdqa (%0),%%xmm0 \n"
4193 "lea 0x10(%0),%0 \n"
4194 "movdqa %%xmm0,%%xmm1 \n"
4195 "punpcklbw %%xmm2,%%xmm0 \n"
4196 "punpckhbw %%xmm2,%%xmm1 \n"
4197 "movdqa %%xmm0,(%3) \n"
4198 "movdqa %%xmm1,0x10(%3) \n"
4199 "lea 0x20(%3),%3 \n"
4200 "sub $0x10,%4 \n"
4201 "jg 1b \n"
4202 : "+r"(src_y), // %0
4203 "+r"(src_u), // %1
4204 "+r"(src_v), // %2
4205 "+r"(dst_frame), // %3
4206 "+rm"(width) // %4
4207 :
4208 : "memory", "cc"
4209#if defined(__SSE2__)
4210 , "xmm0", "xmm1", "xmm2", "xmm3"
4211#endif
4212 );
4213}
4214
4215void I422ToUYVYRow_SSE2(const uint8* src_y,
4216 const uint8* src_u,
4217 const uint8* src_v,
4218 uint8* dst_frame, int width) {
4219 asm volatile (
4220 "sub %1,%2 \n"
4221 ".p2align 4 \n"
4222 "1: \n"
4223 "movq (%1),%%xmm2 \n"
4224 "movq (%1,%2,1),%%xmm3 \n"
4225 "lea 0x8(%1),%1 \n"
4226 "punpcklbw %%xmm3,%%xmm2 \n"
4227 "movdqa (%0),%%xmm0 \n"
4228 "movdqa %%xmm2,%%xmm1 \n"
4229 "lea 0x10(%0),%0 \n"
4230 "punpcklbw %%xmm0,%%xmm1 \n"
4231 "punpckhbw %%xmm0,%%xmm2 \n"
4232 "movdqa %%xmm1,(%3) \n"
4233 "movdqa %%xmm2,0x10(%3) \n"
4234 "lea 0x20(%3),%3 \n"
4235 "sub $0x10,%4 \n"
4236 "jg 1b \n"
4237 : "+r"(src_y), // %0
4238 "+r"(src_u), // %1
4239 "+r"(src_v), // %2
4240 "+r"(dst_frame), // %3
4241 "+rm"(width) // %4
4242 :
4243 : "memory", "cc"
4244#if defined(__SSE2__)
4245 , "xmm0", "xmm1", "xmm2", "xmm3"
4246#endif
4247 );
4248}
4249
fbarchard@google.com2d11d432012-02-16 02:50:39 +00004250#endif // defined(__x86_64__) || defined(__i386__)
4251
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004252#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00004253} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004254} // namespace libyuv
4255#endif