blob: 1078ed6547e56ac4be4490eae9c6fcb11e6a9934 [file] [log] [blame]
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// Storage qualifier for the constant tables below.
// GCC 4.2 on OSX has link error when passing static or const to inline.
// On Apple targets the tables are therefore plain (non-static, non-const)
// globals; everywhere else they are 'static const'.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
fbarchard@google.com4de0c432012-10-11 01:25:46 +000072// Constants for RGBA.
73CONST vec8 kRGBAToY = {
74 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
75};
76
77CONST vec8 kRGBAToU = {
78 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
79};
80
81CONST vec8 kRGBAToV = {
82 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
83};
84
fbarchard@google.com714050a2012-02-17 22:59:56 +000085CONST uvec8 kAddY16 = {
86 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000087};
fbarchard@google.com2430e042011-11-11 21:57:06 +000088
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000089CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
92};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000093
fbarchard@google.comba1f5262012-01-12 19:22:41 +000094// Shuffle table for converting RGB24 to ARGB.
95CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000096 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
97};
98
99// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000100CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000101 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
102};
103
fbarchard@google.comb6149762011-11-07 21:58:52 +0000104// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000105CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000106 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
107};
108
109// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000110CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
112};
113
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000114// Shuffle table for converting RGBA to ARGB.
115CONST uvec8 kShuffleMaskRGBAToARGB = {
116 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
117};
118
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000119// Shuffle table for converting ARGB to RGBA.
120CONST uvec8 kShuffleMaskARGBToRGBA = {
121 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
122};
123
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000124// Shuffle table for converting ARGB to RGB24.
125CONST uvec8 kShuffleMaskARGBToRGB24 = {
126 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
127};
128
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000129// Shuffle table for converting ARGB to RAW.
130CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000131 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000132};
133
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000134// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000135CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
137};
138
139// Shuffle table for converting ARGB to RAW.
140CONST uvec8 kShuffleMaskARGBToRAW_0 = {
141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142};
143
fbarchard@google.comb6149762011-11-07 21:58:52 +0000144void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000145 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pcmpeqb %%xmm5,%%xmm5 \n"
147 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000148 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "1: \n"
150 "movq (%0),%%xmm0 \n"
151 "lea 0x8(%0),%0 \n"
152 "punpcklbw %%xmm0,%%xmm0 \n"
153 "movdqa %%xmm0,%%xmm1 \n"
154 "punpcklwd %%xmm0,%%xmm0 \n"
155 "punpckhwd %%xmm1,%%xmm1 \n"
156 "por %%xmm5,%%xmm0 \n"
157 "por %%xmm5,%%xmm1 \n"
158 "movdqa %%xmm0,(%1) \n"
159 "movdqa %%xmm1,0x10(%1) \n"
160 "lea 0x20(%1),%1 \n"
161 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000162 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163 : "+r"(src_y), // %0
164 "+r"(dst_argb), // %1
165 "+r"(pix) // %2
166 :
167 : "memory", "cc"
168#if defined(__SSE2__)
169 , "xmm0", "xmm1", "xmm5"
170#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000171 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000172}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173
174void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000175 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000176 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000177 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000178 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000179 "1: \n"
180 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000181 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000182 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000183 "movdqa %%xmm0,(%0,%1,1) \n"
184 "lea 0x10(%0),%0 \n"
185 "jg 1b \n"
186
fbarchard@google.comb6149762011-11-07 21:58:52 +0000187 : "+r"(src_abgr), // %0
188 "+r"(dst_argb), // %1
189 "+r"(pix) // %2
190 : "m"(kShuffleMaskABGRToARGB) // %3
191 : "memory", "cc"
192#if defined(__SSE2__)
193 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000194#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000195 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000196}
197
198void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000199 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000200 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000201 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000202 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000203 "1: \n"
204 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000205 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000206 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000207 "movdqa %%xmm0,(%0,%1,1) \n"
208 "lea 0x10(%0),%0 \n"
209 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000210 : "+r"(src_bgra), // %0
211 "+r"(dst_argb), // %1
212 "+r"(pix) // %2
213 : "m"(kShuffleMaskBGRAToARGB) // %3
214 : "memory", "cc"
215#if defined(__SSE2__)
216 , "xmm0", "xmm5"
217#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000218 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000219}
220
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000221void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
222 asm volatile (
223 "movdqa %3,%%xmm5 \n"
224 "sub %0,%1 \n"
225 ".p2align 4 \n"
226 "1: \n"
227 "movdqa (%0),%%xmm0 \n"
228 "pshufb %%xmm5,%%xmm0 \n"
229 "sub $0x4,%2 \n"
230 "movdqa %%xmm0,(%0,%1,1) \n"
231 "lea 0x10(%0),%0 \n"
232 "jg 1b \n"
233
234 : "+r"(src_rgba), // %0
235 "+r"(dst_argb), // %1
236 "+r"(pix) // %2
237 : "m"(kShuffleMaskRGBAToARGB) // %3
238 : "memory", "cc"
239#if defined(__SSE2__)
240 , "xmm0", "xmm5"
241#endif
242 );
243}
244
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000245void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
246 asm volatile (
247 "movdqa %3,%%xmm5 \n"
248 "sub %0,%1 \n"
249 ".p2align 4 \n"
250 "1: \n"
251 "movdqa (%0),%%xmm0 \n"
252 "pshufb %%xmm5,%%xmm0 \n"
253 "sub $0x4,%2 \n"
254 "movdqa %%xmm0,(%0,%1,1) \n"
255 "lea 0x10(%0),%0 \n"
256 "jg 1b \n"
257
258 : "+r"(src_argb), // %0
259 "+r"(dst_rgba), // %1
260 "+r"(pix) // %2
261 : "m"(kShuffleMaskARGBToRGBA) // %3
262 : "memory", "cc"
263#if defined(__SSE2__)
264 , "xmm0", "xmm5"
265#endif
266 );
267}
268
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000269void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000270 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000271 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
272 "pslld $0x18,%%xmm5 \n"
273 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000274 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000275 "1: \n"
276 "movdqu (%0),%%xmm0 \n"
277 "movdqu 0x10(%0),%%xmm1 \n"
278 "movdqu 0x20(%0),%%xmm3 \n"
279 "lea 0x30(%0),%0 \n"
280 "movdqa %%xmm3,%%xmm2 \n"
281 "palignr $0x8,%%xmm1,%%xmm2 \n"
282 "pshufb %%xmm4,%%xmm2 \n"
283 "por %%xmm5,%%xmm2 \n"
284 "palignr $0xc,%%xmm0,%%xmm1 \n"
285 "pshufb %%xmm4,%%xmm0 \n"
286 "movdqa %%xmm2,0x20(%1) \n"
287 "por %%xmm5,%%xmm0 \n"
288 "pshufb %%xmm4,%%xmm1 \n"
289 "movdqa %%xmm0,(%1) \n"
290 "por %%xmm5,%%xmm1 \n"
291 "palignr $0x4,%%xmm3,%%xmm3 \n"
292 "pshufb %%xmm4,%%xmm3 \n"
293 "movdqa %%xmm1,0x10(%1) \n"
294 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000295 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000296 "movdqa %%xmm3,0x30(%1) \n"
297 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000298 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000299 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000300 "+r"(dst_argb), // %1
301 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000302 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000303 : "memory", "cc"
304#if defined(__SSE2__)
305 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
306#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000307 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000308}
309
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000310void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000311 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000312 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
313 "pslld $0x18,%%xmm5 \n"
314 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000315 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000316 "1: \n"
317 "movdqu (%0),%%xmm0 \n"
318 "movdqu 0x10(%0),%%xmm1 \n"
319 "movdqu 0x20(%0),%%xmm3 \n"
320 "lea 0x30(%0),%0 \n"
321 "movdqa %%xmm3,%%xmm2 \n"
322 "palignr $0x8,%%xmm1,%%xmm2 \n"
323 "pshufb %%xmm4,%%xmm2 \n"
324 "por %%xmm5,%%xmm2 \n"
325 "palignr $0xc,%%xmm0,%%xmm1 \n"
326 "pshufb %%xmm4,%%xmm0 \n"
327 "movdqa %%xmm2,0x20(%1) \n"
328 "por %%xmm5,%%xmm0 \n"
329 "pshufb %%xmm4,%%xmm1 \n"
330 "movdqa %%xmm0,(%1) \n"
331 "por %%xmm5,%%xmm1 \n"
332 "palignr $0x4,%%xmm3,%%xmm3 \n"
333 "pshufb %%xmm4,%%xmm3 \n"
334 "movdqa %%xmm1,0x10(%1) \n"
335 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000336 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000337 "movdqa %%xmm3,0x30(%1) \n"
338 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000339 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000340 : "+r"(src_raw), // %0
341 "+r"(dst_argb), // %1
342 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000343 : "m"(kShuffleMaskRAWToARGB) // %3
344 : "memory", "cc"
345#if defined(__SSE2__)
346 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
347#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000348 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000349}
350
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000351void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000352 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000353 "mov $0x1080108,%%eax \n"
354 "movd %%eax,%%xmm5 \n"
355 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000356 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000357 "movd %%eax,%%xmm6 \n"
358 "pshufd $0x0,%%xmm6,%%xmm6 \n"
359 "pcmpeqb %%xmm3,%%xmm3 \n"
360 "psllw $0xb,%%xmm3 \n"
361 "pcmpeqb %%xmm4,%%xmm4 \n"
362 "psllw $0xa,%%xmm4 \n"
363 "psrlw $0x5,%%xmm4 \n"
364 "pcmpeqb %%xmm7,%%xmm7 \n"
365 "psllw $0x8,%%xmm7 \n"
366 "sub %0,%1 \n"
367 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000368 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000369 "1: \n"
370 "movdqu (%0),%%xmm0 \n"
371 "movdqa %%xmm0,%%xmm1 \n"
372 "movdqa %%xmm0,%%xmm2 \n"
373 "pand %%xmm3,%%xmm1 \n"
374 "psllw $0xb,%%xmm2 \n"
375 "pmulhuw %%xmm5,%%xmm1 \n"
376 "pmulhuw %%xmm5,%%xmm2 \n"
377 "psllw $0x8,%%xmm1 \n"
378 "por %%xmm2,%%xmm1 \n"
379 "pand %%xmm4,%%xmm0 \n"
380 "pmulhuw %%xmm6,%%xmm0 \n"
381 "por %%xmm7,%%xmm0 \n"
382 "movdqa %%xmm1,%%xmm2 \n"
383 "punpcklbw %%xmm0,%%xmm1 \n"
384 "punpckhbw %%xmm0,%%xmm2 \n"
385 "movdqa %%xmm1,(%1,%0,2) \n"
386 "movdqa %%xmm2,0x10(%1,%0,2) \n"
387 "lea 0x10(%0),%0 \n"
388 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000389 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000390 : "+r"(src), // %0
391 "+r"(dst), // %1
392 "+r"(pix) // %2
393 :
394 : "memory", "cc", "eax"
395#if defined(__SSE2__)
396 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
397#endif
398 );
399}
400
401void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000402 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000403 "mov $0x1080108,%%eax \n"
404 "movd %%eax,%%xmm5 \n"
405 "pshufd $0x0,%%xmm5,%%xmm5 \n"
406 "mov $0x42004200,%%eax \n"
407 "movd %%eax,%%xmm6 \n"
408 "pshufd $0x0,%%xmm6,%%xmm6 \n"
409 "pcmpeqb %%xmm3,%%xmm3 \n"
410 "psllw $0xb,%%xmm3 \n"
411 "movdqa %%xmm3,%%xmm4 \n"
412 "psrlw $0x6,%%xmm4 \n"
413 "pcmpeqb %%xmm7,%%xmm7 \n"
414 "psllw $0x8,%%xmm7 \n"
415 "sub %0,%1 \n"
416 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000417 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000418 "1: \n"
419 "movdqu (%0),%%xmm0 \n"
420 "movdqa %%xmm0,%%xmm1 \n"
421 "movdqa %%xmm0,%%xmm2 \n"
422 "psllw $0x1,%%xmm1 \n"
423 "psllw $0xb,%%xmm2 \n"
424 "pand %%xmm3,%%xmm1 \n"
425 "pmulhuw %%xmm5,%%xmm2 \n"
426 "pmulhuw %%xmm5,%%xmm1 \n"
427 "psllw $0x8,%%xmm1 \n"
428 "por %%xmm2,%%xmm1 \n"
429 "movdqa %%xmm0,%%xmm2 \n"
430 "pand %%xmm4,%%xmm0 \n"
431 "psraw $0x8,%%xmm2 \n"
432 "pmulhuw %%xmm6,%%xmm0 \n"
433 "pand %%xmm7,%%xmm2 \n"
434 "por %%xmm2,%%xmm0 \n"
435 "movdqa %%xmm1,%%xmm2 \n"
436 "punpcklbw %%xmm0,%%xmm1 \n"
437 "punpckhbw %%xmm0,%%xmm2 \n"
438 "movdqa %%xmm1,(%1,%0,2) \n"
439 "movdqa %%xmm2,0x10(%1,%0,2) \n"
440 "lea 0x10(%0),%0 \n"
441 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000442 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000443 : "+r"(src), // %0
444 "+r"(dst), // %1
445 "+r"(pix) // %2
446 :
447 : "memory", "cc", "eax"
448#if defined(__SSE2__)
449 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
450#endif
451 );
452}
453
454void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000455 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000456 "mov $0xf0f0f0f,%%eax \n"
457 "movd %%eax,%%xmm4 \n"
458 "pshufd $0x0,%%xmm4,%%xmm4 \n"
459 "movdqa %%xmm4,%%xmm5 \n"
460 "pslld $0x4,%%xmm5 \n"
461 "sub %0,%1 \n"
462 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000463 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000464 "1: \n"
465 "movdqu (%0),%%xmm0 \n"
466 "movdqa %%xmm0,%%xmm2 \n"
467 "pand %%xmm4,%%xmm0 \n"
468 "pand %%xmm5,%%xmm2 \n"
469 "movdqa %%xmm0,%%xmm1 \n"
470 "movdqa %%xmm2,%%xmm3 \n"
471 "psllw $0x4,%%xmm1 \n"
472 "psrlw $0x4,%%xmm3 \n"
473 "por %%xmm1,%%xmm0 \n"
474 "por %%xmm3,%%xmm2 \n"
475 "movdqa %%xmm0,%%xmm1 \n"
476 "punpcklbw %%xmm2,%%xmm0 \n"
477 "punpckhbw %%xmm2,%%xmm1 \n"
478 "movdqa %%xmm0,(%1,%0,2) \n"
479 "movdqa %%xmm1,0x10(%1,%0,2) \n"
480 "lea 0x10(%0),%0 \n"
481 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000482 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000483 : "+r"(src), // %0
484 "+r"(dst), // %1
485 "+r"(pix) // %2
486 :
487 : "memory", "cc", "eax"
488#if defined(__SSE2__)
489 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
490#endif
491 );
492}
493
494void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000495 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000496 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000497 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000498 "1: \n"
499 "movdqa (%0),%%xmm0 \n"
500 "movdqa 0x10(%0),%%xmm1 \n"
501 "movdqa 0x20(%0),%%xmm2 \n"
502 "movdqa 0x30(%0),%%xmm3 \n"
503 "lea 0x40(%0),%0 \n"
504 "pshufb %%xmm6,%%xmm0 \n"
505 "pshufb %%xmm6,%%xmm1 \n"
506 "pshufb %%xmm6,%%xmm2 \n"
507 "pshufb %%xmm6,%%xmm3 \n"
508 "movdqa %%xmm1,%%xmm4 \n"
509 "psrldq $0x4,%%xmm1 \n"
510 "pslldq $0xc,%%xmm4 \n"
511 "movdqa %%xmm2,%%xmm5 \n"
512 "por %%xmm4,%%xmm0 \n"
513 "pslldq $0x8,%%xmm5 \n"
514 "movdqa %%xmm0,(%1) \n"
515 "por %%xmm5,%%xmm1 \n"
516 "psrldq $0x8,%%xmm2 \n"
517 "pslldq $0x4,%%xmm3 \n"
518 "por %%xmm3,%%xmm2 \n"
519 "movdqa %%xmm1,0x10(%1) \n"
520 "movdqa %%xmm2,0x20(%1) \n"
521 "lea 0x30(%1),%1 \n"
522 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000523 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000524 : "+r"(src), // %0
525 "+r"(dst), // %1
526 "+r"(pix) // %2
527 : "m"(kShuffleMaskARGBToRGB24) // %3
528 : "memory", "cc"
529#if defined(__SSE2__)
530 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
531#endif
532 );
533}
534
535void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000536 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000537 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000538 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000539 "1: \n"
540 "movdqa (%0),%%xmm0 \n"
541 "movdqa 0x10(%0),%%xmm1 \n"
542 "movdqa 0x20(%0),%%xmm2 \n"
543 "movdqa 0x30(%0),%%xmm3 \n"
544 "lea 0x40(%0),%0 \n"
545 "pshufb %%xmm6,%%xmm0 \n"
546 "pshufb %%xmm6,%%xmm1 \n"
547 "pshufb %%xmm6,%%xmm2 \n"
548 "pshufb %%xmm6,%%xmm3 \n"
549 "movdqa %%xmm1,%%xmm4 \n"
550 "psrldq $0x4,%%xmm1 \n"
551 "pslldq $0xc,%%xmm4 \n"
552 "movdqa %%xmm2,%%xmm5 \n"
553 "por %%xmm4,%%xmm0 \n"
554 "pslldq $0x8,%%xmm5 \n"
555 "movdqa %%xmm0,(%1) \n"
556 "por %%xmm5,%%xmm1 \n"
557 "psrldq $0x8,%%xmm2 \n"
558 "pslldq $0x4,%%xmm3 \n"
559 "por %%xmm3,%%xmm2 \n"
560 "movdqa %%xmm1,0x10(%1) \n"
561 "movdqa %%xmm2,0x20(%1) \n"
562 "lea 0x30(%1),%1 \n"
563 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000564 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000565 : "+r"(src), // %0
566 "+r"(dst), // %1
567 "+r"(pix) // %2
568 : "m"(kShuffleMaskARGBToRAW) // %3
569 : "memory", "cc"
570#if defined(__SSE2__)
571 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
572#endif
573 );
574}
575
576void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000577 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000578 "pcmpeqb %%xmm3,%%xmm3 \n"
579 "psrld $0x1b,%%xmm3 \n"
580 "pcmpeqb %%xmm4,%%xmm4 \n"
581 "psrld $0x1a,%%xmm4 \n"
582 "pslld $0x5,%%xmm4 \n"
583 "pcmpeqb %%xmm5,%%xmm5 \n"
584 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000585 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000586 "1: \n"
587 "movdqa (%0),%%xmm0 \n"
588 "movdqa %%xmm0,%%xmm1 \n"
589 "movdqa %%xmm0,%%xmm2 \n"
590 "pslld $0x8,%%xmm0 \n"
591 "psrld $0x3,%%xmm1 \n"
592 "psrld $0x5,%%xmm2 \n"
593 "psrad $0x10,%%xmm0 \n"
594 "pand %%xmm3,%%xmm1 \n"
595 "pand %%xmm4,%%xmm2 \n"
596 "pand %%xmm5,%%xmm0 \n"
597 "por %%xmm2,%%xmm1 \n"
598 "por %%xmm1,%%xmm0 \n"
599 "packssdw %%xmm0,%%xmm0 \n"
600 "lea 0x10(%0),%0 \n"
601 "movq %%xmm0,(%1) \n"
602 "lea 0x8(%1),%1 \n"
603 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000604 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000605 : "+r"(src), // %0
606 "+r"(dst), // %1
607 "+r"(pix) // %2
608 :
609 : "memory", "cc"
610#if defined(__SSE2__)
611 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
612#endif
613 );
614}
615
616void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000617 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000618 "pcmpeqb %%xmm4,%%xmm4 \n"
619 "psrld $0x1b,%%xmm4 \n"
620 "movdqa %%xmm4,%%xmm5 \n"
621 "pslld $0x5,%%xmm5 \n"
622 "movdqa %%xmm4,%%xmm6 \n"
623 "pslld $0xa,%%xmm6 \n"
624 "pcmpeqb %%xmm7,%%xmm7 \n"
625 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000626 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000627 "1: \n"
628 "movdqa (%0),%%xmm0 \n"
629 "movdqa %%xmm0,%%xmm1 \n"
630 "movdqa %%xmm0,%%xmm2 \n"
631 "movdqa %%xmm0,%%xmm3 \n"
632 "psrad $0x10,%%xmm0 \n"
633 "psrld $0x3,%%xmm1 \n"
634 "psrld $0x6,%%xmm2 \n"
635 "psrld $0x9,%%xmm3 \n"
636 "pand %%xmm7,%%xmm0 \n"
637 "pand %%xmm4,%%xmm1 \n"
638 "pand %%xmm5,%%xmm2 \n"
639 "pand %%xmm6,%%xmm3 \n"
640 "por %%xmm1,%%xmm0 \n"
641 "por %%xmm3,%%xmm2 \n"
642 "por %%xmm2,%%xmm0 \n"
643 "packssdw %%xmm0,%%xmm0 \n"
644 "lea 0x10(%0),%0 \n"
645 "movq %%xmm0,(%1) \n"
646 "lea 0x8(%1),%1 \n"
647 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000648 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000649 : "+r"(src), // %0
650 "+r"(dst), // %1
651 "+r"(pix) // %2
652 :
653 : "memory", "cc"
654#if defined(__SSE2__)
655 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
656#endif
657 );
658}
659
660void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000661 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000662 "pcmpeqb %%xmm4,%%xmm4 \n"
663 "psllw $0xc,%%xmm4 \n"
664 "movdqa %%xmm4,%%xmm3 \n"
665 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000666 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000667 "1: \n"
668 "movdqa (%0),%%xmm0 \n"
669 "movdqa %%xmm0,%%xmm1 \n"
670 "pand %%xmm3,%%xmm0 \n"
671 "pand %%xmm4,%%xmm1 \n"
672 "psrlq $0x4,%%xmm0 \n"
673 "psrlq $0x8,%%xmm1 \n"
674 "por %%xmm1,%%xmm0 \n"
675 "packuswb %%xmm0,%%xmm0 \n"
676 "lea 0x10(%0),%0 \n"
677 "movq %%xmm0,(%1) \n"
678 "lea 0x8(%1),%1 \n"
679 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000680 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000681 : "+r"(src), // %0
682 "+r"(dst), // %1
683 "+r"(pix) // %2
684 :
685 : "memory", "cc"
686#if defined(__SSE2__)
687 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
688#endif
689 );
690}
691
fbarchard@google.comb6149762011-11-07 21:58:52 +0000692void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000693 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000694 "movdqa %4,%%xmm5 \n"
695 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000696 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000697 "1: \n"
698 "movdqa (%0),%%xmm0 \n"
699 "movdqa 0x10(%0),%%xmm1 \n"
700 "movdqa 0x20(%0),%%xmm2 \n"
701 "movdqa 0x30(%0),%%xmm3 \n"
702 "pmaddubsw %%xmm4,%%xmm0 \n"
703 "pmaddubsw %%xmm4,%%xmm1 \n"
704 "pmaddubsw %%xmm4,%%xmm2 \n"
705 "pmaddubsw %%xmm4,%%xmm3 \n"
706 "lea 0x40(%0),%0 \n"
707 "phaddw %%xmm1,%%xmm0 \n"
708 "phaddw %%xmm3,%%xmm2 \n"
709 "psrlw $0x7,%%xmm0 \n"
710 "psrlw $0x7,%%xmm2 \n"
711 "packuswb %%xmm2,%%xmm0 \n"
712 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000713 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000714 "movdqa %%xmm0,(%1) \n"
715 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000716 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000717 : "+r"(src_argb), // %0
718 "+r"(dst_y), // %1
719 "+r"(pix) // %2
720 : "m"(kARGBToY), // %3
721 "m"(kAddY16) // %4
722 : "memory", "cc"
723#if defined(__SSE2__)
724 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
725#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000726 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000727}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000728
729void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000730 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000731 "movdqa %4,%%xmm5 \n"
732 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000733 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000734 "1: \n"
735 "movdqu (%0),%%xmm0 \n"
736 "movdqu 0x10(%0),%%xmm1 \n"
737 "movdqu 0x20(%0),%%xmm2 \n"
738 "movdqu 0x30(%0),%%xmm3 \n"
739 "pmaddubsw %%xmm4,%%xmm0 \n"
740 "pmaddubsw %%xmm4,%%xmm1 \n"
741 "pmaddubsw %%xmm4,%%xmm2 \n"
742 "pmaddubsw %%xmm4,%%xmm3 \n"
743 "lea 0x40(%0),%0 \n"
744 "phaddw %%xmm1,%%xmm0 \n"
745 "phaddw %%xmm3,%%xmm2 \n"
746 "psrlw $0x7,%%xmm0 \n"
747 "psrlw $0x7,%%xmm2 \n"
748 "packuswb %%xmm2,%%xmm0 \n"
749 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000750 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000751 "movdqu %%xmm0,(%1) \n"
752 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000753 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000754 : "+r"(src_argb), // %0
755 "+r"(dst_y), // %1
756 "+r"(pix) // %2
757 : "m"(kARGBToY), // %3
758 "m"(kAddY16) // %4
759 : "memory", "cc"
760#if defined(__SSE2__)
761 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
762#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000763 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000764}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000765
fbarchard@google.com714050a2012-02-17 22:59:56 +0000766// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000767// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
768// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
769// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000770// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000771void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
772 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000773 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000774 "movdqa %0,%%xmm4 \n"
775 "movdqa %1,%%xmm3 \n"
776 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000777 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000778 : "m"(kARGBToU), // %0
779 "m"(kARGBToV), // %1
780 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000781 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000782 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000783 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000784 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000785 "1: \n"
786 "movdqa (%0),%%xmm0 \n"
787 "movdqa 0x10(%0),%%xmm1 \n"
788 "movdqa 0x20(%0),%%xmm2 \n"
789 "movdqa 0x30(%0),%%xmm6 \n"
790 "pavgb (%0,%4,1),%%xmm0 \n"
791 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
792 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
793 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
794 "lea 0x40(%0),%0 \n"
795 "movdqa %%xmm0,%%xmm7 \n"
796 "shufps $0x88,%%xmm1,%%xmm0 \n"
797 "shufps $0xdd,%%xmm1,%%xmm7 \n"
798 "pavgb %%xmm7,%%xmm0 \n"
799 "movdqa %%xmm2,%%xmm7 \n"
800 "shufps $0x88,%%xmm6,%%xmm2 \n"
801 "shufps $0xdd,%%xmm6,%%xmm7 \n"
802 "pavgb %%xmm7,%%xmm2 \n"
803 "movdqa %%xmm0,%%xmm1 \n"
804 "movdqa %%xmm2,%%xmm6 \n"
805 "pmaddubsw %%xmm4,%%xmm0 \n"
806 "pmaddubsw %%xmm4,%%xmm2 \n"
807 "pmaddubsw %%xmm3,%%xmm1 \n"
808 "pmaddubsw %%xmm3,%%xmm6 \n"
809 "phaddw %%xmm2,%%xmm0 \n"
810 "phaddw %%xmm6,%%xmm1 \n"
811 "psraw $0x8,%%xmm0 \n"
812 "psraw $0x8,%%xmm1 \n"
813 "packsswb %%xmm1,%%xmm0 \n"
814 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000815 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000816 "movlps %%xmm0,(%1) \n"
817 "movhps %%xmm0,(%1,%2,1) \n"
818 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000819 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000820 : "+r"(src_argb0), // %0
821 "+r"(dst_u), // %1
822 "+r"(dst_v), // %2
823 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000824 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000825 : "memory", "cc"
826#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000827 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000828#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000829 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000830}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000831
832void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
833 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000834 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000835 "movdqa %0,%%xmm4 \n"
836 "movdqa %1,%%xmm3 \n"
837 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000838 :
839 : "m"(kARGBToU), // %0
840 "m"(kARGBToV), // %1
841 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000842 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000843 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000844 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000845 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000846 "1: \n"
847 "movdqu (%0),%%xmm0 \n"
848 "movdqu 0x10(%0),%%xmm1 \n"
849 "movdqu 0x20(%0),%%xmm2 \n"
850 "movdqu 0x30(%0),%%xmm6 \n"
851 "movdqu (%0,%4,1),%%xmm7 \n"
852 "pavgb %%xmm7,%%xmm0 \n"
853 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
854 "pavgb %%xmm7,%%xmm1 \n"
855 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
856 "pavgb %%xmm7,%%xmm2 \n"
857 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
858 "pavgb %%xmm7,%%xmm6 \n"
859 "lea 0x40(%0),%0 \n"
860 "movdqa %%xmm0,%%xmm7 \n"
861 "shufps $0x88,%%xmm1,%%xmm0 \n"
862 "shufps $0xdd,%%xmm1,%%xmm7 \n"
863 "pavgb %%xmm7,%%xmm0 \n"
864 "movdqa %%xmm2,%%xmm7 \n"
865 "shufps $0x88,%%xmm6,%%xmm2 \n"
866 "shufps $0xdd,%%xmm6,%%xmm7 \n"
867 "pavgb %%xmm7,%%xmm2 \n"
868 "movdqa %%xmm0,%%xmm1 \n"
869 "movdqa %%xmm2,%%xmm6 \n"
870 "pmaddubsw %%xmm4,%%xmm0 \n"
871 "pmaddubsw %%xmm4,%%xmm2 \n"
872 "pmaddubsw %%xmm3,%%xmm1 \n"
873 "pmaddubsw %%xmm3,%%xmm6 \n"
874 "phaddw %%xmm2,%%xmm0 \n"
875 "phaddw %%xmm6,%%xmm1 \n"
876 "psraw $0x8,%%xmm0 \n"
877 "psraw $0x8,%%xmm1 \n"
878 "packsswb %%xmm1,%%xmm0 \n"
879 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000880 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000881 "movlps %%xmm0,(%1) \n"
882 "movhps %%xmm0,(%1,%2,1) \n"
883 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000884 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000885 : "+r"(src_argb0), // %0
886 "+r"(dst_u), // %1
887 "+r"(dst_v), // %2
888 "+rm"(width) // %3
889 : "r"(static_cast<intptr_t>(src_stride_argb))
890 : "memory", "cc"
891#if defined(__SSE2__)
892 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
893#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000894 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000895}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000896
fbarchard@google.com714050a2012-02-17 22:59:56 +0000897void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000898 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000899 "movdqa %4,%%xmm5 \n"
900 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000901 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000902 "1: \n"
903 "movdqa (%0),%%xmm0 \n"
904 "movdqa 0x10(%0),%%xmm1 \n"
905 "movdqa 0x20(%0),%%xmm2 \n"
906 "movdqa 0x30(%0),%%xmm3 \n"
907 "pmaddubsw %%xmm4,%%xmm0 \n"
908 "pmaddubsw %%xmm4,%%xmm1 \n"
909 "pmaddubsw %%xmm4,%%xmm2 \n"
910 "pmaddubsw %%xmm4,%%xmm3 \n"
911 "lea 0x40(%0),%0 \n"
912 "phaddw %%xmm1,%%xmm0 \n"
913 "phaddw %%xmm3,%%xmm2 \n"
914 "psrlw $0x7,%%xmm0 \n"
915 "psrlw $0x7,%%xmm2 \n"
916 "packuswb %%xmm2,%%xmm0 \n"
917 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000918 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000919 "movdqa %%xmm0,(%1) \n"
920 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000921 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000922 : "+r"(src_bgra), // %0
923 "+r"(dst_y), // %1
924 "+r"(pix) // %2
925 : "m"(kBGRAToY), // %3
926 "m"(kAddY16) // %4
927 : "memory", "cc"
928#if defined(__SSE2__)
929 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000930#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000931 );
932}
933
934void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000935 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000936 "movdqa %4,%%xmm5 \n"
937 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000938 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000939 "1: \n"
940 "movdqu (%0),%%xmm0 \n"
941 "movdqu 0x10(%0),%%xmm1 \n"
942 "movdqu 0x20(%0),%%xmm2 \n"
943 "movdqu 0x30(%0),%%xmm3 \n"
944 "pmaddubsw %%xmm4,%%xmm0 \n"
945 "pmaddubsw %%xmm4,%%xmm1 \n"
946 "pmaddubsw %%xmm4,%%xmm2 \n"
947 "pmaddubsw %%xmm4,%%xmm3 \n"
948 "lea 0x40(%0),%0 \n"
949 "phaddw %%xmm1,%%xmm0 \n"
950 "phaddw %%xmm3,%%xmm2 \n"
951 "psrlw $0x7,%%xmm0 \n"
952 "psrlw $0x7,%%xmm2 \n"
953 "packuswb %%xmm2,%%xmm0 \n"
954 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000955 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000956 "movdqu %%xmm0,(%1) \n"
957 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000958 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000959 : "+r"(src_bgra), // %0
960 "+r"(dst_y), // %1
961 "+r"(pix) // %2
962 : "m"(kBGRAToY), // %3
963 "m"(kAddY16) // %4
964 : "memory", "cc"
965#if defined(__SSE2__)
966 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
967#endif
968 );
969}
970
971void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
972 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000973 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000974 "movdqa %0,%%xmm4 \n"
975 "movdqa %1,%%xmm3 \n"
976 "movdqa %2,%%xmm5 \n"
977 :
978 : "m"(kBGRAToU), // %0
979 "m"(kBGRAToV), // %1
980 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000981 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000982 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000983 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000984 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000985 "1: \n"
986 "movdqa (%0),%%xmm0 \n"
987 "movdqa 0x10(%0),%%xmm1 \n"
988 "movdqa 0x20(%0),%%xmm2 \n"
989 "movdqa 0x30(%0),%%xmm6 \n"
990 "pavgb (%0,%4,1),%%xmm0 \n"
991 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
992 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
993 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
994 "lea 0x40(%0),%0 \n"
995 "movdqa %%xmm0,%%xmm7 \n"
996 "shufps $0x88,%%xmm1,%%xmm0 \n"
997 "shufps $0xdd,%%xmm1,%%xmm7 \n"
998 "pavgb %%xmm7,%%xmm0 \n"
999 "movdqa %%xmm2,%%xmm7 \n"
1000 "shufps $0x88,%%xmm6,%%xmm2 \n"
1001 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1002 "pavgb %%xmm7,%%xmm2 \n"
1003 "movdqa %%xmm0,%%xmm1 \n"
1004 "movdqa %%xmm2,%%xmm6 \n"
1005 "pmaddubsw %%xmm4,%%xmm0 \n"
1006 "pmaddubsw %%xmm4,%%xmm2 \n"
1007 "pmaddubsw %%xmm3,%%xmm1 \n"
1008 "pmaddubsw %%xmm3,%%xmm6 \n"
1009 "phaddw %%xmm2,%%xmm0 \n"
1010 "phaddw %%xmm6,%%xmm1 \n"
1011 "psraw $0x8,%%xmm0 \n"
1012 "psraw $0x8,%%xmm1 \n"
1013 "packsswb %%xmm1,%%xmm0 \n"
1014 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001015 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001016 "movlps %%xmm0,(%1) \n"
1017 "movhps %%xmm0,(%1,%2,1) \n"
1018 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001019 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001020 : "+r"(src_bgra0), // %0
1021 "+r"(dst_u), // %1
1022 "+r"(dst_v), // %2
1023 "+rm"(width) // %3
1024 : "r"(static_cast<intptr_t>(src_stride_bgra))
1025 : "memory", "cc"
1026#if defined(__SSE2__)
1027 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1028#endif
1029 );
1030}
1031
1032void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1033 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001034 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001035 "movdqa %0,%%xmm4 \n"
1036 "movdqa %1,%%xmm3 \n"
1037 "movdqa %2,%%xmm5 \n"
1038 :
1039 : "m"(kBGRAToU), // %0
1040 "m"(kBGRAToV), // %1
1041 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001042 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001043 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001044 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001045 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001046 "1: \n"
1047 "movdqu (%0),%%xmm0 \n"
1048 "movdqu 0x10(%0),%%xmm1 \n"
1049 "movdqu 0x20(%0),%%xmm2 \n"
1050 "movdqu 0x30(%0),%%xmm6 \n"
1051 "movdqu (%0,%4,1),%%xmm7 \n"
1052 "pavgb %%xmm7,%%xmm0 \n"
1053 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1054 "pavgb %%xmm7,%%xmm1 \n"
1055 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1056 "pavgb %%xmm7,%%xmm2 \n"
1057 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1058 "pavgb %%xmm7,%%xmm6 \n"
1059 "lea 0x40(%0),%0 \n"
1060 "movdqa %%xmm0,%%xmm7 \n"
1061 "shufps $0x88,%%xmm1,%%xmm0 \n"
1062 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1063 "pavgb %%xmm7,%%xmm0 \n"
1064 "movdqa %%xmm2,%%xmm7 \n"
1065 "shufps $0x88,%%xmm6,%%xmm2 \n"
1066 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1067 "pavgb %%xmm7,%%xmm2 \n"
1068 "movdqa %%xmm0,%%xmm1 \n"
1069 "movdqa %%xmm2,%%xmm6 \n"
1070 "pmaddubsw %%xmm4,%%xmm0 \n"
1071 "pmaddubsw %%xmm4,%%xmm2 \n"
1072 "pmaddubsw %%xmm3,%%xmm1 \n"
1073 "pmaddubsw %%xmm3,%%xmm6 \n"
1074 "phaddw %%xmm2,%%xmm0 \n"
1075 "phaddw %%xmm6,%%xmm1 \n"
1076 "psraw $0x8,%%xmm0 \n"
1077 "psraw $0x8,%%xmm1 \n"
1078 "packsswb %%xmm1,%%xmm0 \n"
1079 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001080 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001081 "movlps %%xmm0,(%1) \n"
1082 "movhps %%xmm0,(%1,%2,1) \n"
1083 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001084 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001085 : "+r"(src_bgra0), // %0
1086 "+r"(dst_u), // %1
1087 "+r"(dst_v), // %2
1088 "+rm"(width) // %3
1089 : "r"(static_cast<intptr_t>(src_stride_bgra))
1090 : "memory", "cc"
1091#if defined(__SSE2__)
1092 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1093#endif
1094 );
1095}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001096
1097void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001098 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001099 "movdqa %4,%%xmm5 \n"
1100 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001101 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001102 "1: \n"
1103 "movdqa (%0),%%xmm0 \n"
1104 "movdqa 0x10(%0),%%xmm1 \n"
1105 "movdqa 0x20(%0),%%xmm2 \n"
1106 "movdqa 0x30(%0),%%xmm3 \n"
1107 "pmaddubsw %%xmm4,%%xmm0 \n"
1108 "pmaddubsw %%xmm4,%%xmm1 \n"
1109 "pmaddubsw %%xmm4,%%xmm2 \n"
1110 "pmaddubsw %%xmm4,%%xmm3 \n"
1111 "lea 0x40(%0),%0 \n"
1112 "phaddw %%xmm1,%%xmm0 \n"
1113 "phaddw %%xmm3,%%xmm2 \n"
1114 "psrlw $0x7,%%xmm0 \n"
1115 "psrlw $0x7,%%xmm2 \n"
1116 "packuswb %%xmm2,%%xmm0 \n"
1117 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001118 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001119 "movdqa %%xmm0,(%1) \n"
1120 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001121 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001122 : "+r"(src_abgr), // %0
1123 "+r"(dst_y), // %1
1124 "+r"(pix) // %2
1125 : "m"(kABGRToY), // %3
1126 "m"(kAddY16) // %4
1127 : "memory", "cc"
1128#if defined(__SSE2__)
1129 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1130#endif
1131 );
1132}
1133
1134void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001135 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001136 "movdqa %4,%%xmm5 \n"
1137 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001138 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001139 "1: \n"
1140 "movdqu (%0),%%xmm0 \n"
1141 "movdqu 0x10(%0),%%xmm1 \n"
1142 "movdqu 0x20(%0),%%xmm2 \n"
1143 "movdqu 0x30(%0),%%xmm3 \n"
1144 "pmaddubsw %%xmm4,%%xmm0 \n"
1145 "pmaddubsw %%xmm4,%%xmm1 \n"
1146 "pmaddubsw %%xmm4,%%xmm2 \n"
1147 "pmaddubsw %%xmm4,%%xmm3 \n"
1148 "lea 0x40(%0),%0 \n"
1149 "phaddw %%xmm1,%%xmm0 \n"
1150 "phaddw %%xmm3,%%xmm2 \n"
1151 "psrlw $0x7,%%xmm0 \n"
1152 "psrlw $0x7,%%xmm2 \n"
1153 "packuswb %%xmm2,%%xmm0 \n"
1154 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001155 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001156 "movdqu %%xmm0,(%1) \n"
1157 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001158 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001159 : "+r"(src_abgr), // %0
1160 "+r"(dst_y), // %1
1161 "+r"(pix) // %2
1162 : "m"(kABGRToY), // %3
1163 "m"(kAddY16) // %4
1164 : "memory", "cc"
1165#if defined(__SSE2__)
1166 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1167#endif
1168 );
1169}
1170
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001171void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1172 asm volatile (
1173 "movdqa %4,%%xmm5 \n"
1174 "movdqa %3,%%xmm4 \n"
1175 ".p2align 4 \n"
1176 "1: \n"
1177 "movdqa (%0),%%xmm0 \n"
1178 "movdqa 0x10(%0),%%xmm1 \n"
1179 "movdqa 0x20(%0),%%xmm2 \n"
1180 "movdqa 0x30(%0),%%xmm3 \n"
1181 "pmaddubsw %%xmm4,%%xmm0 \n"
1182 "pmaddubsw %%xmm4,%%xmm1 \n"
1183 "pmaddubsw %%xmm4,%%xmm2 \n"
1184 "pmaddubsw %%xmm4,%%xmm3 \n"
1185 "lea 0x40(%0),%0 \n"
1186 "phaddw %%xmm1,%%xmm0 \n"
1187 "phaddw %%xmm3,%%xmm2 \n"
1188 "psrlw $0x7,%%xmm0 \n"
1189 "psrlw $0x7,%%xmm2 \n"
1190 "packuswb %%xmm2,%%xmm0 \n"
1191 "paddb %%xmm5,%%xmm0 \n"
1192 "sub $0x10,%2 \n"
1193 "movdqa %%xmm0,(%1) \n"
1194 "lea 0x10(%1),%1 \n"
1195 "jg 1b \n"
1196 : "+r"(src_rgba), // %0
1197 "+r"(dst_y), // %1
1198 "+r"(pix) // %2
1199 : "m"(kRGBAToY), // %3
1200 "m"(kAddY16) // %4
1201 : "memory", "cc"
1202#if defined(__SSE2__)
1203 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1204#endif
1205 );
1206}
1207
1208void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1209 asm volatile (
1210 "movdqa %4,%%xmm5 \n"
1211 "movdqa %3,%%xmm4 \n"
1212 ".p2align 4 \n"
1213 "1: \n"
1214 "movdqu (%0),%%xmm0 \n"
1215 "movdqu 0x10(%0),%%xmm1 \n"
1216 "movdqu 0x20(%0),%%xmm2 \n"
1217 "movdqu 0x30(%0),%%xmm3 \n"
1218 "pmaddubsw %%xmm4,%%xmm0 \n"
1219 "pmaddubsw %%xmm4,%%xmm1 \n"
1220 "pmaddubsw %%xmm4,%%xmm2 \n"
1221 "pmaddubsw %%xmm4,%%xmm3 \n"
1222 "lea 0x40(%0),%0 \n"
1223 "phaddw %%xmm1,%%xmm0 \n"
1224 "phaddw %%xmm3,%%xmm2 \n"
1225 "psrlw $0x7,%%xmm0 \n"
1226 "psrlw $0x7,%%xmm2 \n"
1227 "packuswb %%xmm2,%%xmm0 \n"
1228 "paddb %%xmm5,%%xmm0 \n"
1229 "sub $0x10,%2 \n"
1230 "movdqu %%xmm0,(%1) \n"
1231 "lea 0x10(%1),%1 \n"
1232 "jg 1b \n"
1233 : "+r"(src_rgba), // %0
1234 "+r"(dst_y), // %1
1235 "+r"(pix) // %2
1236 : "m"(kRGBAToY), // %3
1237 "m"(kAddY16) // %4
1238 : "memory", "cc"
1239#if defined(__SSE2__)
1240 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1241#endif
1242 );
1243}
1244
fbarchard@google.com714050a2012-02-17 22:59:56 +00001245void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1246 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001247 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001248 "movdqa %0,%%xmm4 \n"
1249 "movdqa %1,%%xmm3 \n"
1250 "movdqa %2,%%xmm5 \n"
1251 :
1252 : "m"(kABGRToU), // %0
1253 "m"(kABGRToV), // %1
1254 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001255 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001256 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001257 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001258 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001259 "1: \n"
1260 "movdqa (%0),%%xmm0 \n"
1261 "movdqa 0x10(%0),%%xmm1 \n"
1262 "movdqa 0x20(%0),%%xmm2 \n"
1263 "movdqa 0x30(%0),%%xmm6 \n"
1264 "pavgb (%0,%4,1),%%xmm0 \n"
1265 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1266 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1267 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1268 "lea 0x40(%0),%0 \n"
1269 "movdqa %%xmm0,%%xmm7 \n"
1270 "shufps $0x88,%%xmm1,%%xmm0 \n"
1271 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1272 "pavgb %%xmm7,%%xmm0 \n"
1273 "movdqa %%xmm2,%%xmm7 \n"
1274 "shufps $0x88,%%xmm6,%%xmm2 \n"
1275 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1276 "pavgb %%xmm7,%%xmm2 \n"
1277 "movdqa %%xmm0,%%xmm1 \n"
1278 "movdqa %%xmm2,%%xmm6 \n"
1279 "pmaddubsw %%xmm4,%%xmm0 \n"
1280 "pmaddubsw %%xmm4,%%xmm2 \n"
1281 "pmaddubsw %%xmm3,%%xmm1 \n"
1282 "pmaddubsw %%xmm3,%%xmm6 \n"
1283 "phaddw %%xmm2,%%xmm0 \n"
1284 "phaddw %%xmm6,%%xmm1 \n"
1285 "psraw $0x8,%%xmm0 \n"
1286 "psraw $0x8,%%xmm1 \n"
1287 "packsswb %%xmm1,%%xmm0 \n"
1288 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001289 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001290 "movlps %%xmm0,(%1) \n"
1291 "movhps %%xmm0,(%1,%2,1) \n"
1292 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001293 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001294 : "+r"(src_abgr0), // %0
1295 "+r"(dst_u), // %1
1296 "+r"(dst_v), // %2
1297 "+rm"(width) // %3
1298 : "r"(static_cast<intptr_t>(src_stride_abgr))
1299 : "memory", "cc"
1300#if defined(__SSE2__)
1301 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1302#endif
1303 );
1304}
1305
1306void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1307 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001308 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001309 "movdqa %0,%%xmm4 \n"
1310 "movdqa %1,%%xmm3 \n"
1311 "movdqa %2,%%xmm5 \n"
1312 :
1313 : "m"(kABGRToU), // %0
1314 "m"(kABGRToV), // %1
1315 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001316 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001317 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001318 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001319 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001320 "1: \n"
1321 "movdqu (%0),%%xmm0 \n"
1322 "movdqu 0x10(%0),%%xmm1 \n"
1323 "movdqu 0x20(%0),%%xmm2 \n"
1324 "movdqu 0x30(%0),%%xmm6 \n"
1325 "movdqu (%0,%4,1),%%xmm7 \n"
1326 "pavgb %%xmm7,%%xmm0 \n"
1327 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1328 "pavgb %%xmm7,%%xmm1 \n"
1329 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1330 "pavgb %%xmm7,%%xmm2 \n"
1331 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1332 "pavgb %%xmm7,%%xmm6 \n"
1333 "lea 0x40(%0),%0 \n"
1334 "movdqa %%xmm0,%%xmm7 \n"
1335 "shufps $0x88,%%xmm1,%%xmm0 \n"
1336 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1337 "pavgb %%xmm7,%%xmm0 \n"
1338 "movdqa %%xmm2,%%xmm7 \n"
1339 "shufps $0x88,%%xmm6,%%xmm2 \n"
1340 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1341 "pavgb %%xmm7,%%xmm2 \n"
1342 "movdqa %%xmm0,%%xmm1 \n"
1343 "movdqa %%xmm2,%%xmm6 \n"
1344 "pmaddubsw %%xmm4,%%xmm0 \n"
1345 "pmaddubsw %%xmm4,%%xmm2 \n"
1346 "pmaddubsw %%xmm3,%%xmm1 \n"
1347 "pmaddubsw %%xmm3,%%xmm6 \n"
1348 "phaddw %%xmm2,%%xmm0 \n"
1349 "phaddw %%xmm6,%%xmm1 \n"
1350 "psraw $0x8,%%xmm0 \n"
1351 "psraw $0x8,%%xmm1 \n"
1352 "packsswb %%xmm1,%%xmm0 \n"
1353 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001354 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001355 "movlps %%xmm0,(%1) \n"
1356 "movhps %%xmm0,(%1,%2,1) \n"
1357 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001358 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001359 : "+r"(src_abgr0), // %0
1360 "+r"(dst_u), // %1
1361 "+r"(dst_v), // %2
1362 "+rm"(width) // %3
1363 : "r"(static_cast<intptr_t>(src_stride_abgr))
1364 : "memory", "cc"
1365#if defined(__SSE2__)
1366 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1367#endif
1368 );
1369}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001370
1371void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1372 uint8* dst_u, uint8* dst_v, int width) {
1373 asm volatile (
1374 "movdqa %0,%%xmm4 \n"
1375 "movdqa %1,%%xmm3 \n"
1376 "movdqa %2,%%xmm5 \n"
1377 :
1378 : "m"(kRGBAToU), // %0
1379 "m"(kRGBAToV), // %1
1380 "m"(kAddUV128) // %2
1381 );
1382 asm volatile (
1383 "sub %1,%2 \n"
1384 ".p2align 4 \n"
1385 "1: \n"
1386 "movdqa (%0),%%xmm0 \n"
1387 "movdqa 0x10(%0),%%xmm1 \n"
1388 "movdqa 0x20(%0),%%xmm2 \n"
1389 "movdqa 0x30(%0),%%xmm6 \n"
1390 "pavgb (%0,%4,1),%%xmm0 \n"
1391 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1392 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1393 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1394 "lea 0x40(%0),%0 \n"
1395 "movdqa %%xmm0,%%xmm7 \n"
1396 "shufps $0x88,%%xmm1,%%xmm0 \n"
1397 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1398 "pavgb %%xmm7,%%xmm0 \n"
1399 "movdqa %%xmm2,%%xmm7 \n"
1400 "shufps $0x88,%%xmm6,%%xmm2 \n"
1401 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1402 "pavgb %%xmm7,%%xmm2 \n"
1403 "movdqa %%xmm0,%%xmm1 \n"
1404 "movdqa %%xmm2,%%xmm6 \n"
1405 "pmaddubsw %%xmm4,%%xmm0 \n"
1406 "pmaddubsw %%xmm4,%%xmm2 \n"
1407 "pmaddubsw %%xmm3,%%xmm1 \n"
1408 "pmaddubsw %%xmm3,%%xmm6 \n"
1409 "phaddw %%xmm2,%%xmm0 \n"
1410 "phaddw %%xmm6,%%xmm1 \n"
1411 "psraw $0x8,%%xmm0 \n"
1412 "psraw $0x8,%%xmm1 \n"
1413 "packsswb %%xmm1,%%xmm0 \n"
1414 "paddb %%xmm5,%%xmm0 \n"
1415 "sub $0x10,%3 \n"
1416 "movlps %%xmm0,(%1) \n"
1417 "movhps %%xmm0,(%1,%2,1) \n"
1418 "lea 0x8(%1),%1 \n"
1419 "jg 1b \n"
1420 : "+r"(src_rgba0), // %0
1421 "+r"(dst_u), // %1
1422 "+r"(dst_v), // %2
1423 "+rm"(width) // %3
1424 : "r"(static_cast<intptr_t>(src_stride_rgba))
1425 : "memory", "cc"
1426#if defined(__SSE2__)
1427 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1428#endif
1429 );
1430}
1431
1432void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1433 uint8* dst_u, uint8* dst_v, int width) {
1434 asm volatile (
1435 "movdqa %0,%%xmm4 \n"
1436 "movdqa %1,%%xmm3 \n"
1437 "movdqa %2,%%xmm5 \n"
1438 :
1439 : "m"(kRGBAToU), // %0
1440 "m"(kRGBAToV), // %1
1441 "m"(kAddUV128) // %2
1442 );
1443 asm volatile (
1444 "sub %1,%2 \n"
1445 ".p2align 4 \n"
1446 "1: \n"
1447 "movdqu (%0),%%xmm0 \n"
1448 "movdqu 0x10(%0),%%xmm1 \n"
1449 "movdqu 0x20(%0),%%xmm2 \n"
1450 "movdqu 0x30(%0),%%xmm6 \n"
1451 "movdqu (%0,%4,1),%%xmm7 \n"
1452 "pavgb %%xmm7,%%xmm0 \n"
1453 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1454 "pavgb %%xmm7,%%xmm1 \n"
1455 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1456 "pavgb %%xmm7,%%xmm2 \n"
1457 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1458 "pavgb %%xmm7,%%xmm6 \n"
1459 "lea 0x40(%0),%0 \n"
1460 "movdqa %%xmm0,%%xmm7 \n"
1461 "shufps $0x88,%%xmm1,%%xmm0 \n"
1462 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1463 "pavgb %%xmm7,%%xmm0 \n"
1464 "movdqa %%xmm2,%%xmm7 \n"
1465 "shufps $0x88,%%xmm6,%%xmm2 \n"
1466 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1467 "pavgb %%xmm7,%%xmm2 \n"
1468 "movdqa %%xmm0,%%xmm1 \n"
1469 "movdqa %%xmm2,%%xmm6 \n"
1470 "pmaddubsw %%xmm4,%%xmm0 \n"
1471 "pmaddubsw %%xmm4,%%xmm2 \n"
1472 "pmaddubsw %%xmm3,%%xmm1 \n"
1473 "pmaddubsw %%xmm3,%%xmm6 \n"
1474 "phaddw %%xmm2,%%xmm0 \n"
1475 "phaddw %%xmm6,%%xmm1 \n"
1476 "psraw $0x8,%%xmm0 \n"
1477 "psraw $0x8,%%xmm1 \n"
1478 "packsswb %%xmm1,%%xmm0 \n"
1479 "paddb %%xmm5,%%xmm0 \n"
1480 "sub $0x10,%3 \n"
1481 "movlps %%xmm0,(%1) \n"
1482 "movhps %%xmm0,(%1,%2,1) \n"
1483 "lea 0x8(%1),%1 \n"
1484 "jg 1b \n"
1485 : "+r"(src_rgba0), // %0
1486 "+r"(dst_u), // %1
1487 "+r"(dst_v), // %2
1488 "+rm"(width) // %3
1489 : "r"(static_cast<intptr_t>(src_stride_rgba))
1490 : "memory", "cc"
1491#if defined(__SSE2__)
1492 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1493#endif
1494 );
1495}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001496#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001497
fbarchard@google.come214fe32012-06-04 23:47:11 +00001498#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients in fixed point with 6 fractional bits (scaled by
// 64). Negative values and compound expressions are parenthesized so the
// macros expand safely inside any larger expression.
#define UB 127 /* 2.018 * 64 = 129 does not fit in int8; clamped to 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the value each channel's U and V coefficients produce for
// U = V = 128.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001513
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001514struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001515 vec8 kUVToB; // 0
1516 vec8 kUVToG; // 16
1517 vec8 kUVToR; // 32
1518 vec16 kUVBiasB; // 48
1519 vec16 kUVBiasG; // 64
1520 vec16 kUVBiasR; // 80
1521 vec16 kYSub16; // 96
1522 vec16 kYToRgb; // 112
1523 vec8 kVUToB; // 128
1524 vec8 kVUToG; // 144
1525 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001526} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001527 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1528 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1529 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1530 { BB, BB, BB, BB, BB, BB, BB, BB },
1531 { BG, BG, BG, BG, BG, BG, BG, BG },
1532 { BR, BR, BR, BR, BR, BR, BR, BR },
1533 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001534 { YG, YG, YG, YG, YG, YG, YG, YG },
1535 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1536 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1537 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001538};
1539
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001540
// Chroma fetch fragments for the row converters. Each one leaves interleaved
// U,V byte pairs for 8 pixels in xmm0 and advances the chroma pointer.
// The planar variants use xmm1 as scratch and address V as
// (%[u_buf],%[v_buf],1), so %[v_buf] must already hold v_buf - u_buf.

// Read 8 UV from 444
#define READYUV444 \
    "movq (%[u_buf]),%%xmm0 \n" \
    "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x8(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x4(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Read 2 UV from 411, upsample to 8 UV
// Only 2 bytes per plane are consumed per pass, so load exactly 16 bits with
// pinsrw; a 32-bit movd would read 2 bytes past the end of each plane on the
// final pass. The stale upper bytes of xmm0/xmm1 are discarded by the unpacks,
// which only propagate bytes 0-1 of each register.
#define READYUV411 \
    "pinsrw $0x0,(%[u_buf]),%%xmm0 \n" \
    "pinsrw $0x0,(%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x2(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
    "movq (%[uv_buf]),%%xmm0 \n" \
    "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"
// Second half shared by YUVTORGB and YVUTORGB: subtract the per-channel bias
// (offsets 48/64/80), load 8 Y bytes from %[y_buf] and advance it, widen them
// against xmm4 (which the caller must have zeroed), subtract the value at
// offset 96 and scale by the value at offset 112, add the result to each
// channel, shift right by 6 and pack each channel to unsigned bytes.
// Uses xmm3 as scratch.
#define YUVTORGB_BIAS_LUMA_PACK \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

// Convert 8 pixels: 8 UV and 8 Y
// Takes the UV pairs in xmm0 and leaves the three channels in xmm0, xmm1 and
// xmm2, using the coefficient rows at offsets 0, 16 and 32.
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
    YUVTORGB_BIAS_LUMA_PACK

// Convert 8 pixels: 8 VU and 8 Y
// Same as YUVTORGB but uses the coefficient rows at offsets 128, 144 and 160.
#define YVUTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
    YUVTORGB_BIAS_LUMA_PACK
fbarchard@google.come214fe32012-06-04 23:47:11 +00001620
// Converts a row from separate Y, U and V buffers to 4-byte pixels using the
// READYUV444 and YUVTORGB macro fragments.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
// The movdqa stores require argb_buf to be 16-byte aligned.
1621void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001622 const uint8* u_buf,
1623 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001624 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001625 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001626 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001627 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001628 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
1629 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001630 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001631 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001632 READYUV444
1633 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001634 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1635 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
1636 "movdqa %%xmm0,%%xmm1 \n"
1637 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
1638 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001639 "movdqa %%xmm0,(%[argb_buf]) \n"
1640 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1641 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1642 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001643 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001644 : [y_buf]"+r"(y_buf), // %[y_buf]
1645 [u_buf]"+r"(u_buf), // %[u_buf]
1646 [v_buf]"+r"(v_buf), // %[v_buf]
1647 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1648 [width]"+rm"(width) // %[width]
1649 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001650 : "memory", "cc"
1651#if defined(__SSE2__)
1652 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1653#endif
1654 );
1655}
1656
// Converts a row from separate Y, U and V buffers to 3-byte pixels.
// READYUV422 and YUVTORGB produce channel bytes in xmm0..xmm2; these are
// woven into 4-byte pixels (xmm2 duplicated as the filler 4th byte), then
// pshufb with the two kShuffleMaskARGBToRGB24* masks and palignr repack them
// so that movq + movdqu emit 24 contiguous bytes (8 pixels) per pass.
// The loop runs at least once and repeats while width, reduced by 8 per pass,
// is still positive. The stores (movq/movdqu) have no alignment requirement.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001657void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1658 const uint8* u_buf,
1659 const uint8* v_buf,
1660 uint8* rgb24_buf,
1661 int width) {
1662// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
// On __APPLE__ the two shuffle masks are loaded by a separate asm statement,
// so the main statement carries no memory operands for them.
// NOTE(review): that first statement lists no clobbers, and the main
// statement then reads xmm5/xmm6 without loading them - this relies on the
// compiler leaving those registers untouched in between; confirm this holds.
1663#ifdef __APPLE__
1664 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001665 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1666 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1667 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1668 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001669#endif
1670
1671 asm volatile (
1672#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001673 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1674 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001675#endif
1676 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
1677 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
1678 ".p2align 4 \n"
1679 "1: \n"
1680 READYUV422
1681 YUVTORGB
1682 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1683 "punpcklbw %%xmm2,%%xmm2 \n" // byte pairs: xmm2, xmm2 (4th byte is filler)
1684 "movdqa %%xmm0,%%xmm1 \n"
1685 "punpcklwd %%xmm2,%%xmm0 \n" // 4-byte pixels 0..3
1686 "punpckhwd %%xmm2,%%xmm1 \n" // 4-byte pixels 4..7
1687 "pshufb %%xmm5,%%xmm0 \n"
1688 "pshufb %%xmm6,%%xmm1 \n"
1689 "palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = bytes 12..27 of xmm1:xmm0
1690 "movq %%xmm0,(%[rgb24_buf]) \n" // output bytes 0..7
1691 "movdqu %%xmm1,0x8(%[rgb24_buf]) \n" // output bytes 8..23
1692 "lea 0x18(%[rgb24_buf]),%[rgb24_buf] \n"
1693 "sub $0x8,%[width] \n"
1694 "jg 1b \n"
1695 : [y_buf]"+r"(y_buf), // %[y_buf]
1696 [u_buf]"+r"(u_buf), // %[u_buf]
1697 [v_buf]"+r"(v_buf), // %[v_buf]
1698 [rgb24_buf]"+r"(rgb24_buf), // %[rgb24_buf]
1699 [width]"+rm"(width) // %[width]
1700 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1701#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001702 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1703 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001704#endif
1705 : "memory", "cc"
1706#if defined(__SSE2__)
1707 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1708#endif
1709 );
1710}
1711
// Converts a row from separate Y, U and V buffers to 3-byte pixels, using the
// kShuffleMaskARGBToRAW* masks for the final byte arrangement.
// READYUV422 and YUVTORGB produce channel bytes in xmm0..xmm2; these are
// woven into 4-byte pixels (xmm2 duplicated as the filler 4th byte), then
// pshufb and palignr repack them so that movq + movdqu emit 24 contiguous
// bytes (8 pixels) per pass.
// The loop runs at least once and repeats while width, reduced by 8 per pass,
// is still positive. The stores (movq/movdqu) have no alignment requirement.
1712void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1713 const uint8* u_buf,
1714 const uint8* v_buf,
1715 uint8* raw_buf,
1716 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001717// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
// On __APPLE__ the two shuffle masks are loaded by a separate asm statement,
// so the main statement carries no memory operands for them.
// NOTE(review): that first statement lists no clobbers, and the main
// statement then reads xmm5/xmm6 without loading them - this relies on the
// compiler leaving those registers untouched in between; confirm this holds.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001718#ifdef __APPLE__
1719 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001720 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1721 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1722 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1723 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001724#endif
1725
1726 asm volatile (
1727#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001728 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1729 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001730#endif
1731 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
1732 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
1733 ".p2align 4 \n"
1734 "1: \n"
1735 READYUV422
1736 YUVTORGB
1737 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1738 "punpcklbw %%xmm2,%%xmm2 \n" // byte pairs: xmm2, xmm2 (4th byte is filler)
1739 "movdqa %%xmm0,%%xmm1 \n"
1740 "punpcklwd %%xmm2,%%xmm0 \n" // 4-byte pixels 0..3
1741 "punpckhwd %%xmm2,%%xmm1 \n" // 4-byte pixels 4..7
1742 "pshufb %%xmm5,%%xmm0 \n"
1743 "pshufb %%xmm6,%%xmm1 \n"
1744 "palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = bytes 12..27 of xmm1:xmm0
1745 "movq %%xmm0,(%[raw_buf]) \n" // output bytes 0..7
1746 "movdqu %%xmm1,0x8(%[raw_buf]) \n" // output bytes 8..23
1747 "lea 0x18(%[raw_buf]),%[raw_buf] \n"
1748 "sub $0x8,%[width] \n"
1749 "jg 1b \n"
1750 : [y_buf]"+r"(y_buf), // %[y_buf]
1751 [u_buf]"+r"(u_buf), // %[u_buf]
1752 [v_buf]"+r"(v_buf), // %[v_buf]
1753 [raw_buf]"+r"(raw_buf), // %[raw_buf]
1754 [width]"+rm"(width) // %[width]
1755 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1756#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001757 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1758 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001759#endif
1760 : "memory", "cc"
1761#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001762 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001763#endif
1764 );
1765}
1766
// Converts a row from separate Y, U and V buffers to 4-byte pixels using the
// READYUV422 and YUVTORGB macro fragments.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
// The movdqa stores require argb_buf to be 16-byte aligned.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001767void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001768 const uint8* u_buf,
1769 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001770 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001771 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001772 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001773 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001774 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
1775 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001776 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001777 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001778 READYUV422
1779 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001780 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1781 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
1782 "movdqa %%xmm0,%%xmm1 \n"
1783 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
1784 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001785 "movdqa %%xmm0,(%[argb_buf]) \n"
1786 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1787 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1788 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001789 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001790 : [y_buf]"+r"(y_buf), // %[y_buf]
1791 [u_buf]"+r"(u_buf), // %[u_buf]
1792 [v_buf]"+r"(v_buf), // %[v_buf]
1793 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1794 [width]"+rm"(width) // %[width]
1795 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001796 : "memory", "cc"
1797#if defined(__SSE2__)
1798 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1799#endif
1800 );
1801}
1802
// Converts a row from separate Y, U and V buffers to 4-byte pixels using the
// READYUV411 and YUVTORGB macro fragments.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
// The movdqa stores require argb_buf to be 16-byte aligned.
1803void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1804 const uint8* u_buf,
1805 const uint8* v_buf,
1806 uint8* argb_buf,
1807 int width) {
1808 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001809 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.come214fe32012-06-04 23:47:11 +00001810 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
1811 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
1812 ".p2align 4 \n"
1813 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001814 READYUV411
1815 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001816 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1817 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
1818 "movdqa %%xmm0,%%xmm1 \n"
1819 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
1820 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001821 "movdqa %%xmm0,(%[argb_buf]) \n"
1822 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1823 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1824 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001825 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001826 : [y_buf]"+r"(y_buf), // %[y_buf]
1827 [u_buf]"+r"(u_buf), // %[u_buf]
1828 [v_buf]"+r"(v_buf), // %[v_buf]
1829 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1830 [width]"+rm"(width) // %[width]
1831 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1832 : "memory", "cc"
1833#if defined(__SSE2__)
1834 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1835#endif
1836 );
1837}
1838
// Converts a row from a Y buffer plus a single uv_buf chroma buffer to 4-byte
// pixels using the READNV12 and YUVTORGB macro fragments.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
// The movdqa stores require argb_buf to be 16-byte aligned.
1839void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1840 const uint8* uv_buf,
1841 uint8* argb_buf,
1842 int width) {
1843 asm volatile (
1844 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
1845 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
1846 ".p2align 4 \n"
1847 "1: \n"
1848 READNV12
1849 YUVTORGB
1850 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1851 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
1852 "movdqa %%xmm0,%%xmm1 \n"
1853 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
1854 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
1855 "movdqa %%xmm0,(%[argb_buf]) \n"
1856 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1857 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1858 "sub $0x8,%[width] \n"
1859 "jg 1b \n"
1860 : [y_buf]"+r"(y_buf), // %[y_buf]
1861 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1862 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1863 [width]"+rm"(width) // %[width]
1864 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1865 : "memory", "cc"
1866#if defined(__SSE2__)
1867 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1868#endif
1869 );
1870}
1871
// Converts a row from a Y buffer plus a single vu_buf chroma buffer to 4-byte
// pixels. It reuses READNV12 for the chroma load (vu_buf is bound to the
// operand name uv_buf for that purpose) and uses YVUTORGB instead of YUVTORGB
// for the conversion.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
// The movdqa stores require argb_buf to be 16-byte aligned.
1872void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1873 const uint8* vu_buf,
1874 uint8* argb_buf,
1875 int width) {
1876 asm volatile (
1877 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
1878 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
1879 ".p2align 4 \n"
1880 "1: \n"
1881 READNV12
1882 YVUTORGB
1883 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1884 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
1885 "movdqa %%xmm0,%%xmm1 \n"
1886 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
1887 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
1888 "movdqa %%xmm0,(%[argb_buf]) \n"
1889 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1890 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1891 "sub $0x8,%[width] \n"
1892 "jg 1b \n"
1893 : [y_buf]"+r"(y_buf), // %[y_buf]
1894 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1895 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1896 [width]"+rm"(width) // %[width]
1897 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001898 : "memory", "cc"
1899#if defined(__SSE2__)
1900 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1901#endif
1902 );
1903}
1904
// Variant of the READYUV444 + YUVTORGB row conversion whose stores use
// movdqu, so argb_buf does not have to be 16-byte aligned.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
1905void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1906 const uint8* u_buf,
1907 const uint8* v_buf,
1908 uint8* argb_buf,
1909 int width) {
1910 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001911 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.come214fe32012-06-04 23:47:11 +00001912 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
1913 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
1914 ".p2align 4 \n"
1915 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001916 READYUV444
1917 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001918 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1919 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
1920 "movdqa %%xmm0,%%xmm1 \n"
1921 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
1922 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001923 "movdqu %%xmm0,(%[argb_buf]) \n"
1924 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1925 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1926 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001927 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001928 : [y_buf]"+r"(y_buf), // %[y_buf]
1929 [u_buf]"+r"(u_buf), // %[u_buf]
1930 [v_buf]"+r"(v_buf), // %[v_buf]
1931 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1932 [width]"+rm"(width) // %[width]
1933 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001934 : "memory", "cc"
1935#if defined(__SSE2__)
1936 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1937#endif
1938 );
1939}
1940
// Variant of the READYUV422 + YUVTORGB row conversion whose stores use
// movdqu, so argb_buf does not have to be 16-byte aligned.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
1941void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1942 const uint8* u_buf,
1943 const uint8* v_buf,
1944 uint8* argb_buf,
1945 int width) {
1946 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001947 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.come214fe32012-06-04 23:47:11 +00001948 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
1949 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
1950 ".p2align 4 \n"
1951 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001952 READYUV422
1953 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001954 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1955 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
1956 "movdqa %%xmm0,%%xmm1 \n"
1957 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
1958 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001959 "movdqu %%xmm0,(%[argb_buf]) \n"
1960 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1961 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1962 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001963 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001964 : [y_buf]"+r"(y_buf), // %[y_buf]
1965 [u_buf]"+r"(u_buf), // %[u_buf]
1966 [v_buf]"+r"(v_buf), // %[v_buf]
1967 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1968 [width]"+rm"(width) // %[width]
1969 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001970 : "memory", "cc"
1971#if defined(__SSE2__)
1972 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1973#endif
1974 );
1975}
1976
// Variant of the READYUV411 + YUVTORGB row conversion whose stores use
// movdqu, so argb_buf does not have to be 16-byte aligned.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
1977void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1978 const uint8* u_buf,
1979 const uint8* v_buf,
1980 uint8* argb_buf,
1981 int width) {
1982 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001983 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.come214fe32012-06-04 23:47:11 +00001984 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
1985 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
1986 ".p2align 4 \n"
1987 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001988 READYUV411
1989 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001990 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
1991 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
1992 "movdqa %%xmm0,%%xmm1 \n"
1993 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
1994 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001995 "movdqu %%xmm0,(%[argb_buf]) \n"
1996 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1997 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1998 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001999 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002000 : [y_buf]"+r"(y_buf), // %[y_buf]
2001 [u_buf]"+r"(u_buf), // %[u_buf]
2002 [v_buf]"+r"(v_buf), // %[v_buf]
2003 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2004 [width]"+rm"(width) // %[width]
2005 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2006 : "memory", "cc"
2007#if defined(__SSE2__)
2008 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2009#endif
2010 );
2011}
2012
// Variant of the READNV12 + YUVTORGB row conversion whose stores use movdqu,
// so argb_buf does not have to be 16-byte aligned.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
2013void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2014 const uint8* uv_buf,
2015 uint8* argb_buf,
2016 int width) {
2017 asm volatile (
2018 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
2019 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
2020 ".p2align 4 \n"
2021 "1: \n"
2022 READNV12
2023 YUVTORGB
2024 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
2025 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
2026 "movdqa %%xmm0,%%xmm1 \n"
2027 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
2028 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
2029 "movdqu %%xmm0,(%[argb_buf]) \n"
2030 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2031 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2032 "sub $0x8,%[width] \n"
2033 "jg 1b \n"
2034 : [y_buf]"+r"(y_buf), // %[y_buf]
2035 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2036 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2037 [width]"+rm"(width) // %[width]
2038 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2039 : "memory", "cc"
2040#if defined(__SSE2__)
2041 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2042#endif
2043 );
2044}
2045
// Variant of the READNV12 + YVUTORGB row conversion whose stores use movdqu,
// so argb_buf does not have to be 16-byte aligned. vu_buf is bound to the
// operand name uv_buf so that READNV12 can be reused for the chroma load.
// Each output pixel is the xmm0, xmm1 and xmm2 channel bytes followed by 0xff
// (xmm0..xmm2 being what the conversion macro is expected to leave behind).
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
2046void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2047 const uint8* vu_buf,
2048 uint8* argb_buf,
2049 int width) {
2050 asm volatile (
2051 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
2052 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
2053 ".p2align 4 \n"
2054 "1: \n"
2055 READNV12
2056 YVUTORGB
2057 "punpcklbw %%xmm1,%%xmm0 \n" // byte pairs: xmm0, xmm1
2058 "punpcklbw %%xmm5,%%xmm2 \n" // byte pairs: xmm2, 0xff
2059 "movdqa %%xmm0,%%xmm1 \n"
2060 "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0..3
2061 "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4..7
2062 "movdqu %%xmm0,(%[argb_buf]) \n"
2063 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2064 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2065 "sub $0x8,%[width] \n"
2066 "jg 1b \n"
2067 : [y_buf]"+r"(y_buf), // %[y_buf]
2068 [uv_buf]"+r"(vu_buf), // %[uv_buf]
2069 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2070 [width]"+rm"(width) // %[width]
2071 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002072 : "memory", "cc"
2073#if defined(__SSE2__)
2074 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2075#endif
2076 );
2077}
2078
// Converts a row from separate Y, U and V buffers to 4-byte pixels using the
// READYUV422 and YUVTORGB macro fragments, writing each pixel as
// 0xff, xmm2, xmm1, xmm0 channel bytes (xmm0..xmm2 being what the conversion
// macro is expected to leave behind).
// bgra_buf is bound to the operand name argb_buf inside the asm.
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
// The movdqa stores require bgra_buf to be 16-byte aligned.
2079void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2080 const uint8* u_buf,
2081 const uint8* v_buf,
2082 uint8* bgra_buf,
2083 int width) {
2084 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002085 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.come214fe32012-06-04 23:47:11 +00002086 "pcmpeqb %%xmm5,%%xmm5 \n"
2087 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
2088 ".p2align 4 \n"
2089 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002090 READYUV422
2091 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002092 "pcmpeqb %%xmm5,%%xmm5 \n" // reset xmm5 to 0xff: it is overwritten below
2093 "punpcklbw %%xmm0,%%xmm1 \n" // byte pairs: xmm1, xmm0
2094 "punpcklbw %%xmm2,%%xmm5 \n" // byte pairs: 0xff, xmm2
2095 "movdqa %%xmm5,%%xmm0 \n"
2096 "punpcklwd %%xmm1,%%xmm5 \n" // pixels 0..3
2097 "punpckhwd %%xmm1,%%xmm0 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002098 "movdqa %%xmm5,(%[argb_buf]) \n"
2099 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2100 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2101 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002102 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002103 : [y_buf]"+r"(y_buf), // %[y_buf]
2104 [u_buf]"+r"(u_buf), // %[u_buf]
2105 [v_buf]"+r"(v_buf), // %[v_buf]
2106 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2107 [width]"+rm"(width) // %[width]
2108 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002109 : "memory", "cc"
2110#if defined(__SSE2__)
2111 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2112#endif
2113 );
2114}
2115
// Converts a row from separate Y, U and V buffers to 4-byte pixels using the
// READYUV422 and YUVTORGB macro fragments, writing each pixel as
// xmm2, xmm1, xmm0, 0xff channel bytes (xmm0..xmm2 being what the conversion
// macro is expected to leave behind).
// abgr_buf is bound to the operand name argb_buf inside the asm.
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
// The movdqa stores require abgr_buf to be 16-byte aligned.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002116void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002117 const uint8* u_buf,
2118 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002119 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002120 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002121 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002122 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002123 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
2124 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002125 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002126 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002127 READYUV422
2128 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002129 "punpcklbw %%xmm1,%%xmm2 \n" // byte pairs: xmm2, xmm1
2130 "punpcklbw %%xmm5,%%xmm0 \n" // byte pairs: xmm0, 0xff
2131 "movdqa %%xmm2,%%xmm1 \n"
2132 "punpcklwd %%xmm0,%%xmm2 \n" // pixels 0..3
2133 "punpckhwd %%xmm0,%%xmm1 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002134 "movdqa %%xmm2,(%[argb_buf]) \n"
2135 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
2136 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2137 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002138 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002139 : [y_buf]"+r"(y_buf), // %[y_buf]
2140 [u_buf]"+r"(u_buf), // %[u_buf]
2141 [v_buf]"+r"(v_buf), // %[v_buf]
2142 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2143 [width]"+rm"(width) // %[width]
2144 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002145 : "memory", "cc"
2146#if defined(__SSE2__)
2147 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2148#endif
2149 );
2150}
2151
// Converts a row from separate Y, U and V buffers to 4-byte pixels using the
// READYUV422 and YUVTORGB macro fragments, writing each pixel as
// 0xff, xmm0, xmm1, xmm2 channel bytes (xmm0..xmm2 being what the conversion
// macro is expected to leave behind).
// rgba_buf is bound to the operand name argb_buf inside the asm.
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
// The movdqa stores require rgba_buf to be 16-byte aligned.
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002152void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2153 const uint8* u_buf,
2154 const uint8* v_buf,
2155 uint8* rgba_buf,
2156 int width) {
2157 asm volatile (
2158 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
2159 "pcmpeqb %%xmm5,%%xmm5 \n"
2160 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
2161 ".p2align 4 \n"
2162 "1: \n"
2163 READYUV422
2164 YUVTORGB
2165 "pcmpeqb %%xmm5,%%xmm5 \n" // reset xmm5 to 0xff: it is overwritten below
2166 "punpcklbw %%xmm2,%%xmm1 \n" // byte pairs: xmm1, xmm2
2167 "punpcklbw %%xmm0,%%xmm5 \n" // byte pairs: 0xff, xmm0
2168 "movdqa %%xmm5,%%xmm0 \n"
2169 "punpcklwd %%xmm1,%%xmm5 \n" // pixels 0..3
2170 "punpckhwd %%xmm1,%%xmm0 \n" // pixels 4..7
2171 "movdqa %%xmm5,(%[argb_buf]) \n"
2172 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2173 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2174 "sub $0x8,%[width] \n"
2175 "jg 1b \n"
2176 : [y_buf]"+r"(y_buf), // %[y_buf]
2177 [u_buf]"+r"(u_buf), // %[u_buf]
2178 [v_buf]"+r"(v_buf), // %[v_buf]
2179 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2180 [width]"+rm"(width) // %[width]
2181 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2182 : "memory", "cc"
2183#if defined(__SSE2__)
2184 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2185#endif
2186 );
2187}
2188
// Variant of the READYUV422 + YUVTORGB row conversion that writes each pixel
// as 0xff, xmm2, xmm1, xmm0 channel bytes (xmm0..xmm2 being what the
// conversion macro is expected to leave behind) and stores with movdqu, so
// bgra_buf does not have to be 16-byte aligned.
// bgra_buf is bound to the operand name argb_buf inside the asm.
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002189void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002190 const uint8* u_buf,
2191 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002192 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002193 int width) {
2194 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002195 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.com952a5072012-03-30 18:10:50 +00002196 "pcmpeqb %%xmm5,%%xmm5 \n"
2197 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002198 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002199 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002200 READYUV422
2201 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002202 "pcmpeqb %%xmm5,%%xmm5 \n" // reset xmm5 to 0xff: it is overwritten below
2203 "punpcklbw %%xmm0,%%xmm1 \n" // byte pairs: xmm1, xmm0
2204 "punpcklbw %%xmm2,%%xmm5 \n" // byte pairs: 0xff, xmm2
2205 "movdqa %%xmm5,%%xmm0 \n"
2206 "punpcklwd %%xmm1,%%xmm5 \n" // pixels 0..3
2207 "punpckhwd %%xmm1,%%xmm0 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002208 "movdqu %%xmm5,(%[argb_buf]) \n"
2209 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
2210 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2211 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002212 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002213 : [y_buf]"+r"(y_buf), // %[y_buf]
2214 [u_buf]"+r"(u_buf), // %[u_buf]
2215 [v_buf]"+r"(v_buf), // %[v_buf]
2216 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2217 [width]"+rm"(width) // %[width]
2218 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002219 : "memory", "cc"
2220#if defined(__SSE2__)
2221 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2222#endif
2223 );
2224}
2225
// Variant of the READYUV422 + YUVTORGB row conversion that writes each pixel
// as xmm2, xmm1, xmm0, 0xff channel bytes (xmm0..xmm2 being what the
// conversion macro is expected to leave behind) and stores with movdqu, so
// abgr_buf does not have to be 16-byte aligned.
// abgr_buf is bound to the operand name argb_buf inside the asm.
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002226void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002227 const uint8* u_buf,
2228 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002229 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002230 int width) {
2231 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002232 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
fbarchard@google.com952a5072012-03-30 18:10:50 +00002233 "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all 0xff bytes (4th byte of each pixel)
2234 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002235 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002236 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002237 READYUV422
2238 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002239 "punpcklbw %%xmm1,%%xmm2 \n" // byte pairs: xmm2, xmm1
2240 "punpcklbw %%xmm5,%%xmm0 \n" // byte pairs: xmm0, 0xff
2241 "movdqa %%xmm2,%%xmm1 \n"
2242 "punpcklwd %%xmm0,%%xmm2 \n" // pixels 0..3
2243 "punpckhwd %%xmm0,%%xmm1 \n" // pixels 4..7
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002244 "movdqu %%xmm2,(%[argb_buf]) \n"
2245 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2246 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2247 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002248 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002249 : [y_buf]"+r"(y_buf), // %[y_buf]
2250 [u_buf]"+r"(u_buf), // %[u_buf]
2251 [v_buf]"+r"(v_buf), // %[v_buf]
2252 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2253 [width]"+rm"(width) // %[width]
2254 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002255 : "memory", "cc"
2256#if defined(__SSE2__)
2257 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2258#endif
2259 );
2260}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002261
// Variant of the READYUV422 + YUVTORGB row conversion that writes each pixel
// as 0xff, xmm0, xmm1, xmm2 channel bytes (xmm0..xmm2 being what the
// conversion macro is expected to leave behind).
// rgba_buf is bound to the operand name argb_buf inside the asm.
// The stores use movdqu so rgba_buf does not have to be 16-byte aligned;
// movdqa would fault on a misaligned destination, which is exactly the case
// this "Unaligned" entry point exists to serve.
// Every pass stores 8 pixels (32 bytes); the loop body runs at least once and
// repeats while width, reduced by 8 per pass, is still positive.
2262void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2263 const uint8* u_buf,
2264 const uint8* v_buf,
2265 uint8* rgba_buf,
2266 int width) {
2267 asm volatile (
2268 "sub %[u_buf],%[v_buf] \n" // v_buf = v_buf - u_buf (offset form)
2269 "pcmpeqb %%xmm5,%%xmm5 \n"
2270 "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0
2271 ".p2align 4 \n"
2272 "1: \n"
2273 READYUV422
2274 YUVTORGB
2275 "pcmpeqb %%xmm5,%%xmm5 \n" // reset xmm5 to 0xff: it is overwritten below
2276 "punpcklbw %%xmm2,%%xmm1 \n" // byte pairs: xmm1, xmm2
2277 "punpcklbw %%xmm0,%%xmm5 \n" // byte pairs: 0xff, xmm0
2278 "movdqa %%xmm5,%%xmm0 \n"
2279 "punpcklwd %%xmm1,%%xmm5 \n" // pixels 0..3
2280 "punpckhwd %%xmm1,%%xmm0 \n" // pixels 4..7
2281 "movdqu %%xmm5,(%[argb_buf]) \n" // unaligned store (was movdqa)
2282 "movdqu %%xmm0,0x10(%[argb_buf]) \n" // unaligned store (was movdqa)
2283 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2284 "sub $0x8,%[width] \n"
2285 "jg 1b \n"
2286 : [y_buf]"+r"(y_buf), // %[y_buf]
2287 [u_buf]"+r"(u_buf), // %[u_buf]
2288 [v_buf]"+r"(v_buf), // %[v_buf]
2289 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2290 [width]"+rm"(width) // %[width]
2291 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2292 : "memory", "cc"
2293#if defined(__SSE2__)
2294 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2295#endif
2296 );
2297}
2298
fbarchard@google.come214fe32012-06-04 23:47:11 +00002299#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002300
2301#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of Y bytes into 4-byte pixels whose first three bytes all
// carry the scaled Y value and whose 4th byte is 0xff.
// Every pass reads 8 Y bytes and stores 32 bytes; the loop body runs at least
// once and repeats while width, reduced by 8 per pass, is still positive.
// The movdqa stores require rgb_buf to be 16-byte aligned. eax is used as a
// scratch register to build the constants and is listed as clobbered.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002302void YToARGBRow_SSE2(const uint8* y_buf,
2303 uint8* rgb_buf,
2304 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002305 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002306 "pcmpeqb %%xmm4,%%xmm4 \n"
2307 "pslld $0x18,%%xmm4 \n" // xmm4 = 0xff000000 in every dword
2308 "mov $0x10001000,%%eax \n"
2309 "movd %%eax,%%xmm3 \n"
2310 "pshufd $0x0,%%xmm3,%%xmm3 \n" // xmm3 = 0x1000 in every word (16 << 8)
2311 "mov $0x012a012a,%%eax \n"
2312 "movd %%eax,%%xmm2 \n"
2313 "pshufd $0x0,%%xmm2,%%xmm2 \n" // xmm2 = 0x012a (298) in every word
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002314 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002315 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002316 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002317 "movq (%0),%%xmm0 \n"
2318 "lea 0x8(%0),%0 \n"
2319 "punpcklbw %%xmm0,%%xmm0 \n" // each word = y in both bytes
2320 "psubusw %%xmm3,%%xmm0 \n" // saturating subtract of 0x1000
2321 "pmulhuw %%xmm2,%%xmm0 \n" // high 16 bits of word * 0x012a
2322 "packuswb %%xmm0,%%xmm0 \n" // saturate words to bytes
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002323
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002324 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002325 "punpcklbw %%xmm0,%%xmm0 \n"
2326 "movdqa %%xmm0,%%xmm1 \n"
2327 "punpcklwd %%xmm0,%%xmm0 \n" // pixels 0..3: value replicated 4 times
2328 "punpckhwd %%xmm1,%%xmm1 \n" // pixels 4..7: value replicated 4 times
2329 "por %%xmm4,%%xmm0 \n" // force 4th byte of each pixel to 0xff
2330 "por %%xmm4,%%xmm1 \n"
2331 "movdqa %%xmm0,(%1) \n"
2332 "movdqa %%xmm1,16(%1) \n"
2333 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002334
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002335 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002336 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00002337 : "+r"(y_buf), // %0
2338 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00002339 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002340 :
2341 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002342#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002343 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002344#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002345 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002346}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002347#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002348
fbarchard@google.com42831e02012-01-21 02:54:17 +00002349#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002350// Shuffle table for reversing the bytes.
// Used as a pshufb mask: output byte i takes input byte 15 - i.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002351CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002352 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2353};
2354
// Writes the bytes of src to dst in reverse order, 16 bytes per pass.
// Each pass loads the 16 bytes ending at src + remaining width, reverses them
// with pshufb and kShuffleMirror, and appends them to dst. The loop body runs
// at least once and repeats while the remaining width, reduced by 16 per
// pass, is still positive.
// Both the load and the store use movdqa, so the load address and dst must be
// 16-byte aligned.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002355void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
// Widened to pointer size so it can serve as the index register in (%0,%2).
fbarchard@google.com12d04832011-11-21 23:54:38 +00002356 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002357 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002358 "movdqa %3,%%xmm5 \n" // xmm5 = byte-reversal mask
2359 "lea -0x10(%0),%0 \n" // bias src so (%0,%2) is the last 16 bytes
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002360 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002361 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002362 "movdqa (%0,%2),%%xmm0 \n"
2363 "pshufb %%xmm5,%%xmm0 \n"
2364 "sub $0x10,%2 \n" // sets the flags tested by jg below
2365 "movdqa %%xmm0,(%1) \n" // movdqa/lea leave the flags untouched
2366 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002367 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002368 : "+r"(src), // %0
2369 "+r"(dst), // %1
2370 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002371 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002372 : "memory", "cc"
2373#if defined(__SSE2__)
2374 , "xmm0", "xmm5"
2375#endif
2376 );
2377}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002378#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002379
fbarchard@google.com42831e02012-01-21 02:54:17 +00002380#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of src to dst in reverse order, 16 bytes per pass, using
// only SSE2 instructions (no pshufb).
// Each pass loads the 16 bytes ending at src + remaining width, reverses them
// in three steps (swap the bytes inside each word, reverse the words inside
// each 64-bit half, swap the two halves) and appends them to dst. The loop
// body runs at least once and repeats while the remaining width, reduced by
// 16 per pass, is still positive.
// Loads and stores use movdqu, so no alignment is required.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002381void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
// Widened to pointer size so it can serve as the index register in (%0,%2).
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002382 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002383 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002384 "lea -0x10(%0),%0 \n" // bias src so (%0,%2) is the last 16 bytes
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002385 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002386 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002387 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002388 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002389 "psllw $0x8,%%xmm0 \n"
2390 "psrlw $0x8,%%xmm1 \n"
2391 "por %%xmm1,%%xmm0 \n" // bytes swapped inside every word
2392 "pshuflw $0x1b,%%xmm0,%%xmm0 \n" // reverse the 4 words of the low half
2393 "pshufhw $0x1b,%%xmm0,%%xmm0 \n" // reverse the 4 words of the high half
2394 "pshufd $0x4e,%%xmm0,%%xmm0 \n" // swap the low and high halves
2395 "sub $0x10,%2 \n" // sets the flags tested by jg below
2396 "movdqu %%xmm0,(%1) \n" // movdqu/lea leave the flags untouched
2397 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002398 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002399 : "+r"(src), // %0
2400 "+r"(dst), // %1
2401 "+r"(temp_width) // %2
2402 :
2403 : "memory", "cc"
2404#if defined(__SSE2__)
2405 , "xmm0", "xmm1"
2406#endif
2407 );
2408}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002409#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002410
fbarchard@google.com16a96642012-03-02 22:38:09 +00002411#ifdef HAS_MIRRORROW_UV_SSSE3
2412// Shuffle table for reversing the bytes of UV channels.
// Used as a pshufb mask: the even-indexed input bytes land, reversed, in the
// low 8 output bytes and the odd-indexed ones, reversed, in the high 8.
2413CONST uvec8 kShuffleMirrorUV = {
2414 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2415};
// Splits an interleaved two-channel row into dst_u and dst_v while reversing
// the element order. Each pass loads 16 source bytes walking backwards from
// src + 2 * width - 16, shuffles them with kShuffleMirrorUV, and stores the
// low 8 bytes to dst_u and the high 8 bytes to dst_v. The loop body runs at
// least once and repeats while width, reduced by 8 per pass, is still
// positive.
// The source load uses movdqa, so the load address must be 16-byte aligned.
2416void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2417 int width) {
// Widened to pointer size so it can serve as the index register in the lea.
2418 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002419 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002420 "movdqa %4,%%xmm1 \n" // xmm1 = de-interleave + reverse mask
2421 "lea -16(%0,%3,2),%0 \n" // src = src + 2 * width - 16
2422 "sub %1,%2 \n" // dst_v = dst_v - dst_u (offset form)
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002423 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002424 "1: \n"
2425 "movdqa (%0),%%xmm0 \n"
2426 "lea -16(%0),%0 \n"
2427 "pshufb %%xmm1,%%xmm0 \n"
2428 "sub $8,%3 \n" // sets the flags tested by jg below
2429 "movlpd %%xmm0,(%1) \n" // low 8 bytes -> dst_u
2430 "movhpd %%xmm0,(%1,%2) \n" // high 8 bytes -> dst_u + offset (dst_v)
2431 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002432 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002433 : "+r"(src), // %0
2434 "+r"(dst_u), // %1
2435 "+r"(dst_v), // %2
2436 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002437 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002438 : "memory", "cc"
2439#if defined(__SSE2__)
2440 , "xmm0", "xmm1"
2441#endif
2442 );
2443}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002444#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002445
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002446#ifdef HAS_ARGBMIRRORROW_SSSE3
2447// Shuffle table for reversing the order of the four 4-byte pixels in a
// 16-byte vector. The bytes inside each pixel keep their order
// (12..15, 8..11, 4..7, 0..3). Used as the pshufb control in
// ARGBMirrorRow_SSSE3.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002448CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002449 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2450};
2451
// Reverses the order of the 4-byte pixels in a row; the bytes within each
// pixel are left in place (see kARGBShuffleMirror).
//   src   - input row of width pixels, read from its last 16-byte block
//           backwards.
//   dst   - output row, written front to back.
//   width - number of 4-byte pixels; 4 are processed per iteration and the
//           count is tested after the store, so the loop runs at least once.
//           NOTE(review): presumably a positive multiple of 4 - confirm.
// Both the load and the store use movdqa, so the accessed addresses must be
// 16-byte aligned.
2452void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2453 intptr_t temp_width = static_cast<intptr_t>(width);
2454 asm volatile (
2455 "movdqa %3,%%xmm5 \n"
// Bias src by -16 so that (src + width * 4) addresses the last 4 pixels.
2456 "lea -0x10(%0),%0 \n"
2457 ".p2align 4 \n"
2458 "1: \n"
2459 "movdqa (%0,%2,4),%%xmm0 \n"
2460 "pshufb %%xmm5,%%xmm0 \n"
2461 "sub $0x4,%2 \n"
2462 "movdqa %%xmm0,(%1) \n"
2463 "lea 0x10(%1),%1 \n"
2464 "jg 1b \n"
2465 : "+r"(src), // %0
2466 "+r"(dst), // %1
2467 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002468 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002469 : "memory", "cc"
2470#if defined(__SSE2__)
2471 , "xmm0", "xmm5"
2472#endif
2473 );
2474}
2475#endif // HAS_ARGBMIRRORROW_SSSE3
2476
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002477#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of byte pairs: even source bytes are written to dst_u
// and odd source bytes to dst_v.
//   src_uv - interleaved input; 32 bytes are read per iteration.
//   dst_u  - receives 16 even bytes per iteration.
//   dst_v  - receives 16 odd bytes per iteration; addressed as
//            dst_u + (dst_v - dst_u).
//   pix    - number of pairs; decremented by 16 per iteration and tested at
//            the bottom of the loop, so the loop runs at least once.
//            NOTE(review): presumably a positive multiple of 16 - confirm.
// All loads and stores use movdqa (16-byte aligned accesses);
// SplitUV_Unaligned_SSE2 is the movdqu counterpart.
2478void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002479 asm volatile (
// xmm5 = 0x00ff in every 16-bit word: selects the even byte of each pair.
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002480 "pcmpeqb %%xmm5,%%xmm5 \n"
2481 "psrlw $0x8,%%xmm5 \n"
2482 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002483 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002484 "1: \n"
2485 "movdqa (%0),%%xmm0 \n"
2486 "movdqa 0x10(%0),%%xmm1 \n"
2487 "lea 0x20(%0),%0 \n"
2488 "movdqa %%xmm0,%%xmm2 \n"
2489 "movdqa %%xmm1,%%xmm3 \n"
// Even bytes: mask, then pack the words down to 16 bytes.
2490 "pand %%xmm5,%%xmm0 \n"
2491 "pand %%xmm5,%%xmm1 \n"
2492 "packuswb %%xmm1,%%xmm0 \n"
// Odd bytes: shift them into the low byte of each word, then pack.
2493 "psrlw $0x8,%%xmm2 \n"
2494 "psrlw $0x8,%%xmm3 \n"
2495 "packuswb %%xmm3,%%xmm2 \n"
2496 "movdqa %%xmm0,(%1) \n"
2497 "movdqa %%xmm2,(%1,%2) \n"
2498 "lea 0x10(%1),%1 \n"
2499 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002500 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002501 : "+r"(src_uv), // %0
2502 "+r"(dst_u), // %1
2503 "+r"(dst_v), // %2
2504 "+r"(pix) // %3
2505 :
2506 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002507#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002508 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002509#endif
2510 );
2511}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002512
// Same operation as SplitUV_SSE2, but every memory access uses movdqu, so
// src_uv, dst_u and dst_v need no particular alignment.
// Even source bytes go to dst_u, odd source bytes to dst_v; 16 pairs
// (32 source bytes) are handled per iteration and pix is tested at the bottom
// of the loop, so the loop runs at least once.
// NOTE(review): pix is presumably a positive multiple of 16 - confirm.
2513void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2514 int pix) {
2515 asm volatile (
// xmm5 = 0x00ff in every 16-bit word: selects the even byte of each pair.
2516 "pcmpeqb %%xmm5,%%xmm5 \n"
2517 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u.
2518 "sub %1,%2 \n"
2519 ".p2align 4 \n"
2520 "1: \n"
2521 "movdqu (%0),%%xmm0 \n"
2522 "movdqu 0x10(%0),%%xmm1 \n"
2523 "lea 0x20(%0),%0 \n"
2524 "movdqa %%xmm0,%%xmm2 \n"
2525 "movdqa %%xmm1,%%xmm3 \n"
2526 "pand %%xmm5,%%xmm0 \n"
2527 "pand %%xmm5,%%xmm1 \n"
2528 "packuswb %%xmm1,%%xmm0 \n"
2529 "psrlw $0x8,%%xmm2 \n"
2530 "psrlw $0x8,%%xmm3 \n"
2531 "packuswb %%xmm3,%%xmm2 \n"
2532 "movdqu %%xmm0,(%1) \n"
2533 "movdqu %%xmm2,(%1,%2) \n"
2534 "lea 0x10(%1),%1 \n"
2535 "sub $0x10,%3 \n"
2536 "jg 1b \n"
2537 : "+r"(src_uv), // %0
2538 "+r"(dst_u), // %1
2539 "+r"(dst_v), // %2
2540 "+r"(pix) // %3
2541 :
2542 : "memory", "cc"
2543#if defined(__SSE2__)
2544 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2545#endif
2546 );
2547}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002548#endif // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002549
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002550#ifdef HAS_MERGEUV_SSE2
// Interleaves two byte planes into one row: dst_uv = u0 v0 u1 v1 ...
//   src_u  - first plane; supplies the even output bytes.
//   src_v  - second plane; supplies the odd output bytes. Addressed as
//            src_u + (src_v - src_u) so only one source pointer is advanced.
//   dst_uv - output; 32 bytes are written per iteration.
//   width  - number of bytes per input plane; decremented by 16 per iteration
//            and tested at the bottom of the loop, so the loop runs at least
//            once.
//            NOTE(review): presumably a positive multiple of 16 - confirm.
// All loads and stores use movdqa (16-byte aligned accesses);
// MergeUV_Unaligned_SSE2 is the movdqu counterpart.
2551void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2552 int width) {
2553 asm volatile (
2554 "sub %0,%1 \n"
2555 ".p2align 4 \n"
2556 "1: \n"
2557 "movdqa (%0),%%xmm0 \n"
2558 "movdqa (%0,%1,1),%%xmm1 \n"
2559 "lea 0x10(%0),%0 \n"
2560 "movdqa %%xmm0,%%xmm2 \n"
// Interleave the low 8 and then the high 8 bytes of U with those of V.
2561 "punpcklbw %%xmm1,%%xmm0 \n"
2562 "punpckhbw %%xmm1,%%xmm2 \n"
2563 "movdqa %%xmm0,(%2) \n"
2564 "movdqa %%xmm2,0x10(%2) \n"
2565 "lea 0x20(%2),%2 \n"
2566 "sub $0x10,%3 \n"
2567 "jg 1b \n"
2568 : "+r"(src_u), // %0
2569 "+r"(src_v), // %1
2570 "+r"(dst_uv), // %2
2571 "+r"(width) // %3
2572 :
2573 : "memory", "cc"
2574#if defined(__SSE2__)
2575 , "xmm0", "xmm1", "xmm2"
2576#endif
2577 );
2578}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002579
// Same operation as MergeUV_SSE2, but every memory access uses movdqu, so
// src_u, src_v and dst_uv need no particular alignment.
// Produces dst_uv = u0 v0 u1 v1 ...; 16 bytes of each plane are consumed and
// 32 bytes written per iteration. width is tested at the bottom of the loop,
// so the loop runs at least once.
// NOTE(review): width is presumably a positive multiple of 16 - confirm.
2580void MergeUV_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2581 uint8* dst_uv, int width) {
2582 asm volatile (
// Turn src_v into an offset from src_u.
2583 "sub %0,%1 \n"
2584 ".p2align 4 \n"
2585 "1: \n"
2586 "movdqu (%0),%%xmm0 \n"
2587 "movdqu (%0,%1,1),%%xmm1 \n"
2588 "lea 0x10(%0),%0 \n"
2589 "movdqa %%xmm0,%%xmm2 \n"
2590 "punpcklbw %%xmm1,%%xmm0 \n"
2591 "punpckhbw %%xmm1,%%xmm2 \n"
2592 "movdqu %%xmm0,(%2) \n"
2593 "movdqu %%xmm2,0x10(%2) \n"
2594 "lea 0x20(%2),%2 \n"
2595 "sub $0x10,%3 \n"
2596 "jg 1b \n"
2597 : "+r"(src_u), // %0
2598 "+r"(src_v), // %1
2599 "+r"(dst_uv), // %2
2600 "+r"(width) // %3
2601 :
2602 : "memory", "cc"
2603#if defined(__SSE2__)
2604 , "xmm0", "xmm1", "xmm2"
2605#endif
2606 );
2607}
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002608#endif // HAS_MERGEUV_SSE2
2609
fbarchard@google.com19932f82012-02-16 22:19:14 +00002610#ifdef HAS_COPYROW_SSE2
// Copies a row 32 bytes at a time using aligned 16-byte moves (movdqa).
//   src   - source row.
//   dst   - destination row; addressed as src + (dst - src) so only src is
//           advanced.
//   count - number of bytes; decremented by 32 per iteration and tested at
//           the bottom of the loop, so the loop runs at least once.
//           NOTE(review): presumably a positive multiple of 32 - confirm.
2611void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002612 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002613 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002614 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002615 "1: \n"
2616 "movdqa (%0),%%xmm0 \n"
2617 "movdqa 0x10(%0),%%xmm1 \n"
2618 "movdqa %%xmm0,(%0,%1) \n"
2619 "movdqa %%xmm1,0x10(%0,%1) \n"
2620 "lea 0x20(%0),%0 \n"
2621 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002622 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002623 : "+r"(src), // %0
2624 "+r"(dst), // %1
2625 "+r"(count) // %2
2626 :
2627 : "memory", "cc"
2628#if defined(__SSE2__)
2629 , "xmm0", "xmm1"
2630#endif
2631 );
2632}
2633#endif // HAS_COPYROW_SSE2
2634
2635#ifdef HAS_COPYROW_X86
// Copies a row with "rep movsl", i.e. in 32-bit units.
//   src   - source row (bound to the S register by the "+S" constraint).
//   dst   - destination row (bound to the D register by "+D").
//   width - number of bytes. It is shifted right by 2 to get the dword count
//           (bound to the c register by "+c"), so any remainder of
//           width % 4 bytes is not copied.
2636void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2637 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002638 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002639 "shr $0x2,%2 \n"
2640 "rep movsl \n"
2641 : "+S"(src), // %0
2642 "+D"(dst), // %1
2643 "+c"(width_tmp) // %2
2644 :
2645 : "memory", "cc"
2646 );
2647}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002648#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002649
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002650#ifdef HAS_SETROW_X86
// Fills a row with a 32-bit pattern using "rep stosl".
//   dst   - destination row (bound to the D register by "+D").
//   v32   - 32-bit value stored repeatedly (bound to the a register).
//           NOTE(review): the name suggests v32 holds one byte value
//           replicated four times - confirm against callers.
//   width - number of bytes. It is shifted right by 2 to get the dword count,
//           so any remainder of width % 4 bytes is not written.
2651void SetRow8_X86(uint8* dst, uint32 v32, int width) {
2652 size_t width_tmp = static_cast<size_t>(width);
2653 asm volatile (
2654 "shr $0x2,%1 \n"
2655 "rep stosl \n"
2656 : "+D"(dst), // %0
2657 "+c"(width_tmp) // %1
2658 : "a"(v32) // %2
2659 : "memory", "cc");
2660}
2661
// Fills a rectangle with a 32-bit value, one "rep stosl" per row.
//   dst        - address of the first row.
//   v32        - 32-bit value stored into every element.
//   width      - number of 32-bit elements written per row (used directly as
//                the repeat count, unlike SetRow8_X86 which divides by 4).
//   dst_stride - distance in bytes between the starts of consecutive rows.
//   height     - number of rows; nothing is written when height <= 0.
2662void SetRows32_X86(uint8* dst, uint32 v32, int width,
2663 int dst_stride, int height) {
2664 for (int y = 0; y < height; ++y) {
// rep stosl modifies both the count and the destination operand, so fresh
// copies are set up for every row.
2665 size_t width_tmp = static_cast<size_t>(width);
2666 uint32* d = reinterpret_cast<uint32*>(dst);
2667 asm volatile (
2668 "rep stosl \n"
2669 : "+D"(d), // %0
2670 "+c"(width_tmp) // %1
2671 : "a"(v32) // %2
2672 : "memory", "cc");
2673 dst += dst_stride;
2674 }
2675}
2676#endif // HAS_SETROW_X86
2677
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002678#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the even-indexed bytes of a row (the Y samples of YUY2, going by
// the function name) into dst_y.
//   src_yuy2 - packed input; 32 bytes are read per iteration.
//   dst_y    - output; 16 bytes are written per iteration.
//   pix      - number of output bytes; decremented by 16 per iteration and
//              tested at the bottom of the loop, so the loop runs at least
//              once.
//              NOTE(review): presumably a positive multiple of 16 - confirm.
// Loads and the store use movdqa (16-byte aligned accesses);
// YUY2ToYRow_Unaligned_SSE2 is the movdqu counterpart.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002679void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002680 asm volatile (
// xmm5 = 0x00ff in every 16-bit word: keeps the even byte of each pair.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002681 "pcmpeqb %%xmm5,%%xmm5 \n"
2682 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002683 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002684 "1: \n"
2685 "movdqa (%0),%%xmm0 \n"
2686 "movdqa 0x10(%0),%%xmm1 \n"
2687 "lea 0x20(%0),%0 \n"
2688 "pand %%xmm5,%%xmm0 \n"
2689 "pand %%xmm5,%%xmm1 \n"
2690 "packuswb %%xmm1,%%xmm0 \n"
2691 "movdqa %%xmm0,(%1) \n"
2692 "lea 0x10(%1),%1 \n"
2693 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002694 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002695 : "+r"(src_yuy2), // %0
2696 "+r"(dst_y), // %1
2697 "+r"(pix) // %2
2698 :
2699 : "memory", "cc"
2700#if defined(__SSE2__)
2701 , "xmm0", "xmm1", "xmm5"
2702#endif
2703 );
2704}
2705
// Produces one row each of U and V from two rows of packed input.
// Corresponding bytes of the row at src_yuy2 and the row at
// src_yuy2 + stride_yuy2 are averaged with pavgb. The odd-indexed bytes of the
// result (the chroma bytes) are kept and then split alternately: first, third,
// ... chroma byte to dst_u; second, fourth, ... to dst_v.
//   src_yuy2    - first input row; 32 bytes are read per row per iteration.
//   stride_yuy2 - byte offset from the first input row to the second.
//   dst_u/dst_v - outputs; 8 bytes each are written per iteration with movq.
//                 dst_v is addressed as dst_u + (dst_v - dst_u).
//   pix         - decremented by 16 per iteration and tested at the bottom of
//                 the loop, so the loop runs at least once.
//                 NOTE(review): presumably a positive multiple of 16 -
//                 confirm.
// Source loads use movdqa (16-byte aligned accesses) for both rows.
2706void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002707 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002708 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002709 "pcmpeqb %%xmm5,%%xmm5 \n"
2710 "psrlw $0x8,%%xmm5 \n"
2711 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002712 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002713 "1: \n"
2714 "movdqa (%0),%%xmm0 \n"
2715 "movdqa 0x10(%0),%%xmm1 \n"
2716 "movdqa (%0,%4,1),%%xmm2 \n"
2717 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2718 "lea 0x20(%0),%0 \n"
2719 "pavgb %%xmm2,%%xmm0 \n"
2720 "pavgb %%xmm3,%%xmm1 \n"
// Keep the odd bytes (chroma) and pack them into 16 bytes.
2721 "psrlw $0x8,%%xmm0 \n"
2722 "psrlw $0x8,%%xmm1 \n"
2723 "packuswb %%xmm1,%%xmm0 \n"
// Split the packed chroma: even bytes -> xmm0 (U), odd bytes -> xmm1 (V).
2724 "movdqa %%xmm0,%%xmm1 \n"
2725 "pand %%xmm5,%%xmm0 \n"
2726 "packuswb %%xmm0,%%xmm0 \n"
2727 "psrlw $0x8,%%xmm1 \n"
2728 "packuswb %%xmm1,%%xmm1 \n"
2729 "movq %%xmm0,(%1) \n"
2730 "movq %%xmm1,(%1,%2) \n"
2731 "lea 0x8(%1),%1 \n"
2732 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002733 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002734 : "+r"(src_yuy2), // %0
2735 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002736 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002737 "+r"(pix) // %3
2738 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2739 : "memory", "cc"
2740#if defined(__SSE2__)
2741 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2742#endif
2743 );
2744}
2745
// Single-row variant of YUY2ToUVRow_SSE2: no second row is read and no
// averaging is done. The odd-indexed bytes of the row (the chroma bytes) are
// split alternately into dst_u and dst_v.
//   src_yuy2    - packed input; 32 bytes are read per iteration with movdqa
//                 (16-byte aligned loads).
//   dst_u/dst_v - outputs; 8 bytes each are written per iteration with movq.
//                 dst_v is addressed as dst_u + (dst_v - dst_u).
//   pix         - decremented by 16 per iteration and tested at the bottom of
//                 the loop, so the loop runs at least once.
//                 NOTE(review): presumably a positive multiple of 16 -
//                 confirm.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002746void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2747 uint8* dst_u, uint8* dst_v, int pix) {
2748 asm volatile (
2749 "pcmpeqb %%xmm5,%%xmm5 \n"
2750 "psrlw $0x8,%%xmm5 \n"
2751 "sub %1,%2 \n"
2752 ".p2align 4 \n"
2753 "1: \n"
2754 "movdqa (%0),%%xmm0 \n"
2755 "movdqa 0x10(%0),%%xmm1 \n"
2756 "lea 0x20(%0),%0 \n"
// Keep the odd bytes (chroma) and pack them into 16 bytes.
2757 "psrlw $0x8,%%xmm0 \n"
2758 "psrlw $0x8,%%xmm1 \n"
2759 "packuswb %%xmm1,%%xmm0 \n"
// Split the packed chroma: even bytes -> xmm0 (U), odd bytes -> xmm1 (V).
2760 "movdqa %%xmm0,%%xmm1 \n"
2761 "pand %%xmm5,%%xmm0 \n"
2762 "packuswb %%xmm0,%%xmm0 \n"
2763 "psrlw $0x8,%%xmm1 \n"
2764 "packuswb %%xmm1,%%xmm1 \n"
2765 "movq %%xmm0,(%1) \n"
2766 "movq %%xmm1,(%1,%2) \n"
2767 "lea 0x8(%1),%1 \n"
2768 "sub $0x10,%3 \n"
2769 "jg 1b \n"
2770 : "+r"(src_yuy2), // %0
2771 "+r"(dst_u), // %1
2772 "+r"(dst_v), // %2
2773 "+r"(pix) // %3
2774 :
2775 : "memory", "cc"
2776#if defined(__SSE2__)
2777 , "xmm0", "xmm1", "xmm5"
2778#endif
2779 );
2780}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002781
// Same operation as YUY2ToYRow_SSE2 (keep the even-indexed bytes of the row),
// but the loads and the store use movdqu, so src_yuy2 and dst_y need no
// particular alignment.
// 32 source bytes are read and 16 bytes written per iteration; pix is
// decremented by 16 and tested at the bottom of the loop, so the loop runs at
// least once.
// NOTE(review): pix is presumably a positive multiple of 16 - confirm.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002782void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2783 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002784 asm volatile (
// xmm5 = 0x00ff in every 16-bit word: keeps the even byte of each pair.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002785 "pcmpeqb %%xmm5,%%xmm5 \n"
2786 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002787 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002788 "1: \n"
2789 "movdqu (%0),%%xmm0 \n"
2790 "movdqu 0x10(%0),%%xmm1 \n"
2791 "lea 0x20(%0),%0 \n"
2792 "pand %%xmm5,%%xmm0 \n"
2793 "pand %%xmm5,%%xmm1 \n"
2794 "packuswb %%xmm1,%%xmm0 \n"
// The flags set by this sub are the ones tested by jg below.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002795 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002796 "movdqu %%xmm0,(%1) \n"
2797 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002798 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002799 : "+r"(src_yuy2), // %0
2800 "+r"(dst_y), // %1
2801 "+r"(pix) // %2
2802 :
2803 : "memory", "cc"
2804#if defined(__SSE2__)
2805 , "xmm0", "xmm1", "xmm5"
2806#endif
2807 );
2808}
2809
// Same operation as YUY2ToUVRow_SSE2, but the source rows are read with
// movdqu, so src_yuy2 and stride_yuy2 need no particular alignment.
// Corresponding bytes of the row at src_yuy2 and the row at
// src_yuy2 + stride_yuy2 are averaged with pavgb. The odd-indexed bytes
// (chroma) are kept and split alternately into dst_u and dst_v, 8 bytes each
// per iteration.
// pix is decremented by 16 per iteration and tested at the bottom of the
// loop, so the loop runs at least once.
// NOTE(review): pix is presumably a positive multiple of 16 - confirm.
2810void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2811 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002812 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002813 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002814 "pcmpeqb %%xmm5,%%xmm5 \n"
2815 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u.
2816 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002817 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002818 "1: \n"
2819 "movdqu (%0),%%xmm0 \n"
2820 "movdqu 0x10(%0),%%xmm1 \n"
2821 "movdqu (%0,%4,1),%%xmm2 \n"
2822 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2823 "lea 0x20(%0),%0 \n"
2824 "pavgb %%xmm2,%%xmm0 \n"
2825 "pavgb %%xmm3,%%xmm1 \n"
2826 "psrlw $0x8,%%xmm0 \n"
2827 "psrlw $0x8,%%xmm1 \n"
2828 "packuswb %%xmm1,%%xmm0 \n"
2829 "movdqa %%xmm0,%%xmm1 \n"
2830 "pand %%xmm5,%%xmm0 \n"
2831 "packuswb %%xmm0,%%xmm0 \n"
2832 "psrlw $0x8,%%xmm1 \n"
2833 "packuswb %%xmm1,%%xmm1 \n"
2834 "movq %%xmm0,(%1) \n"
2835 "movq %%xmm1,(%1,%2) \n"
2836 "lea 0x8(%1),%1 \n"
2837 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002838 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002839 : "+r"(src_yuy2), // %0
2840 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002841 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002842 "+r"(pix) // %3
2843 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2844 : "memory", "cc"
2845#if defined(__SSE2__)
2846 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2847#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002848 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002849}
2850
// Same operation as YUY2ToUV422Row_SSE2, but the source row is read with
// movdqu, so src_yuy2 needs no particular alignment.
// The odd-indexed bytes of the row (chroma) are kept and split alternately
// into dst_u and dst_v, 8 bytes each per iteration; no second row is read.
// pix is decremented by 16 per iteration and tested at the bottom of the
// loop, so the loop runs at least once.
// NOTE(review): pix is presumably a positive multiple of 16 - confirm.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002851void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2852 uint8* dst_u, uint8* dst_v, int pix) {
2853 asm volatile (
2854 "pcmpeqb %%xmm5,%%xmm5 \n"
2855 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u.
2856 "sub %1,%2 \n"
2857 ".p2align 4 \n"
2858 "1: \n"
2859 "movdqu (%0),%%xmm0 \n"
2860 "movdqu 0x10(%0),%%xmm1 \n"
2861 "lea 0x20(%0),%0 \n"
2862 "psrlw $0x8,%%xmm0 \n"
2863 "psrlw $0x8,%%xmm1 \n"
2864 "packuswb %%xmm1,%%xmm0 \n"
2865 "movdqa %%xmm0,%%xmm1 \n"
2866 "pand %%xmm5,%%xmm0 \n"
2867 "packuswb %%xmm0,%%xmm0 \n"
2868 "psrlw $0x8,%%xmm1 \n"
2869 "packuswb %%xmm1,%%xmm1 \n"
2870 "movq %%xmm0,(%1) \n"
2871 "movq %%xmm1,(%1,%2) \n"
2872 "lea 0x8(%1),%1 \n"
2873 "sub $0x10,%3 \n"
2874 "jg 1b \n"
2875 : "+r"(src_yuy2), // %0
2876 "+r"(dst_u), // %1
2877 "+r"(dst_v), // %2
2878 "+r"(pix) // %3
2879 :
2880 : "memory", "cc"
2881#if defined(__SSE2__)
2882 , "xmm0", "xmm1", "xmm5"
2883#endif
2884 );
2885}
2886
// Extracts the odd-indexed bytes of a row (the Y samples of UYVY, going by
// the function name) into dst_y.
//   src_uyvy - packed input; 32 bytes are read per iteration.
//   dst_y    - output; 16 bytes are written per iteration.
//   pix      - number of output bytes; decremented by 16 per iteration and
//              tested at the bottom of the loop, so the loop runs at least
//              once.
//              NOTE(review): presumably a positive multiple of 16 - confirm.
// Loads and the store use movdqa (16-byte aligned accesses);
// UYVYToYRow_Unaligned_SSE2 is the movdqu counterpart.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002887void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002888 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002889 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002890 "1: \n"
2891 "movdqa (%0),%%xmm0 \n"
2892 "movdqa 0x10(%0),%%xmm1 \n"
2893 "lea 0x20(%0),%0 \n"
// Shift the odd byte of each pair into the low byte of its word, then pack.
2894 "psrlw $0x8,%%xmm0 \n"
2895 "psrlw $0x8,%%xmm1 \n"
2896 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002897 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002898 "movdqa %%xmm0,(%1) \n"
2899 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002900 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002901 : "+r"(src_uyvy), // %0
2902 "+r"(dst_y), // %1
2903 "+r"(pix) // %2
2904 :
2905 : "memory", "cc"
2906#if defined(__SSE2__)
2907 , "xmm0", "xmm1"
2908#endif
2909 );
2910}
2911
// Produces one row each of U and V from two rows of packed input.
// Corresponding bytes of the row at src_uyvy and the row at
// src_uyvy + stride_uyvy are averaged with pavgb. The even-indexed bytes of
// the result (the chroma bytes) are kept and then split alternately: first,
// third, ... chroma byte to dst_u; second, fourth, ... to dst_v.
//   src_uyvy    - first input row; 32 bytes are read per row per iteration.
//   stride_uyvy - byte offset from the first input row to the second.
//   dst_u/dst_v - outputs; 8 bytes each are written per iteration with movq.
//                 dst_v is addressed as dst_u + (dst_v - dst_u).
//   pix         - decremented by 16 per iteration and tested at the bottom of
//                 the loop, so the loop runs at least once.
//                 NOTE(review): presumably a positive multiple of 16 -
//                 confirm.
// Source loads use movdqa (16-byte aligned accesses) for both rows.
2912void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002913 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002914 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002915 "pcmpeqb %%xmm5,%%xmm5 \n"
2916 "psrlw $0x8,%%xmm5 \n"
2917 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002918 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002919 "1: \n"
2920 "movdqa (%0),%%xmm0 \n"
2921 "movdqa 0x10(%0),%%xmm1 \n"
2922 "movdqa (%0,%4,1),%%xmm2 \n"
2923 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2924 "lea 0x20(%0),%0 \n"
2925 "pavgb %%xmm2,%%xmm0 \n"
2926 "pavgb %%xmm3,%%xmm1 \n"
// Keep the even bytes (chroma) and pack them into 16 bytes.
2927 "pand %%xmm5,%%xmm0 \n"
2928 "pand %%xmm5,%%xmm1 \n"
2929 "packuswb %%xmm1,%%xmm0 \n"
// Split the packed chroma: even bytes -> xmm0 (U), odd bytes -> xmm1 (V).
2930 "movdqa %%xmm0,%%xmm1 \n"
2931 "pand %%xmm5,%%xmm0 \n"
2932 "packuswb %%xmm0,%%xmm0 \n"
2933 "psrlw $0x8,%%xmm1 \n"
2934 "packuswb %%xmm1,%%xmm1 \n"
2935 "movq %%xmm0,(%1) \n"
2936 "movq %%xmm1,(%1,%2) \n"
2937 "lea 0x8(%1),%1 \n"
2938 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002939 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002940 : "+r"(src_uyvy), // %0
2941 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002942 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002943 "+r"(pix) // %3
2944 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2945 : "memory", "cc"
2946#if defined(__SSE2__)
2947 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2948#endif
2949 );
2950}
2951
// Single-row variant of UYVYToUVRow_SSE2: no second row is read and no
// averaging is done. The even-indexed bytes of the row (the chroma bytes) are
// split alternately into dst_u and dst_v.
//   src_uyvy    - packed input; 32 bytes are read per iteration with movdqa
//                 (16-byte aligned loads).
//   dst_u/dst_v - outputs; 8 bytes each are written per iteration with movq.
//                 dst_v is addressed as dst_u + (dst_v - dst_u).
//   pix         - decremented by 16 per iteration and tested at the bottom of
//                 the loop, so the loop runs at least once.
//                 NOTE(review): presumably a positive multiple of 16 -
//                 confirm.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002952void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2953 uint8* dst_u, uint8* dst_v, int pix) {
2954 asm volatile (
2955 "pcmpeqb %%xmm5,%%xmm5 \n"
2956 "psrlw $0x8,%%xmm5 \n"
2957 "sub %1,%2 \n"
2958 ".p2align 4 \n"
2959 "1: \n"
2960 "movdqa (%0),%%xmm0 \n"
2961 "movdqa 0x10(%0),%%xmm1 \n"
2962 "lea 0x20(%0),%0 \n"
// Keep the even bytes (chroma) and pack them into 16 bytes.
2963 "pand %%xmm5,%%xmm0 \n"
2964 "pand %%xmm5,%%xmm1 \n"
2965 "packuswb %%xmm1,%%xmm0 \n"
// Split the packed chroma: even bytes -> xmm0 (U), odd bytes -> xmm1 (V).
2966 "movdqa %%xmm0,%%xmm1 \n"
2967 "pand %%xmm5,%%xmm0 \n"
2968 "packuswb %%xmm0,%%xmm0 \n"
2969 "psrlw $0x8,%%xmm1 \n"
2970 "packuswb %%xmm1,%%xmm1 \n"
2971 "movq %%xmm0,(%1) \n"
2972 "movq %%xmm1,(%1,%2) \n"
2973 "lea 0x8(%1),%1 \n"
2974 "sub $0x10,%3 \n"
2975 "jg 1b \n"
2976 : "+r"(src_uyvy), // %0
2977 "+r"(dst_u), // %1
2978 "+r"(dst_v), // %2
2979 "+r"(pix) // %3
2980 :
2981 : "memory", "cc"
2982#if defined(__SSE2__)
2983 , "xmm0", "xmm1", "xmm5"
2984#endif
2985 );
2986}
2987
// Same operation as UYVYToYRow_SSE2 (keep the odd-indexed bytes of the row),
// but the loads and the store use movdqu, so src_uyvy and dst_y need no
// particular alignment.
// 32 source bytes are read and 16 bytes written per iteration; pix is
// decremented by 16 and tested at the bottom of the loop, so the loop runs at
// least once.
// NOTE(review): pix is presumably a positive multiple of 16 - confirm.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002988void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2989 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002990 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002991 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002992 "1: \n"
2993 "movdqu (%0),%%xmm0 \n"
2994 "movdqu 0x10(%0),%%xmm1 \n"
2995 "lea 0x20(%0),%0 \n"
// Shift the odd byte of each pair into the low byte of its word, then pack.
2996 "psrlw $0x8,%%xmm0 \n"
2997 "psrlw $0x8,%%xmm1 \n"
2998 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002999 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003000 "movdqu %%xmm0,(%1) \n"
3001 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003002 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003003 : "+r"(src_uyvy), // %0
3004 "+r"(dst_y), // %1
3005 "+r"(pix) // %2
3006 :
3007 : "memory", "cc"
3008#if defined(__SSE2__)
3009 , "xmm0", "xmm1"
3010#endif
3011 );
3012}
3013
// Same operation as UYVYToUVRow_SSE2, but the source rows are read with
// movdqu, so src_uyvy and stride_uyvy need no particular alignment.
// Corresponding bytes of the row at src_uyvy and the row at
// src_uyvy + stride_uyvy are averaged with pavgb. The even-indexed bytes
// (chroma) are kept and split alternately into dst_u and dst_v, 8 bytes each
// per iteration.
// pix is decremented by 16 per iteration and tested at the bottom of the
// loop, so the loop runs at least once.
// NOTE(review): pix is presumably a positive multiple of 16 - confirm.
3014void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003015 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003016 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003017 "pcmpeqb %%xmm5,%%xmm5 \n"
3018 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u.
3019 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003020 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003021 "1: \n"
3022 "movdqu (%0),%%xmm0 \n"
3023 "movdqu 0x10(%0),%%xmm1 \n"
3024 "movdqu (%0,%4,1),%%xmm2 \n"
3025 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3026 "lea 0x20(%0),%0 \n"
3027 "pavgb %%xmm2,%%xmm0 \n"
3028 "pavgb %%xmm3,%%xmm1 \n"
3029 "pand %%xmm5,%%xmm0 \n"
3030 "pand %%xmm5,%%xmm1 \n"
3031 "packuswb %%xmm1,%%xmm0 \n"
3032 "movdqa %%xmm0,%%xmm1 \n"
3033 "pand %%xmm5,%%xmm0 \n"
3034 "packuswb %%xmm0,%%xmm0 \n"
3035 "psrlw $0x8,%%xmm1 \n"
3036 "packuswb %%xmm1,%%xmm1 \n"
3037 "movq %%xmm0,(%1) \n"
3038 "movq %%xmm1,(%1,%2) \n"
3039 "lea 0x8(%1),%1 \n"
3040 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003041 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003042 : "+r"(src_uyvy), // %0
3043 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003044 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003045 "+r"(pix) // %3
3046 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3047 : "memory", "cc"
3048#if defined(__SSE2__)
3049 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3050#endif
3051 );
3052}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003053
// Same operation as UYVYToUV422Row_SSE2, but the source row is read with
// movdqu, so src_uyvy needs no particular alignment.
// The even-indexed bytes of the row (chroma) are kept and split alternately
// into dst_u and dst_v, 8 bytes each per iteration; no second row is read.
// pix is decremented by 16 per iteration and tested at the bottom of the
// loop, so the loop runs at least once.
// NOTE(review): pix is presumably a positive multiple of 16 - confirm.
3054void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3055 uint8* dst_u, uint8* dst_v, int pix) {
3056 asm volatile (
3057 "pcmpeqb %%xmm5,%%xmm5 \n"
3058 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u.
3059 "sub %1,%2 \n"
3060 ".p2align 4 \n"
3061 "1: \n"
3062 "movdqu (%0),%%xmm0 \n"
3063 "movdqu 0x10(%0),%%xmm1 \n"
3064 "lea 0x20(%0),%0 \n"
3065 "pand %%xmm5,%%xmm0 \n"
3066 "pand %%xmm5,%%xmm1 \n"
3067 "packuswb %%xmm1,%%xmm0 \n"
3068 "movdqa %%xmm0,%%xmm1 \n"
3069 "pand %%xmm5,%%xmm0 \n"
3070 "packuswb %%xmm0,%%xmm0 \n"
3071 "psrlw $0x8,%%xmm1 \n"
3072 "packuswb %%xmm1,%%xmm1 \n"
3073 "movq %%xmm0,(%1) \n"
3074 "movq %%xmm1,(%1,%2) \n"
3075 "lea 0x8(%1),%1 \n"
3076 "sub $0x10,%3 \n"
3077 "jg 1b \n"
3078 : "+r"(src_uyvy), // %0
3079 "+r"(dst_u), // %1
3080 "+r"(dst_v), // %2
3081 "+r"(pix) // %3
3082 :
3083 : "memory", "cc"
3084#if defined(__SSE2__)
3085 , "xmm0", "xmm1", "xmm5"
3086#endif
3087 );
3088}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003089#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003090
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003091#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003092// Blend 4 pixels at a time.
// Blends a row of 4-byte pixels from src_argb0 over src_argb1 into dst_argb.
// With a0 = byte 3 of the src_argb0 pixel, each of the other three bytes is
//   dst = saturate(src0 + ((src1 * (256 - a0)) >> 8))
// and byte 3 of the result is always 0xff (src0's byte 3 is forced to 0xff
// before the saturating adds).
//   src_argb0 - foreground row; read with movd / movdqu (no alignment needed).
//   src_argb1 - background row; read with movd / movdqu (no alignment needed).
//   dst_argb  - output row; the 4 pixel loop stores with movdqa, which is why
//               single pixels are processed first until dst_argb is 16-byte
//               aligned.
//   width     - number of pixels; nothing is written when width <= 0.
// The counter in %3 is kept one below the number of pixels still to do while
// the 1 pixel loops run, and four below it during the 4 pixel loop, so
// "jge" means "at least one more unit of work remains".
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003093void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3094 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003095 asm volatile (
// Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
// xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (byte 3 mask).
3096 "pcmpeqb %%xmm7,%%xmm7 \n"
3097 "psrlw $0xf,%%xmm7 \n"
3098 "pcmpeqb %%xmm6,%%xmm6 \n"
3099 "psrlw $0x8,%%xmm6 \n"
3100 "pcmpeqb %%xmm5,%%xmm5 \n"
3101 "psllw $0x8,%%xmm5 \n"
3102 "pcmpeqb %%xmm4,%%xmm4 \n"
3103 "pslld $0x18,%%xmm4 \n"
// width == 1 goes straight to the final 1 pixel loop; width < 1 exits.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003104 "sub $0x1,%3 \n"
3105 "je 91f \n"
3106 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003107
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003108 // 1 pixel loop until destination pointer is aligned.
3109 "10: \n"
3110 "test $0xf,%2 \n"
3111 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003112 "movd (%0),%%xmm3 \n"
3113 "lea 0x4(%0),%0 \n"
3114 "movdqa %%xmm3,%%xmm0 \n"
// Invert byte 3 (255 - a0), then replicate it into both words of the pixel.
3115 "pxor %%xmm4,%%xmm3 \n"
3116 "movd (%1),%%xmm2 \n"
3117 "psrlw $0x8,%%xmm3 \n"
3118 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3119 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
// xmm2 = even bytes of src1 as words; xmm3 = 256 - a0; multiply.
3120 "pand %%xmm6,%%xmm2 \n"
3121 "paddw %%xmm7,%%xmm3 \n"
3122 "pmullw %%xmm3,%%xmm2 \n"
// xmm1 = odd bytes of src1 as words, scaled by the same factor.
3123 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003124 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003125 "psrlw $0x8,%%xmm1 \n"
3126 "por %%xmm4,%%xmm0 \n"
3127 "pmullw %%xmm3,%%xmm1 \n"
3128 "psrlw $0x8,%%xmm2 \n"
3129 "paddusb %%xmm2,%%xmm0 \n"
3130 "pand %%xmm5,%%xmm1 \n"
3131 "paddusb %%xmm1,%%xmm0 \n"
3132 "sub $0x1,%3 \n"
3133 "movd %%xmm0,(%2) \n"
3134 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003135 "jge 10b \n"
3136
3137 "19: \n"
// Rebias the counter from (remaining - 1) to (remaining - 4).
fbarchard@google.comee220882012-06-14 00:07:56 +00003138 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003139 "jl 49f \n"
3140
fbarchard@google.com794fe122012-06-15 01:05:01 +00003141 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003142 ".p2align 2 \n"
3143 "41: \n"
3144 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003145 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003146 "movdqa %%xmm3,%%xmm0 \n"
3147 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003148 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003149 "psrlw $0x8,%%xmm3 \n"
3150 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3151 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003152 "pand %%xmm6,%%xmm2 \n"
3153 "paddw %%xmm7,%%xmm3 \n"
3154 "pmullw %%xmm3,%%xmm2 \n"
3155 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003156 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003157 "psrlw $0x8,%%xmm1 \n"
3158 "por %%xmm4,%%xmm0 \n"
3159 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003160 "psrlw $0x8,%%xmm2 \n"
3161 "paddusb %%xmm2,%%xmm0 \n"
3162 "pand %%xmm5,%%xmm1 \n"
3163 "paddusb %%xmm1,%%xmm0 \n"
3164 "sub $0x4,%3 \n"
3165 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003166 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003167 "jge 41b \n"
3168
3169 "49: \n"
// Rebias the counter back to (remaining - 1) for the tail loop.
3170 "add $0x3,%3 \n"
3171 "jl 99f \n"
3172
fbarchard@google.com794fe122012-06-15 01:05:01 +00003173 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003174 "91: \n"
3175 "movd (%0),%%xmm3 \n"
3176 "lea 0x4(%0),%0 \n"
3177 "movdqa %%xmm3,%%xmm0 \n"
3178 "pxor %%xmm4,%%xmm3 \n"
3179 "movd (%1),%%xmm2 \n"
3180 "psrlw $0x8,%%xmm3 \n"
3181 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3182 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3183 "pand %%xmm6,%%xmm2 \n"
3184 "paddw %%xmm7,%%xmm3 \n"
3185 "pmullw %%xmm3,%%xmm2 \n"
3186 "movd (%1),%%xmm1 \n"
3187 "lea 0x4(%1),%1 \n"
3188 "psrlw $0x8,%%xmm1 \n"
3189 "por %%xmm4,%%xmm0 \n"
3190 "pmullw %%xmm3,%%xmm1 \n"
3191 "psrlw $0x8,%%xmm2 \n"
3192 "paddusb %%xmm2,%%xmm0 \n"
3193 "pand %%xmm5,%%xmm1 \n"
3194 "paddusb %%xmm1,%%xmm0 \n"
3195 "sub $0x1,%3 \n"
3196 "movd %%xmm0,(%2) \n"
3197 "lea 0x4(%2),%2 \n"
3198 "jge 91b \n"
3199 "99: \n"
3200 : "+r"(src_argb0), // %0
3201 "+r"(src_argb1), // %1
3202 "+r"(dst_argb), // %2
3203 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003204 :
3205 : "memory", "cc"
3206#if defined(__SSE2__)
3207 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3208#endif
3209 );
3210}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003211#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003212
fbarchard@google.com96af8702012-04-06 18:22:27 +00003213#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003214// Shuffle table for isolating alpha.
// Used as the pshufb control in ARGBBlendRow_SSSE3. For each 4-byte pixel,
// byte 3 (indices 3, 7, 11, 15) is copied into the low byte of both of that
// pixel's 16-bit words. The 0x80 entries have the high bit set, which makes
// pshufb write zero into the corresponding high byte.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003215CONST uvec8 kShuffleAlpha = {
3216 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3217 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3218};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003219
3220// Blend 4 pixels at a time; single pixels are handled one at a time until
3221// the destination is aligned and again for the remainder.
3222
3223// Same as SSE2, but replaces
3224// psrlw xmm3, 8 // alpha
3225// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3226// pshuflw xmm3, xmm3,0F5h
3227// with..
3228// pshufb xmm3, kShuffleAlpha // alpha
3229
3230void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3231 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003232 asm volatile (
3233 "pcmpeqb %%xmm7,%%xmm7 \n"
3234 "psrlw $0xf,%%xmm7 \n"
3235 "pcmpeqb %%xmm6,%%xmm6 \n"
3236 "psrlw $0x8,%%xmm6 \n"
3237 "pcmpeqb %%xmm5,%%xmm5 \n"
3238 "psllw $0x8,%%xmm5 \n"
3239 "pcmpeqb %%xmm4,%%xmm4 \n"
3240 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003241 "sub $0x1,%3 \n"
3242 "je 91f \n"
3243 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003244
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003245 // 1 pixel loop until destination pointer is aligned.
3246 "10: \n"
3247 "test $0xf,%2 \n"
3248 "je 19f \n"
3249 "movd (%0),%%xmm3 \n"
3250 "lea 0x4(%0),%0 \n"
3251 "movdqa %%xmm3,%%xmm0 \n"
3252 "pxor %%xmm4,%%xmm3 \n"
3253 "movd (%1),%%xmm2 \n"
3254 "pshufb %4,%%xmm3 \n"
3255 "pand %%xmm6,%%xmm2 \n"
3256 "paddw %%xmm7,%%xmm3 \n"
3257 "pmullw %%xmm3,%%xmm2 \n"
3258 "movd (%1),%%xmm1 \n"
3259 "lea 0x4(%1),%1 \n"
3260 "psrlw $0x8,%%xmm1 \n"
3261 "por %%xmm4,%%xmm0 \n"
3262 "pmullw %%xmm3,%%xmm1 \n"
3263 "psrlw $0x8,%%xmm2 \n"
3264 "paddusb %%xmm2,%%xmm0 \n"
3265 "pand %%xmm5,%%xmm1 \n"
3266 "paddusb %%xmm1,%%xmm0 \n"
3267 "sub $0x1,%3 \n"
3268 "movd %%xmm0,(%2) \n"
3269 "lea 0x4(%2),%2 \n"
3270 "jge 10b \n"
3271
3272 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00003273 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003274 "jl 49f \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003275 "test $0xf,%0 \n"
3276 "jne 41f \n"
3277 "test $0xf,%1 \n"
3278 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003279
fbarchard@google.com794fe122012-06-15 01:05:01 +00003280 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003281 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003282 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003283 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003284 "lea 0x10(%0),%0 \n"
3285 "movdqa %%xmm3,%%xmm0 \n"
3286 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003287 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003288 "pshufb %4,%%xmm3 \n"
3289 "pand %%xmm6,%%xmm2 \n"
3290 "paddw %%xmm7,%%xmm3 \n"
3291 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003292 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003293 "lea 0x10(%1),%1 \n"
3294 "psrlw $0x8,%%xmm1 \n"
3295 "por %%xmm4,%%xmm0 \n"
3296 "pmullw %%xmm3,%%xmm1 \n"
3297 "psrlw $0x8,%%xmm2 \n"
3298 "paddusb %%xmm2,%%xmm0 \n"
3299 "pand %%xmm5,%%xmm1 \n"
3300 "paddusb %%xmm1,%%xmm0 \n"
3301 "sub $0x4,%3 \n"
3302 "movdqa %%xmm0,(%2) \n"
3303 "lea 0x10(%2),%2 \n"
3304 "jge 40b \n"
3305 "jmp 49f \n"
3306
3307 // 4 pixel unaligned loop.
3308 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003309 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003310 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003311 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003312 "movdqa %%xmm3,%%xmm0 \n"
3313 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003314 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003315 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003316 "pand %%xmm6,%%xmm2 \n"
3317 "paddw %%xmm7,%%xmm3 \n"
3318 "pmullw %%xmm3,%%xmm2 \n"
3319 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003320 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003321 "psrlw $0x8,%%xmm1 \n"
3322 "por %%xmm4,%%xmm0 \n"
3323 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003324 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003325 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003326 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003327 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003328 "sub $0x4,%3 \n"
3329 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003330 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003331 "jge 41b \n"
3332
3333 "49: \n"
3334 "add $0x3,%3 \n"
3335 "jl 99f \n"
3336
fbarchard@google.com794fe122012-06-15 01:05:01 +00003337 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003338 "91: \n"
3339 "movd (%0),%%xmm3 \n"
3340 "lea 0x4(%0),%0 \n"
3341 "movdqa %%xmm3,%%xmm0 \n"
3342 "pxor %%xmm4,%%xmm3 \n"
3343 "movd (%1),%%xmm2 \n"
3344 "pshufb %4,%%xmm3 \n"
3345 "pand %%xmm6,%%xmm2 \n"
3346 "paddw %%xmm7,%%xmm3 \n"
3347 "pmullw %%xmm3,%%xmm2 \n"
3348 "movd (%1),%%xmm1 \n"
3349 "lea 0x4(%1),%1 \n"
3350 "psrlw $0x8,%%xmm1 \n"
3351 "por %%xmm4,%%xmm0 \n"
3352 "pmullw %%xmm3,%%xmm1 \n"
3353 "psrlw $0x8,%%xmm2 \n"
3354 "paddusb %%xmm2,%%xmm0 \n"
3355 "pand %%xmm5,%%xmm1 \n"
3356 "paddusb %%xmm1,%%xmm0 \n"
3357 "sub $0x1,%3 \n"
3358 "movd %%xmm0,(%2) \n"
3359 "lea 0x4(%2),%2 \n"
3360 "jge 91b \n"
3361 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003362 : "+r"(src_argb0), // %0
3363 "+r"(src_argb1), // %1
3364 "+r"(dst_argb), // %2
3365 "+r"(width) // %3
3366 : "m"(kShuffleAlpha) // %4
3367 : "memory", "cc"
3368#if defined(__SSE2__)
3369 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3370#endif
3371 );
3372}
3373#endif // HAS_ARGBBLENDROW_SSSE3
3374
#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuates (premultiplies) a row of ARGB pixels: bytes 0-2 of every pixel
// are scaled by byte 3 (alpha) of the same pixel; alpha itself is copied
// through unchanged.
//
// src_argb, dst_argb: accessed with movdqa, so both must be 16 byte aligned.
// width: pixel count. The loop handles 4 pixels (16 bytes) per iteration and
//   tests the counter after the store, so at least one iteration always runs
//   and a count that is not a multiple of 4 is effectively rounded up.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Turn dst into an offset from src so a single pointer walks both rows.
    "sub       %0,%1                           \n"
    // xmm4 = 0xff000000 in every dword (alpha mask).
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    // xmm5 = 0x00ffffff in every dword (color mask).
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // Pixels 0-1: duplicate each byte into a word, broadcast the alpha word
    // across its pixel, multiply keeping the high 16 bits.
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // Pixels 2-3: same steps on the high half.
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Reload the source to recover the untouched alpha bytes.
    "movdqa    (%0),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    // movdqa and lea do not modify flags, so jg still tests this sub.
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATE_SSE2
3421
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle tables duplicating alpha. For pshufb, an index of 128 (high bit set)
// produces a zero byte, so each pixel becomes three words holding its alpha
// byte twice followed by one zero word.
// kShuffleAlpha0 covers pixels 0-1 (alpha at bytes 3 and 7).
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
// kShuffleAlpha1 covers pixels 2-3 (alpha at bytes 11 and 15).
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};

// Attenuates (premultiplies) a row of ARGB pixels, 4 pixels at a time: bytes
// 0-2 of every pixel are scaled by byte 3 (alpha); alpha is copied through.
//
// src_argb, dst_argb: accessed with movdqa, so both must be 16 byte aligned.
// width: pixel count. The counter is tested after the store, so at least one
//   iteration always runs and the count is effectively rounded up to a
//   multiple of 4.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Turn dst into an offset from src so a single pointer walks both rows.
    "sub       %0,%1                           \n"
    // xmm3 = 0xff000000 in every dword (alpha mask).
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // Pixels 0-1: alpha words (zero in the alpha slot) times color words.
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    // Pixels 2-3.
    "movdqa    (%0),%%xmm1                     \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Original alpha bytes are OR-ed back; the product's alpha slot is zero.
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    // movdqa and lea do not modify flags, so jg still tests this sub.
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003476
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuates a row of ARGB pixels, 4 pixels at a time: bytes 0-2 of every
// pixel are multiplied by a per-alpha factor looked up in fixed_invtbl8
// (indexed by the pixel's alpha byte, 4 bytes per entry); alpha is copied
// through unchanged.
// NOTE(review): fixed_invtbl8 is defined outside this block; presumably it
// holds fixed point reciprocals of alpha - confirm against its definition.
//
// src_argb, dst_argb: accessed with movdqa, so both must be 16 byte aligned.
// width: pixel count. The counter is tested after the store, so at least one
//   iteration always runs and the count is effectively rounded up to a
//   multiple of 4.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  // Scratch register that receives each zero-extended alpha byte.
  uintptr_t alpha = 0;
  asm volatile (
    // Turn dst into an offset from src so a single pointer walks both rows.
    "sub       %0,%1                           \n"
    // xmm4 = 0xff000000 in every dword (alpha mask).
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // Pixels 0-1: look up the factor for each alpha. pshuflw $0xc0 copies the
    // low word of the entry into words 0-2 and leaves word 3 (alpha) zero.
    "movdqa    (%0),%%xmm0                     \n"
    "movzb     0x3(%0),%3                      \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0x7(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // Pixels 2-3.
    "movdqa    (%0),%%xmm1                     \n"
    "movzb     0xb(%0),%3                      \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0xf(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Merge the original alpha bytes with the saturated color bytes.
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    // movdqa and lea do not modify flags, so jg still tests this sub.
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "+r"(width),        // %2
    "+r"(alpha)         // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003531
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale.  0.11 * B + 0.59 * G + 0.30 * R
// Coefficients are scaled by 128 (the sum is shifted right by 7); the fourth
// coefficient is 0 so alpha does not contribute.
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Bytes 0-2 of each output pixel receive the weighted gray value and byte 3
// keeps the source alpha.
//
// src_argb, dst_argb: accessed with movdqa, so both must be 16 byte aligned.
// width: pixel count. The counter is tested after the store, so at least one
//   iteration always runs and the count is effectively rounded up to a
//   multiple of 8.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    // Turn dst into an offset from src so a single pointer walks both rows.
    "sub       %0,%1                           \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // Weighted sum per pixel -> 8 gray bytes in xmm0.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Extract the 8 alpha bytes into xmm2.
    "movdqa    (%0),%%xmm2                     \n"
    "movdqa    0x10(%0),%%xmm3                 \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Interleave to gray,gray,gray,alpha per pixel.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    // movdqa and lea do not modify flags, so jg still tests this sub.
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "movdqa    %%xmm1,0x10(%0,%1,1)            \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToGray)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003582
#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone. Each table lists the B, G, R, A
// coefficients for one output channel; the alpha coefficient is 0.
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place: each group of 8 pixels is read from and written back to
// dst_argb. Alpha is preserved; color results are saturated to 255.
//
// dst_argb: accessed with movdqa, so it must be 16 byte aligned.
// width: pixel count. The counter is tested after the store, so at least one
//   iteration always runs and the count is effectively rounded up to a
//   multiple of 8.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // New B for 8 pixels -> xmm0.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // New G for 8 pixels -> xmm5, interleaved with B.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // New R for 8 pixels -> xmm5.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha for 8 pixels -> xmm6, interleaved with R.
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    // Combine BG and RA pairs into BGRA pixels.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    // movdqa and lea do not modify flags, so jg still tests this sub.
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
3659
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// Works in place on dst_argb. Output byte k (k = 0..2) of each pixel is the
// signed, saturating sum of the pixel's 4 bytes times the 4 signed
// coefficients at matrix_argb[4 * k .. 4 * k + 3], arithmetically shifted
// right by 7 and saturated to 0..255. Byte 3 (alpha) is preserved.
//
// dst_argb: accessed with movdqa, so it must be 16 byte aligned.
// matrix_argb: 12 bytes are read (three groups of 4 coefficients).
// width: pixel count. The counter is tested after the store, so at least one
//   iteration always runs and the count is effectively rounded up to a
//   multiple of 8.
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  asm volatile (
    // Broadcast each 4 coefficient group across a register.
    "movd      (%2),%%xmm2                     \n"
    "movd      0x4(%2),%%xmm3                  \n"
    "movd      0x8(%2),%%xmm4                  \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // Output bytes 0 and 1 for 8 pixels -> xmm0 (interleaved).
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm6,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm0                     \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // Output byte 2 for 8 pixels -> xmm5.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha for 8 pixels -> xmm6.
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    // Interleave the byte pairs back into 4 byte pixels.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    // movdqa and lea do not modify flags, so jg still tests this sub.
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+r"(width)       // %1
  : "r"(matrix_argb)  // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
3723
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// aligned to 16 bytes
// Works in place on dst_argb. For bytes 0-2 of each pixel the result is
//   ((v * scale) >> 16) * interval_size + interval_offset
// computed in 16 bit lanes and saturated to 0..255 when packed.
// The alpha lane is multiplied/offset by the upper 16 bits of each parameter
// and then OR-ed with the original alpha, so alpha is preserved only when
// scale, interval_size and interval_offset fit in 16 bits.
//
// dst_argb: accessed with movdqa, so it must be 16 byte aligned.
// width: pixel count. The counter is tested after the store, so at least one
//   iteration always runs and the count is effectively rounded up to a
//   multiple of 4.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    // Spread each parameter as words (lo, lo, lo, hi) per pixel, 2 pixels per
    // register.
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    // xmm5 = 0 (for byte -> word widening); xmm6 = 0xff000000 alpha mask.
    "pxor      %%xmm5,%%xmm5                   \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    "movdqa    (%0),%%xmm7                     \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    // movdqa and lea do not modify flags, so jg still tests this sub.
    "sub       $0x4,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
3776
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//
// row: source bytes, 4 per pixel (read unaligned).
// cumsum: output, 4 int32 sums (16 bytes) per pixel.
// previous_cumsum: sums of the row above, same layout as cumsum.
// width: pixel count; any value is handled (4 pixel loop plus 1 pixel tail).
//
// For each pixel the 4 bytes are widened to int32, added to a running row
// total, and the total plus the matching previous_cumsum entry is stored.
// The 4 pixel loop uses aligned loads/stores and is taken only when both
// cumsum and previous_cumsum are 16 byte aligned; otherwise the whole row goes
// through the 1 pixel loop, which uses unaligned accesses.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    // %2 becomes the byte offset from cumsum to previous_cumsum.
    "sub       %1,%2                           \n"
    // xmm0 = running total, xmm1 = zero for widening.
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"
    // cumsum is aligned here, so the low 4 bits of the offset are the low
    // 4 bits of previous_cumsum; movdqa below would fault if they are set.
    "test      $0xf,%2                         \n"
    "jne       49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqu    (%0),%%xmm2                     \n"
    "lea       0x10(%0),%0                     \n"
    // Widen 16 bytes into four registers of 4 int32 (one pixel each).
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // Accumulate each pixel into the running total and add the row above.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqa    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm2,(%1)                     \n"
    "movdqa    %%xmm3,0x10(%1)                 \n"
    "movdqa    %%xmm4,0x20(%1)                 \n"
    "movdqa    %%xmm5,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Undo the bias of 4 minus 1 for the tail loop's own pre-decrement.
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movd      (%0),%%xmm2                     \n"
    "lea       0x4(%0),%0                      \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
3857
#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts cumulative sums into averaged pixels.
// For every output pixel the per-channel sum is
//   topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// which is multiplied by an approximate reciprocal of area (rcpss), rounded to
// integer and saturated to a byte.
//
// topleft, botleft: int32 sums, 4 per pixel. They are read with movdqa and as
//   memory operands of psubd/paddd, so both pointers and the width offset
//   (width * 4 bytes) must keep 16 byte alignment.
// width: offset, in int32 elements, from the left entry to the right entry.
// area: divisor for the average.
// dst: output bytes, 4 per pixel (written unaligned).
// count: pixel count; any value is handled (4 pixel loop plus 1 pixel tail).
//
// count may be allocated to memory ("+rm"), so the instructions that update
// it carry an explicit 'l' suffix to fix the operand size.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count) {
  asm volatile (
    // xmm4 = ~1.0f / area in all four lanes.
    "movd      %5,%%xmm4                       \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "rcpss     %%xmm4,%%xmm4                   \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "subl      $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "psubd     0x10(%0,%4,4),%%xmm1            \n"
    "psubd     0x20(%0,%4,4),%%xmm2            \n"
    "psubd     0x30(%0,%4,4),%%xmm3            \n"
    "lea       0x40(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "psubd     0x10(%1),%%xmm1                 \n"
    "psubd     0x20(%1),%%xmm2                 \n"
    "psubd     0x30(%1),%%xmm3                 \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "paddd     0x10(%1,%4,4),%%xmm1            \n"
    "paddd     0x20(%1,%4,4),%%xmm2            \n"
    "paddd     0x30(%1,%4,4),%%xmm3            \n"
    "lea       0x40(%1),%1                     \n"
    // Scale by 1/area in float, convert back, then narrow to bytes.
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "subl      $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Undo the bias of 4 minus 1 for the tail loop's own pre-decrement.
    "addl      $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "lea       0x10(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "lea       0x10(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "subl      $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)      // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
// Every byte of each pixel (including byte 3) is multiplied by the byte of
// value at the same position within the 4 byte pixel. Both operands are
// widened by duplicating the byte into a 16 bit word, the high half of the
// product is kept and shifted right by 8.
//
// src_argb, dst_argb: accessed with movdqa, so both must be 16 byte aligned.
// width: pixel count. The counter is tested after the store, so at least one
//   iteration always runs and the count is effectively rounded up to a
//   multiple of 4.
// value: 4 per-byte multipliers packed in one 32 bit word.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd      %3,%%xmm2                       \n"
    // Turn dst into an offset from src so a single pointer walks both rows.
    "sub       %0,%1                           \n"
    // xmm2 = value bytes duplicated into words, repeated for 2 pixels.
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // movdqa and lea do not modify flags, so jg still tests this sub.
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADE_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003984
#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd  %%xmm0,%1
//
// src_argb: base of the source image.
// src_argb_stride: bytes per source row. It is packed into the upper 16 bits
//   of a 32 bit multiplier for pmaddwd, so it must fit in a signed 16 bit
//   word.
// dst_argb: output row, 4 bytes per pixel (written unaligned).
// uv_dudv: 4 floats {u, v, du, dv}: start coordinate and per-pixel step.
// width: pixel count; any value is handled (4 pixel loop plus 1 pixel tail).
//
// Each coordinate is truncated to an integer (cvttps2dq), narrowed to 16 bits
// with saturation, and turned into the byte offset x * 4 + y * stride.
// On x86_64 the first offset of each pair is masked with 0x0fffffff, so
// offsets must be non-negative and below 2^28 there (see the first TODO).
//
// width may be allocated to memory ("+rm"), so the instructions that update
// it carry an explicit 'l' suffix to fix the operand size.

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  // Holds the stride multiplier first, then serves as an offset scratch.
  intptr_t src_argb_stride_temp = src_argb_stride;
  // Second offset scratch register.
  intptr_t temp = 0;
  asm volatile (
    // xmm2 = (u, v), xmm7 = (du, dv).
    "movq      (%3),%%xmm2                     \n"
    "movq      0x8(%3),%%xmm7                  \n"
    // xmm5 low dword = words (4, stride): pmaddwd gives x * 4 + y * stride.
    "shl       $0x10,%1                        \n"
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "subl      $0x4,%4                         \n"
    "jl        49f                             \n"

    // Set up for 4 pixels: xmm2 = coords of pixels 0-1, xmm3 = pixels 2-3,
    // xmm4 = 4 * (du, dv) step. The tail loop only needs the low lanes, so
    // skipping this setup for width < 4 is fine.
    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm4                   \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "40:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "cvttps2dq %%xmm3,%%xmm1                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    // Extract offsets of pixels 0 and 1 into %1 and %5.
#if defined(__x86_64__)
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
    "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
#endif
    "movd      (%0,%1,1),%%xmm1                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm1                   \n"
    "addps     %%xmm4,%%xmm2                   \n"
    "movq      %%xmm1,(%2)                     \n"
    // Extract offsets of pixels 2 and 3 into %1 and %5.
#if defined(__x86_64__)
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
#endif
    "movd      (%0,%1,1),%%xmm0                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm0                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    // movq and lea do not modify flags, so jge still tests this sub.
    "subl      $0x4,%4                         \n"
    "movq      %%xmm0,0x08(%2)                 \n"
    "lea       0x10(%2),%2                     \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // Undo the bias of 4 minus 1 for the tail loop's own pre-decrement.
    "addl      $0x3,%4                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  4                               \n"
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"
    "movd      %%xmm0,%1                       \n"
#if defined(__x86_64__)
    "and       $0x0fffffff,%1                  \n"
#endif
    "movd      (%0,%1,1),%%xmm0                \n"
    // movd and lea do not modify flags, so jge still tests this sub.
    "subl      $0x1,%4                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),              // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),              // %2
    "+r"(uv_dudv),               // %3
    "+rm"(width),                // %4
    "+r"(temp)                   // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
4095
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004096// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
4097void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4098 ptrdiff_t src_stride, int dst_width,
4099 int source_y_fraction) {
4100 asm volatile (
4101 "sub %1,%0 \n"
4102 "shr %3 \n"
4103 "cmp $0x0,%3 \n"
4104 "je 2f \n"
4105 "cmp $0x40,%3 \n"
4106 "je 3f \n"
4107 "movd %3,%%xmm0 \n"
4108 "neg %3 \n"
4109 "add $0x80,%3 \n"
4110 "movd %3,%%xmm5 \n"
4111 "punpcklbw %%xmm0,%%xmm5 \n"
4112 "punpcklwd %%xmm5,%%xmm5 \n"
4113 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4114 ".p2align 4 \n"
4115 "1: \n"
4116 "movdqa (%1),%%xmm0 \n"
4117 "movdqa (%1,%4,1),%%xmm2 \n"
4118 "movdqa %%xmm0,%%xmm1 \n"
4119 "punpcklbw %%xmm2,%%xmm0 \n"
4120 "punpckhbw %%xmm2,%%xmm1 \n"
4121 "pmaddubsw %%xmm5,%%xmm0 \n"
4122 "pmaddubsw %%xmm5,%%xmm1 \n"
4123 "psrlw $0x7,%%xmm0 \n"
4124 "psrlw $0x7,%%xmm1 \n"
4125 "packuswb %%xmm1,%%xmm0 \n"
4126 "sub $0x4,%2 \n"
4127 "movdqa %%xmm0,(%1,%0,1) \n"
4128 "lea 0x10(%1),%1 \n"
4129 "jg 1b \n"
4130 "jmp 4f \n"
4131 ".p2align 4 \n"
4132 "2: \n"
4133 "movdqa (%1),%%xmm0 \n"
4134 "sub $0x4,%2 \n"
4135 "movdqa %%xmm0,(%1,%0,1) \n"
4136 "lea 0x10(%1),%1 \n"
4137 "jg 2b \n"
4138 "jmp 4f \n"
4139 ".p2align 4 \n"
4140 "3: \n"
4141 "movdqa (%1),%%xmm0 \n"
4142 "pavgb (%1,%4,1),%%xmm0 \n"
4143 "sub $0x4,%2 \n"
4144 "movdqa %%xmm0,(%1,%0,1) \n"
4145 "lea 0x10(%1),%1 \n"
4146 "jg 3b \n"
4147 "4: \n"
4148 ".p2align 4 \n"
4149 : "+r"(dst_ptr), // %0
4150 "+r"(src_ptr), // %1
4151 "+r"(dst_width), // %2
4152 "+r"(source_y_fraction) // %3
4153 : "r"(static_cast<intptr_t>(src_stride)) // %4
4154 : "memory", "cc"
4155#if defined(__SSE2__)
4156 , "xmm0", "xmm1", "xmm2", "xmm5"
4157#endif
4158 );
4159}
4160
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004161void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4162 uint8* dst_uv, int pix) {
4163 asm volatile (
4164 "sub %0,%1 \n"
4165 ".p2align 4 \n"
4166 "1: \n"
4167 "movdqa (%0),%%xmm0 \n"
4168 "pavgb (%0,%3),%%xmm0 \n"
4169 "sub $0x10,%2 \n"
4170 "movdqa %%xmm0,(%0,%1) \n"
4171 "lea 0x10(%0),%0 \n"
4172 "jg 1b \n"
4173 : "+r"(src_uv), // %0
4174 "+r"(dst_uv), // %1
4175 "+r"(pix) // %2
4176 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4177 : "memory", "cc"
4178#if defined(__SSE2__)
4179 , "xmm0"
4180#endif
4181 );
4182}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004183
4184void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4185 uint32 selector, int pix) {
4186 asm volatile (
4187 "movd %3,%%xmm5 \n"
4188 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4189 ".p2align 4 \n"
4190 "1: \n"
4191 "movdqa (%0),%%xmm0 \n"
4192 "lea 0x10(%0),%0 \n"
4193 "pshufb %%xmm5,%%xmm0 \n"
4194 "sub $0x4,%2 \n"
4195 "movd %%xmm0,(%1) \n"
4196 "lea 0x4(%1),%1 \n"
4197 "jg 1b \n"
4198 : "+r"(src_argb), // %0
4199 "+r"(dst_bayer), // %1
4200 "+r"(pix) // %2
4201 : "g"(selector) // %3
4202 : "memory", "cc"
4203#if defined(__SSE2__)
4204 , "xmm0", "xmm5"
4205#endif
4206 );
4207}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004208
4209void I422ToYUY2Row_SSE2(const uint8* src_y,
4210 const uint8* src_u,
4211 const uint8* src_v,
4212 uint8* dst_frame, int width) {
4213 asm volatile (
4214 "sub %1,%2 \n"
4215 ".p2align 4 \n"
4216 "1: \n"
4217 "movq (%1),%%xmm2 \n"
4218 "movq (%1,%2,1),%%xmm3 \n"
4219 "lea 0x8(%1),%1 \n"
4220 "punpcklbw %%xmm3,%%xmm2 \n"
4221 "movdqa (%0),%%xmm0 \n"
4222 "lea 0x10(%0),%0 \n"
4223 "movdqa %%xmm0,%%xmm1 \n"
4224 "punpcklbw %%xmm2,%%xmm0 \n"
4225 "punpckhbw %%xmm2,%%xmm1 \n"
4226 "movdqa %%xmm0,(%3) \n"
4227 "movdqa %%xmm1,0x10(%3) \n"
4228 "lea 0x20(%3),%3 \n"
4229 "sub $0x10,%4 \n"
4230 "jg 1b \n"
4231 : "+r"(src_y), // %0
4232 "+r"(src_u), // %1
4233 "+r"(src_v), // %2
4234 "+r"(dst_frame), // %3
4235 "+rm"(width) // %4
4236 :
4237 : "memory", "cc"
4238#if defined(__SSE2__)
4239 , "xmm0", "xmm1", "xmm2", "xmm3"
4240#endif
4241 );
4242}
4243
4244void I422ToUYVYRow_SSE2(const uint8* src_y,
4245 const uint8* src_u,
4246 const uint8* src_v,
4247 uint8* dst_frame, int width) {
4248 asm volatile (
4249 "sub %1,%2 \n"
4250 ".p2align 4 \n"
4251 "1: \n"
4252 "movq (%1),%%xmm2 \n"
4253 "movq (%1,%2,1),%%xmm3 \n"
4254 "lea 0x8(%1),%1 \n"
4255 "punpcklbw %%xmm3,%%xmm2 \n"
4256 "movdqa (%0),%%xmm0 \n"
4257 "movdqa %%xmm2,%%xmm1 \n"
4258 "lea 0x10(%0),%0 \n"
4259 "punpcklbw %%xmm0,%%xmm1 \n"
4260 "punpckhbw %%xmm0,%%xmm2 \n"
4261 "movdqa %%xmm1,(%3) \n"
4262 "movdqa %%xmm2,0x10(%3) \n"
4263 "lea 0x20(%3),%3 \n"
4264 "sub $0x10,%4 \n"
4265 "jg 1b \n"
4266 : "+r"(src_y), // %0
4267 "+r"(src_u), // %1
4268 "+r"(src_v), // %2
4269 "+r"(dst_frame), // %3
4270 "+rm"(width) // %4
4271 :
4272 : "memory", "cc"
4273#if defined(__SSE2__)
4274 , "xmm0", "xmm1", "xmm2", "xmm3"
4275#endif
4276 );
4277}
4278
fbarchard@google.com2d11d432012-02-16 02:50:39 +00004279#endif // defined(__x86_64__) || defined(__i386__)
4280
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004281#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00004282} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004283} // namespace libyuv
4284#endif