blob: 4f722c726dc4f12ef749bec21a7db6e73d3a0565 [file] [log] [blame]
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.com83a63e62013-02-27 00:20:29 +000021#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
// CONST is the declaration prefix used by the constant tables in this file:
// it expands to nothing on Apple builds (the tables become plain globals) and
// to "static const" everywhere else.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +000038// JPeg full range.
39CONST vec8 kARGBToYJ = {
40 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0
41};
42
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000043CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000044 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
45};
46
fbarchard@google.com714050a2012-02-17 22:59:56 +000047CONST vec8 kARGBToV = {
48 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
49};
50
51// Constants for BGRA
52CONST vec8 kBGRAToY = {
53 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
54};
55
56CONST vec8 kBGRAToU = {
57 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
58};
59
60CONST vec8 kBGRAToV = {
61 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
62};
63
64// Constants for ABGR
65CONST vec8 kABGRToY = {
66 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
67};
68
69CONST vec8 kABGRToU = {
70 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
71};
72
73CONST vec8 kABGRToV = {
74 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
75};
76
fbarchard@google.com4de0c432012-10-11 01:25:46 +000077// Constants for RGBA.
78CONST vec8 kRGBAToY = {
79 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
80};
81
82CONST vec8 kRGBAToU = {
83 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
84};
85
86CONST vec8 kRGBAToV = {
87 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
88};
89
fbarchard@google.com714050a2012-02-17 22:59:56 +000090CONST uvec8 kAddY16 = {
91 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000092};
fbarchard@google.com2430e042011-11-11 21:57:06 +000093
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +000094CONST vec16 kAddYJ64 = {
95 64, 64, 64, 64, 64, 64, 64, 64
96};
97
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000098CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000099 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
100 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
101};
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000102
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000103// Shuffle table for converting RGB24 to ARGB.
104CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000105 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
106};
107
108// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000109CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000110 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
111};
112
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000113// Shuffle table for converting ARGB to RGB24.
114CONST uvec8 kShuffleMaskARGBToRGB24 = {
115 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
116};
117
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000118// Shuffle table for converting ARGB to RAW.
119CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000120 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000121};
122
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000123// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000124CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
125 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
126};
127
128// Shuffle table for converting ARGB to RAW.
129CONST uvec8 kShuffleMaskARGBToRAW_0 = {
130 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
131};
132
fbarchard@google.comb6149762011-11-07 21:58:52 +0000133void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000134 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000135 "pcmpeqb %%xmm5,%%xmm5 \n"
136 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000137 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000138 "1: \n"
139 "movq (%0),%%xmm0 \n"
140 "lea 0x8(%0),%0 \n"
141 "punpcklbw %%xmm0,%%xmm0 \n"
142 "movdqa %%xmm0,%%xmm1 \n"
143 "punpcklwd %%xmm0,%%xmm0 \n"
144 "punpckhwd %%xmm1,%%xmm1 \n"
145 "por %%xmm5,%%xmm0 \n"
146 "por %%xmm5,%%xmm1 \n"
147 "movdqa %%xmm0,(%1) \n"
148 "movdqa %%xmm1,0x10(%1) \n"
149 "lea 0x20(%1),%1 \n"
150 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000151 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000152 : "+r"(src_y), // %0
153 "+r"(dst_argb), // %1
154 "+r"(pix) // %2
155 :
156 : "memory", "cc"
157#if defined(__SSE2__)
158 , "xmm0", "xmm1", "xmm5"
159#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000160 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000161}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000162
fbarchard@google.com00b69a22012-11-02 06:03:28 +0000163void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
164 int pix) {
165 asm volatile (
166 "pcmpeqb %%xmm5,%%xmm5 \n"
167 "pslld $0x18,%%xmm5 \n"
168 ".p2align 4 \n"
169 "1: \n"
170 "movq (%0),%%xmm0 \n"
171 "lea 0x8(%0),%0 \n"
172 "punpcklbw %%xmm0,%%xmm0 \n"
173 "movdqa %%xmm0,%%xmm1 \n"
174 "punpcklwd %%xmm0,%%xmm0 \n"
175 "punpckhwd %%xmm1,%%xmm1 \n"
176 "por %%xmm5,%%xmm0 \n"
177 "por %%xmm5,%%xmm1 \n"
178 "movdqu %%xmm0,(%1) \n"
179 "movdqu %%xmm1,0x10(%1) \n"
180 "lea 0x20(%1),%1 \n"
181 "sub $0x8,%2 \n"
182 "jg 1b \n"
183 : "+r"(src_y), // %0
184 "+r"(dst_argb), // %1
185 "+r"(pix) // %2
186 :
187 : "memory", "cc"
188#if defined(__SSE2__)
189 , "xmm0", "xmm1", "xmm5"
190#endif
191 );
192}
193
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000194void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000195 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000196 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
197 "pslld $0x18,%%xmm5 \n"
198 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000199 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000200 "1: \n"
201 "movdqu (%0),%%xmm0 \n"
202 "movdqu 0x10(%0),%%xmm1 \n"
203 "movdqu 0x20(%0),%%xmm3 \n"
204 "lea 0x30(%0),%0 \n"
205 "movdqa %%xmm3,%%xmm2 \n"
206 "palignr $0x8,%%xmm1,%%xmm2 \n"
207 "pshufb %%xmm4,%%xmm2 \n"
208 "por %%xmm5,%%xmm2 \n"
209 "palignr $0xc,%%xmm0,%%xmm1 \n"
210 "pshufb %%xmm4,%%xmm0 \n"
211 "movdqa %%xmm2,0x20(%1) \n"
212 "por %%xmm5,%%xmm0 \n"
213 "pshufb %%xmm4,%%xmm1 \n"
214 "movdqa %%xmm0,(%1) \n"
215 "por %%xmm5,%%xmm1 \n"
216 "palignr $0x4,%%xmm3,%%xmm3 \n"
217 "pshufb %%xmm4,%%xmm3 \n"
218 "movdqa %%xmm1,0x10(%1) \n"
219 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000220 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000221 "movdqa %%xmm3,0x30(%1) \n"
222 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000223 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000224 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000225 "+r"(dst_argb), // %1
226 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000227 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000228 : "memory", "cc"
229#if defined(__SSE2__)
230 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
231#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000232 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000233}
234
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000235void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000236 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000237 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
238 "pslld $0x18,%%xmm5 \n"
239 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000240 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000241 "1: \n"
242 "movdqu (%0),%%xmm0 \n"
243 "movdqu 0x10(%0),%%xmm1 \n"
244 "movdqu 0x20(%0),%%xmm3 \n"
245 "lea 0x30(%0),%0 \n"
246 "movdqa %%xmm3,%%xmm2 \n"
247 "palignr $0x8,%%xmm1,%%xmm2 \n"
248 "pshufb %%xmm4,%%xmm2 \n"
249 "por %%xmm5,%%xmm2 \n"
250 "palignr $0xc,%%xmm0,%%xmm1 \n"
251 "pshufb %%xmm4,%%xmm0 \n"
252 "movdqa %%xmm2,0x20(%1) \n"
253 "por %%xmm5,%%xmm0 \n"
254 "pshufb %%xmm4,%%xmm1 \n"
255 "movdqa %%xmm0,(%1) \n"
256 "por %%xmm5,%%xmm1 \n"
257 "palignr $0x4,%%xmm3,%%xmm3 \n"
258 "pshufb %%xmm4,%%xmm3 \n"
259 "movdqa %%xmm1,0x10(%1) \n"
260 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000261 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000262 "movdqa %%xmm3,0x30(%1) \n"
263 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000264 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000265 : "+r"(src_raw), // %0
266 "+r"(dst_argb), // %1
267 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000268 : "m"(kShuffleMaskRAWToARGB) // %3
269 : "memory", "cc"
270#if defined(__SSE2__)
271 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
272#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000273 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000274}
275
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000276void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000277 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000278 "mov $0x1080108,%%eax \n"
279 "movd %%eax,%%xmm5 \n"
280 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000281 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000282 "movd %%eax,%%xmm6 \n"
283 "pshufd $0x0,%%xmm6,%%xmm6 \n"
284 "pcmpeqb %%xmm3,%%xmm3 \n"
285 "psllw $0xb,%%xmm3 \n"
286 "pcmpeqb %%xmm4,%%xmm4 \n"
287 "psllw $0xa,%%xmm4 \n"
288 "psrlw $0x5,%%xmm4 \n"
289 "pcmpeqb %%xmm7,%%xmm7 \n"
290 "psllw $0x8,%%xmm7 \n"
291 "sub %0,%1 \n"
292 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000293 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000294 "1: \n"
295 "movdqu (%0),%%xmm0 \n"
296 "movdqa %%xmm0,%%xmm1 \n"
297 "movdqa %%xmm0,%%xmm2 \n"
298 "pand %%xmm3,%%xmm1 \n"
299 "psllw $0xb,%%xmm2 \n"
300 "pmulhuw %%xmm5,%%xmm1 \n"
301 "pmulhuw %%xmm5,%%xmm2 \n"
302 "psllw $0x8,%%xmm1 \n"
303 "por %%xmm2,%%xmm1 \n"
304 "pand %%xmm4,%%xmm0 \n"
305 "pmulhuw %%xmm6,%%xmm0 \n"
306 "por %%xmm7,%%xmm0 \n"
307 "movdqa %%xmm1,%%xmm2 \n"
308 "punpcklbw %%xmm0,%%xmm1 \n"
309 "punpckhbw %%xmm0,%%xmm2 \n"
310 "movdqa %%xmm1,(%1,%0,2) \n"
311 "movdqa %%xmm2,0x10(%1,%0,2) \n"
312 "lea 0x10(%0),%0 \n"
313 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000314 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000315 : "+r"(src), // %0
316 "+r"(dst), // %1
317 "+r"(pix) // %2
318 :
319 : "memory", "cc", "eax"
320#if defined(__SSE2__)
321 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
322#endif
323 );
324}
325
326void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000327 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000328 "mov $0x1080108,%%eax \n"
329 "movd %%eax,%%xmm5 \n"
330 "pshufd $0x0,%%xmm5,%%xmm5 \n"
331 "mov $0x42004200,%%eax \n"
332 "movd %%eax,%%xmm6 \n"
333 "pshufd $0x0,%%xmm6,%%xmm6 \n"
334 "pcmpeqb %%xmm3,%%xmm3 \n"
335 "psllw $0xb,%%xmm3 \n"
336 "movdqa %%xmm3,%%xmm4 \n"
337 "psrlw $0x6,%%xmm4 \n"
338 "pcmpeqb %%xmm7,%%xmm7 \n"
339 "psllw $0x8,%%xmm7 \n"
340 "sub %0,%1 \n"
341 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000342 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000343 "1: \n"
344 "movdqu (%0),%%xmm0 \n"
345 "movdqa %%xmm0,%%xmm1 \n"
346 "movdqa %%xmm0,%%xmm2 \n"
347 "psllw $0x1,%%xmm1 \n"
348 "psllw $0xb,%%xmm2 \n"
349 "pand %%xmm3,%%xmm1 \n"
350 "pmulhuw %%xmm5,%%xmm2 \n"
351 "pmulhuw %%xmm5,%%xmm1 \n"
352 "psllw $0x8,%%xmm1 \n"
353 "por %%xmm2,%%xmm1 \n"
354 "movdqa %%xmm0,%%xmm2 \n"
355 "pand %%xmm4,%%xmm0 \n"
356 "psraw $0x8,%%xmm2 \n"
357 "pmulhuw %%xmm6,%%xmm0 \n"
358 "pand %%xmm7,%%xmm2 \n"
359 "por %%xmm2,%%xmm0 \n"
360 "movdqa %%xmm1,%%xmm2 \n"
361 "punpcklbw %%xmm0,%%xmm1 \n"
362 "punpckhbw %%xmm0,%%xmm2 \n"
363 "movdqa %%xmm1,(%1,%0,2) \n"
364 "movdqa %%xmm2,0x10(%1,%0,2) \n"
365 "lea 0x10(%0),%0 \n"
366 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000367 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000368 : "+r"(src), // %0
369 "+r"(dst), // %1
370 "+r"(pix) // %2
371 :
372 : "memory", "cc", "eax"
373#if defined(__SSE2__)
374 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
375#endif
376 );
377}
378
379void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000380 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000381 "mov $0xf0f0f0f,%%eax \n"
382 "movd %%eax,%%xmm4 \n"
383 "pshufd $0x0,%%xmm4,%%xmm4 \n"
384 "movdqa %%xmm4,%%xmm5 \n"
385 "pslld $0x4,%%xmm5 \n"
386 "sub %0,%1 \n"
387 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000388 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000389 "1: \n"
390 "movdqu (%0),%%xmm0 \n"
391 "movdqa %%xmm0,%%xmm2 \n"
392 "pand %%xmm4,%%xmm0 \n"
393 "pand %%xmm5,%%xmm2 \n"
394 "movdqa %%xmm0,%%xmm1 \n"
395 "movdqa %%xmm2,%%xmm3 \n"
396 "psllw $0x4,%%xmm1 \n"
397 "psrlw $0x4,%%xmm3 \n"
398 "por %%xmm1,%%xmm0 \n"
399 "por %%xmm3,%%xmm2 \n"
400 "movdqa %%xmm0,%%xmm1 \n"
401 "punpcklbw %%xmm2,%%xmm0 \n"
402 "punpckhbw %%xmm2,%%xmm1 \n"
403 "movdqa %%xmm0,(%1,%0,2) \n"
404 "movdqa %%xmm1,0x10(%1,%0,2) \n"
405 "lea 0x10(%0),%0 \n"
406 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000407 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000408 : "+r"(src), // %0
409 "+r"(dst), // %1
410 "+r"(pix) // %2
411 :
412 : "memory", "cc", "eax"
413#if defined(__SSE2__)
414 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
415#endif
416 );
417}
418
419void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000420 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000421 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000422 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000423 "1: \n"
424 "movdqa (%0),%%xmm0 \n"
425 "movdqa 0x10(%0),%%xmm1 \n"
426 "movdqa 0x20(%0),%%xmm2 \n"
427 "movdqa 0x30(%0),%%xmm3 \n"
428 "lea 0x40(%0),%0 \n"
429 "pshufb %%xmm6,%%xmm0 \n"
430 "pshufb %%xmm6,%%xmm1 \n"
431 "pshufb %%xmm6,%%xmm2 \n"
432 "pshufb %%xmm6,%%xmm3 \n"
433 "movdqa %%xmm1,%%xmm4 \n"
434 "psrldq $0x4,%%xmm1 \n"
435 "pslldq $0xc,%%xmm4 \n"
436 "movdqa %%xmm2,%%xmm5 \n"
437 "por %%xmm4,%%xmm0 \n"
438 "pslldq $0x8,%%xmm5 \n"
439 "movdqa %%xmm0,(%1) \n"
440 "por %%xmm5,%%xmm1 \n"
441 "psrldq $0x8,%%xmm2 \n"
442 "pslldq $0x4,%%xmm3 \n"
443 "por %%xmm3,%%xmm2 \n"
444 "movdqa %%xmm1,0x10(%1) \n"
445 "movdqa %%xmm2,0x20(%1) \n"
446 "lea 0x30(%1),%1 \n"
447 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000448 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000449 : "+r"(src), // %0
450 "+r"(dst), // %1
451 "+r"(pix) // %2
452 : "m"(kShuffleMaskARGBToRGB24) // %3
453 : "memory", "cc"
454#if defined(__SSE2__)
455 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
456#endif
457 );
458}
459
460void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000461 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000462 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000463 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000464 "1: \n"
465 "movdqa (%0),%%xmm0 \n"
466 "movdqa 0x10(%0),%%xmm1 \n"
467 "movdqa 0x20(%0),%%xmm2 \n"
468 "movdqa 0x30(%0),%%xmm3 \n"
469 "lea 0x40(%0),%0 \n"
470 "pshufb %%xmm6,%%xmm0 \n"
471 "pshufb %%xmm6,%%xmm1 \n"
472 "pshufb %%xmm6,%%xmm2 \n"
473 "pshufb %%xmm6,%%xmm3 \n"
474 "movdqa %%xmm1,%%xmm4 \n"
475 "psrldq $0x4,%%xmm1 \n"
476 "pslldq $0xc,%%xmm4 \n"
477 "movdqa %%xmm2,%%xmm5 \n"
478 "por %%xmm4,%%xmm0 \n"
479 "pslldq $0x8,%%xmm5 \n"
480 "movdqa %%xmm0,(%1) \n"
481 "por %%xmm5,%%xmm1 \n"
482 "psrldq $0x8,%%xmm2 \n"
483 "pslldq $0x4,%%xmm3 \n"
484 "por %%xmm3,%%xmm2 \n"
485 "movdqa %%xmm1,0x10(%1) \n"
486 "movdqa %%xmm2,0x20(%1) \n"
487 "lea 0x30(%1),%1 \n"
488 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000489 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000490 : "+r"(src), // %0
491 "+r"(dst), // %1
492 "+r"(pix) // %2
493 : "m"(kShuffleMaskARGBToRAW) // %3
494 : "memory", "cc"
495#if defined(__SSE2__)
496 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
497#endif
498 );
499}
500
fbarchard@google.comdbcabea2012-10-29 21:20:25 +0000501void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000502 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000503 "pcmpeqb %%xmm3,%%xmm3 \n"
504 "psrld $0x1b,%%xmm3 \n"
505 "pcmpeqb %%xmm4,%%xmm4 \n"
506 "psrld $0x1a,%%xmm4 \n"
507 "pslld $0x5,%%xmm4 \n"
508 "pcmpeqb %%xmm5,%%xmm5 \n"
509 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000510 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000511 "1: \n"
512 "movdqa (%0),%%xmm0 \n"
513 "movdqa %%xmm0,%%xmm1 \n"
514 "movdqa %%xmm0,%%xmm2 \n"
515 "pslld $0x8,%%xmm0 \n"
516 "psrld $0x3,%%xmm1 \n"
517 "psrld $0x5,%%xmm2 \n"
518 "psrad $0x10,%%xmm0 \n"
519 "pand %%xmm3,%%xmm1 \n"
520 "pand %%xmm4,%%xmm2 \n"
521 "pand %%xmm5,%%xmm0 \n"
522 "por %%xmm2,%%xmm1 \n"
523 "por %%xmm1,%%xmm0 \n"
524 "packssdw %%xmm0,%%xmm0 \n"
525 "lea 0x10(%0),%0 \n"
526 "movq %%xmm0,(%1) \n"
527 "lea 0x8(%1),%1 \n"
528 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000529 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000530 : "+r"(src), // %0
531 "+r"(dst), // %1
532 "+r"(pix) // %2
533 :
534 : "memory", "cc"
535#if defined(__SSE2__)
536 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
537#endif
538 );
539}
540
541void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000542 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000543 "pcmpeqb %%xmm4,%%xmm4 \n"
544 "psrld $0x1b,%%xmm4 \n"
545 "movdqa %%xmm4,%%xmm5 \n"
546 "pslld $0x5,%%xmm5 \n"
547 "movdqa %%xmm4,%%xmm6 \n"
548 "pslld $0xa,%%xmm6 \n"
549 "pcmpeqb %%xmm7,%%xmm7 \n"
550 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000551 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000552 "1: \n"
553 "movdqa (%0),%%xmm0 \n"
554 "movdqa %%xmm0,%%xmm1 \n"
555 "movdqa %%xmm0,%%xmm2 \n"
556 "movdqa %%xmm0,%%xmm3 \n"
557 "psrad $0x10,%%xmm0 \n"
558 "psrld $0x3,%%xmm1 \n"
559 "psrld $0x6,%%xmm2 \n"
560 "psrld $0x9,%%xmm3 \n"
561 "pand %%xmm7,%%xmm0 \n"
562 "pand %%xmm4,%%xmm1 \n"
563 "pand %%xmm5,%%xmm2 \n"
564 "pand %%xmm6,%%xmm3 \n"
565 "por %%xmm1,%%xmm0 \n"
566 "por %%xmm3,%%xmm2 \n"
567 "por %%xmm2,%%xmm0 \n"
568 "packssdw %%xmm0,%%xmm0 \n"
569 "lea 0x10(%0),%0 \n"
570 "movq %%xmm0,(%1) \n"
571 "lea 0x8(%1),%1 \n"
572 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000573 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000574 : "+r"(src), // %0
575 "+r"(dst), // %1
576 "+r"(pix) // %2
577 :
578 : "memory", "cc"
579#if defined(__SSE2__)
580 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
581#endif
582 );
583}
584
585void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000586 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000587 "pcmpeqb %%xmm4,%%xmm4 \n"
588 "psllw $0xc,%%xmm4 \n"
589 "movdqa %%xmm4,%%xmm3 \n"
590 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000591 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000592 "1: \n"
593 "movdqa (%0),%%xmm0 \n"
594 "movdqa %%xmm0,%%xmm1 \n"
595 "pand %%xmm3,%%xmm0 \n"
596 "pand %%xmm4,%%xmm1 \n"
597 "psrlq $0x4,%%xmm0 \n"
598 "psrlq $0x8,%%xmm1 \n"
599 "por %%xmm1,%%xmm0 \n"
600 "packuswb %%xmm0,%%xmm0 \n"
601 "lea 0x10(%0),%0 \n"
602 "movq %%xmm0,(%1) \n"
603 "lea 0x8(%1),%1 \n"
604 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000605 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000606 : "+r"(src), // %0
607 "+r"(dst), // %1
608 "+r"(pix) // %2
609 :
610 : "memory", "cc"
611#if defined(__SSE2__)
612 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
613#endif
614 );
615}
616
fbarchard@google.comb6149762011-11-07 21:58:52 +0000617void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000618 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000619 "movdqa %4,%%xmm5 \n"
620 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000621 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000622 "1: \n"
623 "movdqa (%0),%%xmm0 \n"
624 "movdqa 0x10(%0),%%xmm1 \n"
625 "movdqa 0x20(%0),%%xmm2 \n"
626 "movdqa 0x30(%0),%%xmm3 \n"
627 "pmaddubsw %%xmm4,%%xmm0 \n"
628 "pmaddubsw %%xmm4,%%xmm1 \n"
629 "pmaddubsw %%xmm4,%%xmm2 \n"
630 "pmaddubsw %%xmm4,%%xmm3 \n"
631 "lea 0x40(%0),%0 \n"
632 "phaddw %%xmm1,%%xmm0 \n"
633 "phaddw %%xmm3,%%xmm2 \n"
634 "psrlw $0x7,%%xmm0 \n"
635 "psrlw $0x7,%%xmm2 \n"
636 "packuswb %%xmm2,%%xmm0 \n"
637 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000638 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000639 "movdqa %%xmm0,(%1) \n"
640 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000641 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000642 : "+r"(src_argb), // %0
643 "+r"(dst_y), // %1
644 "+r"(pix) // %2
645 : "m"(kARGBToY), // %3
646 "m"(kAddY16) // %4
647 : "memory", "cc"
648#if defined(__SSE2__)
649 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
650#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000651 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000652}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000653
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000654void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
655 asm volatile (
656 "movdqa %3,%%xmm4 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000657 "movdqa %4,%%xmm5 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000658 ".p2align 4 \n"
659 "1: \n"
660 "movdqa (%0),%%xmm0 \n"
661 "movdqa 0x10(%0),%%xmm1 \n"
662 "movdqa 0x20(%0),%%xmm2 \n"
663 "movdqa 0x30(%0),%%xmm3 \n"
664 "pmaddubsw %%xmm4,%%xmm0 \n"
665 "pmaddubsw %%xmm4,%%xmm1 \n"
666 "pmaddubsw %%xmm4,%%xmm2 \n"
667 "pmaddubsw %%xmm4,%%xmm3 \n"
668 "lea 0x40(%0),%0 \n"
669 "phaddw %%xmm1,%%xmm0 \n"
670 "phaddw %%xmm3,%%xmm2 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000671 "paddw %%xmm5,%%xmm0 \n"
672 "paddw %%xmm5,%%xmm2 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000673 "psrlw $0x7,%%xmm0 \n"
674 "psrlw $0x7,%%xmm2 \n"
675 "packuswb %%xmm2,%%xmm0 \n"
676 "sub $0x10,%2 \n"
677 "movdqa %%xmm0,(%1) \n"
678 "lea 0x10(%1),%1 \n"
679 "jg 1b \n"
680 : "+r"(src_argb), // %0
681 "+r"(dst_y), // %1
682 "+r"(pix) // %2
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000683 : "m"(kARGBToYJ), // %3
684 "m"(kAddYJ64) // %4
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000685 : "memory", "cc"
686#if defined(__SSE2__)
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000687 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000688#endif
689 );
690}
691
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000692void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000693 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000694 "movdqa %4,%%xmm5 \n"
695 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000696 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000697 "1: \n"
698 "movdqu (%0),%%xmm0 \n"
699 "movdqu 0x10(%0),%%xmm1 \n"
700 "movdqu 0x20(%0),%%xmm2 \n"
701 "movdqu 0x30(%0),%%xmm3 \n"
702 "pmaddubsw %%xmm4,%%xmm0 \n"
703 "pmaddubsw %%xmm4,%%xmm1 \n"
704 "pmaddubsw %%xmm4,%%xmm2 \n"
705 "pmaddubsw %%xmm4,%%xmm3 \n"
706 "lea 0x40(%0),%0 \n"
707 "phaddw %%xmm1,%%xmm0 \n"
708 "phaddw %%xmm3,%%xmm2 \n"
709 "psrlw $0x7,%%xmm0 \n"
710 "psrlw $0x7,%%xmm2 \n"
711 "packuswb %%xmm2,%%xmm0 \n"
712 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000713 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000714 "movdqu %%xmm0,(%1) \n"
715 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000716 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000717 : "+r"(src_argb), // %0
718 "+r"(dst_y), // %1
719 "+r"(pix) // %2
720 : "m"(kARGBToY), // %3
721 "m"(kAddY16) // %4
722 : "memory", "cc"
723#if defined(__SSE2__)
724 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
725#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000726 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000727}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000728
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000729void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
730 asm volatile (
731 "movdqa %3,%%xmm4 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000732 "movdqa %4,%%xmm5 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000733 ".p2align 4 \n"
734 "1: \n"
735 "movdqu (%0),%%xmm0 \n"
736 "movdqu 0x10(%0),%%xmm1 \n"
737 "movdqu 0x20(%0),%%xmm2 \n"
738 "movdqu 0x30(%0),%%xmm3 \n"
739 "pmaddubsw %%xmm4,%%xmm0 \n"
740 "pmaddubsw %%xmm4,%%xmm1 \n"
741 "pmaddubsw %%xmm4,%%xmm2 \n"
742 "pmaddubsw %%xmm4,%%xmm3 \n"
743 "lea 0x40(%0),%0 \n"
744 "phaddw %%xmm1,%%xmm0 \n"
745 "phaddw %%xmm3,%%xmm2 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000746 "paddw %%xmm5,%%xmm0 \n"
747 "paddw %%xmm5,%%xmm2 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000748 "psrlw $0x7,%%xmm0 \n"
749 "psrlw $0x7,%%xmm2 \n"
750 "packuswb %%xmm2,%%xmm0 \n"
751 "sub $0x10,%2 \n"
752 "movdqu %%xmm0,(%1) \n"
753 "lea 0x10(%1),%1 \n"
754 "jg 1b \n"
755 : "+r"(src_argb), // %0
756 "+r"(dst_y), // %1
757 "+r"(pix) // %2
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000758 : "m"(kARGBToYJ), // %3
759 "m"(kAddYJ64) // %4
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000760 : "memory", "cc"
761#if defined(__SSE2__)
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000762 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000763#endif
764 );
765}
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000766
fbarchard@google.com714050a2012-02-17 22:59:56 +0000767// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000768// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
769// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
770// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000771// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000772void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
773 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000774 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000775 "movdqa %0,%%xmm4 \n"
776 "movdqa %1,%%xmm3 \n"
777 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000778 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000779 : "m"(kARGBToU), // %0
780 "m"(kARGBToV), // %1
781 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000782 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000783 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000784 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000785 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000786 "1: \n"
787 "movdqa (%0),%%xmm0 \n"
788 "movdqa 0x10(%0),%%xmm1 \n"
789 "movdqa 0x20(%0),%%xmm2 \n"
790 "movdqa 0x30(%0),%%xmm6 \n"
791 "pavgb (%0,%4,1),%%xmm0 \n"
792 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
793 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
794 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
795 "lea 0x40(%0),%0 \n"
796 "movdqa %%xmm0,%%xmm7 \n"
797 "shufps $0x88,%%xmm1,%%xmm0 \n"
798 "shufps $0xdd,%%xmm1,%%xmm7 \n"
799 "pavgb %%xmm7,%%xmm0 \n"
800 "movdqa %%xmm2,%%xmm7 \n"
801 "shufps $0x88,%%xmm6,%%xmm2 \n"
802 "shufps $0xdd,%%xmm6,%%xmm7 \n"
803 "pavgb %%xmm7,%%xmm2 \n"
804 "movdqa %%xmm0,%%xmm1 \n"
805 "movdqa %%xmm2,%%xmm6 \n"
806 "pmaddubsw %%xmm4,%%xmm0 \n"
807 "pmaddubsw %%xmm4,%%xmm2 \n"
808 "pmaddubsw %%xmm3,%%xmm1 \n"
809 "pmaddubsw %%xmm3,%%xmm6 \n"
810 "phaddw %%xmm2,%%xmm0 \n"
811 "phaddw %%xmm6,%%xmm1 \n"
812 "psraw $0x8,%%xmm0 \n"
813 "psraw $0x8,%%xmm1 \n"
814 "packsswb %%xmm1,%%xmm0 \n"
815 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000816 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000817 "movlps %%xmm0,(%1) \n"
818 "movhps %%xmm0,(%1,%2,1) \n"
819 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000820 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000821 : "+r"(src_argb0), // %0
822 "+r"(dst_u), // %1
823 "+r"(dst_v), // %2
824 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000825 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000826 : "memory", "cc"
827#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000828 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000829#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000830 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000831}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000832
833void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
834 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000835 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000836 "movdqa %0,%%xmm4 \n"
837 "movdqa %1,%%xmm3 \n"
838 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000839 :
840 : "m"(kARGBToU), // %0
841 "m"(kARGBToV), // %1
842 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000843 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000844 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000845 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000846 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000847 "1: \n"
848 "movdqu (%0),%%xmm0 \n"
849 "movdqu 0x10(%0),%%xmm1 \n"
850 "movdqu 0x20(%0),%%xmm2 \n"
851 "movdqu 0x30(%0),%%xmm6 \n"
852 "movdqu (%0,%4,1),%%xmm7 \n"
853 "pavgb %%xmm7,%%xmm0 \n"
854 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
855 "pavgb %%xmm7,%%xmm1 \n"
856 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
857 "pavgb %%xmm7,%%xmm2 \n"
858 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
859 "pavgb %%xmm7,%%xmm6 \n"
860 "lea 0x40(%0),%0 \n"
861 "movdqa %%xmm0,%%xmm7 \n"
862 "shufps $0x88,%%xmm1,%%xmm0 \n"
863 "shufps $0xdd,%%xmm1,%%xmm7 \n"
864 "pavgb %%xmm7,%%xmm0 \n"
865 "movdqa %%xmm2,%%xmm7 \n"
866 "shufps $0x88,%%xmm6,%%xmm2 \n"
867 "shufps $0xdd,%%xmm6,%%xmm7 \n"
868 "pavgb %%xmm7,%%xmm2 \n"
869 "movdqa %%xmm0,%%xmm1 \n"
870 "movdqa %%xmm2,%%xmm6 \n"
871 "pmaddubsw %%xmm4,%%xmm0 \n"
872 "pmaddubsw %%xmm4,%%xmm2 \n"
873 "pmaddubsw %%xmm3,%%xmm1 \n"
874 "pmaddubsw %%xmm3,%%xmm6 \n"
875 "phaddw %%xmm2,%%xmm0 \n"
876 "phaddw %%xmm6,%%xmm1 \n"
877 "psraw $0x8,%%xmm0 \n"
878 "psraw $0x8,%%xmm1 \n"
879 "packsswb %%xmm1,%%xmm0 \n"
880 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000881 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000882 "movlps %%xmm0,(%1) \n"
883 "movhps %%xmm0,(%1,%2,1) \n"
884 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000885 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000886 : "+r"(src_argb0), // %0
887 "+r"(dst_u), // %1
888 "+r"(dst_v), // %2
889 "+rm"(width) // %3
890 : "r"(static_cast<intptr_t>(src_stride_argb))
891 : "memory", "cc"
892#if defined(__SSE2__)
893 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
894#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000895 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000896}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000897
fbarchard@google.com762c0502013-02-04 18:47:21 +0000898void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
899 int width) {
900 asm volatile (
901 "movdqa %0,%%xmm4 \n"
902 "movdqa %1,%%xmm3 \n"
903 "movdqa %2,%%xmm5 \n"
904 :
905 : "m"(kARGBToU), // %0
906 "m"(kARGBToV), // %1
907 "m"(kAddUV128) // %2
908 );
909 asm volatile (
910 "sub %1,%2 \n"
911 ".p2align 4 \n"
912 "1: \n"
913 "movdqa (%0),%%xmm0 \n"
914 "movdqa 0x10(%0),%%xmm1 \n"
915 "movdqa 0x20(%0),%%xmm2 \n"
916 "movdqa 0x30(%0),%%xmm6 \n"
917 "pmaddubsw %%xmm4,%%xmm0 \n"
918 "pmaddubsw %%xmm4,%%xmm1 \n"
919 "pmaddubsw %%xmm4,%%xmm2 \n"
920 "pmaddubsw %%xmm4,%%xmm6 \n"
921 "phaddw %%xmm1,%%xmm0 \n"
922 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000923 "psraw $0x8,%%xmm0 \n"
924 "psraw $0x8,%%xmm2 \n"
925 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000926 "paddb %%xmm5,%%xmm0 \n"
927 "sub $0x10,%3 \n"
928 "movdqa %%xmm0,(%1) \n"
929 "movdqa (%0),%%xmm0 \n"
930 "movdqa 0x10(%0),%%xmm1 \n"
931 "movdqa 0x20(%0),%%xmm2 \n"
932 "movdqa 0x30(%0),%%xmm6 \n"
933 "pmaddubsw %%xmm3,%%xmm0 \n"
934 "pmaddubsw %%xmm3,%%xmm1 \n"
935 "pmaddubsw %%xmm3,%%xmm2 \n"
936 "pmaddubsw %%xmm3,%%xmm6 \n"
937 "phaddw %%xmm1,%%xmm0 \n"
938 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000939 "psraw $0x8,%%xmm0 \n"
940 "psraw $0x8,%%xmm2 \n"
941 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000942 "paddb %%xmm5,%%xmm0 \n"
943 "lea 0x40(%0),%0 \n"
944 "movdqa %%xmm0,(%1,%2,1) \n"
945 "lea 0x10(%1),%1 \n"
946 "jg 1b \n"
947 : "+r"(src_argb), // %0
948 "+r"(dst_u), // %1
949 "+r"(dst_v), // %2
950 "+rm"(width) // %3
951 :
952 : "memory", "cc"
953#if defined(__SSE2__)
954 , "xmm0", "xmm1", "xmm2", "xmm6"
955#endif
956 );
957}
958
959void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
960 uint8* dst_v, int width) {
961 asm volatile (
962 "movdqa %0,%%xmm4 \n"
963 "movdqa %1,%%xmm3 \n"
964 "movdqa %2,%%xmm5 \n"
965 :
966 : "m"(kARGBToU), // %0
967 "m"(kARGBToV), // %1
968 "m"(kAddUV128) // %2
969 );
970 asm volatile (
971 "sub %1,%2 \n"
972 ".p2align 4 \n"
973 "1: \n"
974 "movdqu (%0),%%xmm0 \n"
975 "movdqu 0x10(%0),%%xmm1 \n"
976 "movdqu 0x20(%0),%%xmm2 \n"
977 "movdqu 0x30(%0),%%xmm6 \n"
978 "pmaddubsw %%xmm4,%%xmm0 \n"
979 "pmaddubsw %%xmm4,%%xmm1 \n"
980 "pmaddubsw %%xmm4,%%xmm2 \n"
981 "pmaddubsw %%xmm4,%%xmm6 \n"
982 "phaddw %%xmm1,%%xmm0 \n"
983 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +0000984 "psraw $0x8,%%xmm0 \n"
985 "psraw $0x8,%%xmm2 \n"
986 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +0000987 "paddb %%xmm5,%%xmm0 \n"
988 "sub $0x10,%3 \n"
989 "movdqu %%xmm0,(%1) \n"
990 "movdqu (%0),%%xmm0 \n"
991 "movdqu 0x10(%0),%%xmm1 \n"
992 "movdqu 0x20(%0),%%xmm2 \n"
993 "movdqu 0x30(%0),%%xmm6 \n"
994 "pmaddubsw %%xmm3,%%xmm0 \n"
995 "pmaddubsw %%xmm3,%%xmm1 \n"
996 "pmaddubsw %%xmm3,%%xmm2 \n"
997 "pmaddubsw %%xmm3,%%xmm6 \n"
998 "phaddw %%xmm1,%%xmm0 \n"
999 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +00001000 "psraw $0x8,%%xmm0 \n"
1001 "psraw $0x8,%%xmm2 \n"
1002 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +00001003 "paddb %%xmm5,%%xmm0 \n"
1004 "lea 0x40(%0),%0 \n"
1005 "movdqu %%xmm0,(%1,%2,1) \n"
1006 "lea 0x10(%1),%1 \n"
1007 "jg 1b \n"
1008 : "+r"(src_argb), // %0
1009 "+r"(dst_u), // %1
1010 "+r"(dst_v), // %2
1011 "+rm"(width) // %3
1012 :
1013 : "memory", "cc"
1014#if defined(__SSE2__)
1015 , "xmm0", "xmm1", "xmm2", "xmm6"
1016#endif
1017 );
1018}
1019
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001020void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1021 uint8* dst_u, uint8* dst_v, int width) {
1022 asm volatile (
1023 "movdqa %0,%%xmm4 \n"
1024 "movdqa %1,%%xmm3 \n"
1025 "movdqa %2,%%xmm5 \n"
1026 :
1027 : "m"(kARGBToU), // %0
1028 "m"(kARGBToV), // %1
1029 "m"(kAddUV128) // %2
1030 );
1031 asm volatile (
1032 "sub %1,%2 \n"
1033 ".p2align 4 \n"
1034 "1: \n"
1035 "movdqa (%0),%%xmm0 \n"
1036 "movdqa 0x10(%0),%%xmm1 \n"
1037 "movdqa 0x20(%0),%%xmm2 \n"
1038 "movdqa 0x30(%0),%%xmm6 \n"
1039 "lea 0x40(%0),%0 \n"
1040 "movdqa %%xmm0,%%xmm7 \n"
1041 "shufps $0x88,%%xmm1,%%xmm0 \n"
1042 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1043 "pavgb %%xmm7,%%xmm0 \n"
1044 "movdqa %%xmm2,%%xmm7 \n"
1045 "shufps $0x88,%%xmm6,%%xmm2 \n"
1046 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1047 "pavgb %%xmm7,%%xmm2 \n"
1048 "movdqa %%xmm0,%%xmm1 \n"
1049 "movdqa %%xmm2,%%xmm6 \n"
1050 "pmaddubsw %%xmm4,%%xmm0 \n"
1051 "pmaddubsw %%xmm4,%%xmm2 \n"
1052 "pmaddubsw %%xmm3,%%xmm1 \n"
1053 "pmaddubsw %%xmm3,%%xmm6 \n"
1054 "phaddw %%xmm2,%%xmm0 \n"
1055 "phaddw %%xmm6,%%xmm1 \n"
1056 "psraw $0x8,%%xmm0 \n"
1057 "psraw $0x8,%%xmm1 \n"
1058 "packsswb %%xmm1,%%xmm0 \n"
1059 "paddb %%xmm5,%%xmm0 \n"
1060 "sub $0x10,%3 \n"
1061 "movlps %%xmm0,(%1) \n"
1062 "movhps %%xmm0,(%1,%2,1) \n"
1063 "lea 0x8(%1),%1 \n"
1064 "jg 1b \n"
1065 : "+r"(src_argb0), // %0
1066 "+r"(dst_u), // %1
1067 "+r"(dst_v), // %2
1068 "+rm"(width) // %3
1069 :
1070 : "memory", "cc"
1071#if defined(__SSE2__)
1072 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1073#endif
1074 );
1075}
1076
1077void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
1078 uint8* dst_u, uint8* dst_v, int width) {
1079 asm volatile (
1080 "movdqa %0,%%xmm4 \n"
1081 "movdqa %1,%%xmm3 \n"
1082 "movdqa %2,%%xmm5 \n"
1083 :
1084 : "m"(kARGBToU), // %0
1085 "m"(kARGBToV), // %1
1086 "m"(kAddUV128) // %2
1087 );
1088 asm volatile (
1089 "sub %1,%2 \n"
1090 ".p2align 4 \n"
1091 "1: \n"
1092 "movdqu (%0),%%xmm0 \n"
1093 "movdqu 0x10(%0),%%xmm1 \n"
1094 "movdqu 0x20(%0),%%xmm2 \n"
1095 "movdqu 0x30(%0),%%xmm6 \n"
1096 "lea 0x40(%0),%0 \n"
1097 "movdqa %%xmm0,%%xmm7 \n"
1098 "shufps $0x88,%%xmm1,%%xmm0 \n"
1099 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1100 "pavgb %%xmm7,%%xmm0 \n"
1101 "movdqa %%xmm2,%%xmm7 \n"
1102 "shufps $0x88,%%xmm6,%%xmm2 \n"
1103 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1104 "pavgb %%xmm7,%%xmm2 \n"
1105 "movdqa %%xmm0,%%xmm1 \n"
1106 "movdqa %%xmm2,%%xmm6 \n"
1107 "pmaddubsw %%xmm4,%%xmm0 \n"
1108 "pmaddubsw %%xmm4,%%xmm2 \n"
1109 "pmaddubsw %%xmm3,%%xmm1 \n"
1110 "pmaddubsw %%xmm3,%%xmm6 \n"
1111 "phaddw %%xmm2,%%xmm0 \n"
1112 "phaddw %%xmm6,%%xmm1 \n"
1113 "psraw $0x8,%%xmm0 \n"
1114 "psraw $0x8,%%xmm1 \n"
1115 "packsswb %%xmm1,%%xmm0 \n"
1116 "paddb %%xmm5,%%xmm0 \n"
1117 "sub $0x10,%3 \n"
1118 "movlps %%xmm0,(%1) \n"
1119 "movhps %%xmm0,(%1,%2,1) \n"
1120 "lea 0x8(%1),%1 \n"
1121 "jg 1b \n"
1122 : "+r"(src_argb0), // %0
1123 "+r"(dst_u), // %1
1124 "+r"(dst_v), // %2
1125 "+rm"(width) // %3
1126 :
1127 : "memory", "cc"
1128#if defined(__SSE2__)
1129 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1130#endif
1131 );
1132}
1133
fbarchard@google.com714050a2012-02-17 22:59:56 +00001134void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001135 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001136 "movdqa %4,%%xmm5 \n"
1137 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001138 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001139 "1: \n"
1140 "movdqa (%0),%%xmm0 \n"
1141 "movdqa 0x10(%0),%%xmm1 \n"
1142 "movdqa 0x20(%0),%%xmm2 \n"
1143 "movdqa 0x30(%0),%%xmm3 \n"
1144 "pmaddubsw %%xmm4,%%xmm0 \n"
1145 "pmaddubsw %%xmm4,%%xmm1 \n"
1146 "pmaddubsw %%xmm4,%%xmm2 \n"
1147 "pmaddubsw %%xmm4,%%xmm3 \n"
1148 "lea 0x40(%0),%0 \n"
1149 "phaddw %%xmm1,%%xmm0 \n"
1150 "phaddw %%xmm3,%%xmm2 \n"
1151 "psrlw $0x7,%%xmm0 \n"
1152 "psrlw $0x7,%%xmm2 \n"
1153 "packuswb %%xmm2,%%xmm0 \n"
1154 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001155 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001156 "movdqa %%xmm0,(%1) \n"
1157 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001158 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001159 : "+r"(src_bgra), // %0
1160 "+r"(dst_y), // %1
1161 "+r"(pix) // %2
1162 : "m"(kBGRAToY), // %3
1163 "m"(kAddY16) // %4
1164 : "memory", "cc"
1165#if defined(__SSE2__)
1166 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001167#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +00001168 );
1169}
1170
1171void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001172 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001173 "movdqa %4,%%xmm5 \n"
1174 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001175 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001176 "1: \n"
1177 "movdqu (%0),%%xmm0 \n"
1178 "movdqu 0x10(%0),%%xmm1 \n"
1179 "movdqu 0x20(%0),%%xmm2 \n"
1180 "movdqu 0x30(%0),%%xmm3 \n"
1181 "pmaddubsw %%xmm4,%%xmm0 \n"
1182 "pmaddubsw %%xmm4,%%xmm1 \n"
1183 "pmaddubsw %%xmm4,%%xmm2 \n"
1184 "pmaddubsw %%xmm4,%%xmm3 \n"
1185 "lea 0x40(%0),%0 \n"
1186 "phaddw %%xmm1,%%xmm0 \n"
1187 "phaddw %%xmm3,%%xmm2 \n"
1188 "psrlw $0x7,%%xmm0 \n"
1189 "psrlw $0x7,%%xmm2 \n"
1190 "packuswb %%xmm2,%%xmm0 \n"
1191 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001192 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001193 "movdqu %%xmm0,(%1) \n"
1194 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001195 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001196 : "+r"(src_bgra), // %0
1197 "+r"(dst_y), // %1
1198 "+r"(pix) // %2
1199 : "m"(kBGRAToY), // %3
1200 "m"(kAddY16) // %4
1201 : "memory", "cc"
1202#if defined(__SSE2__)
1203 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1204#endif
1205 );
1206}
1207
1208void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1209 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001210 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001211 "movdqa %0,%%xmm4 \n"
1212 "movdqa %1,%%xmm3 \n"
1213 "movdqa %2,%%xmm5 \n"
1214 :
1215 : "m"(kBGRAToU), // %0
1216 "m"(kBGRAToV), // %1
1217 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001218 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001219 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001220 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001221 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001222 "1: \n"
1223 "movdqa (%0),%%xmm0 \n"
1224 "movdqa 0x10(%0),%%xmm1 \n"
1225 "movdqa 0x20(%0),%%xmm2 \n"
1226 "movdqa 0x30(%0),%%xmm6 \n"
1227 "pavgb (%0,%4,1),%%xmm0 \n"
1228 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1229 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1230 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1231 "lea 0x40(%0),%0 \n"
1232 "movdqa %%xmm0,%%xmm7 \n"
1233 "shufps $0x88,%%xmm1,%%xmm0 \n"
1234 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1235 "pavgb %%xmm7,%%xmm0 \n"
1236 "movdqa %%xmm2,%%xmm7 \n"
1237 "shufps $0x88,%%xmm6,%%xmm2 \n"
1238 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1239 "pavgb %%xmm7,%%xmm2 \n"
1240 "movdqa %%xmm0,%%xmm1 \n"
1241 "movdqa %%xmm2,%%xmm6 \n"
1242 "pmaddubsw %%xmm4,%%xmm0 \n"
1243 "pmaddubsw %%xmm4,%%xmm2 \n"
1244 "pmaddubsw %%xmm3,%%xmm1 \n"
1245 "pmaddubsw %%xmm3,%%xmm6 \n"
1246 "phaddw %%xmm2,%%xmm0 \n"
1247 "phaddw %%xmm6,%%xmm1 \n"
1248 "psraw $0x8,%%xmm0 \n"
1249 "psraw $0x8,%%xmm1 \n"
1250 "packsswb %%xmm1,%%xmm0 \n"
1251 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001252 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001253 "movlps %%xmm0,(%1) \n"
1254 "movhps %%xmm0,(%1,%2,1) \n"
1255 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001256 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001257 : "+r"(src_bgra0), // %0
1258 "+r"(dst_u), // %1
1259 "+r"(dst_v), // %2
1260 "+rm"(width) // %3
1261 : "r"(static_cast<intptr_t>(src_stride_bgra))
1262 : "memory", "cc"
1263#if defined(__SSE2__)
1264 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1265#endif
1266 );
1267}
1268
1269void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1270 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001271 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001272 "movdqa %0,%%xmm4 \n"
1273 "movdqa %1,%%xmm3 \n"
1274 "movdqa %2,%%xmm5 \n"
1275 :
1276 : "m"(kBGRAToU), // %0
1277 "m"(kBGRAToV), // %1
1278 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001279 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001280 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001281 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001282 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001283 "1: \n"
1284 "movdqu (%0),%%xmm0 \n"
1285 "movdqu 0x10(%0),%%xmm1 \n"
1286 "movdqu 0x20(%0),%%xmm2 \n"
1287 "movdqu 0x30(%0),%%xmm6 \n"
1288 "movdqu (%0,%4,1),%%xmm7 \n"
1289 "pavgb %%xmm7,%%xmm0 \n"
1290 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1291 "pavgb %%xmm7,%%xmm1 \n"
1292 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1293 "pavgb %%xmm7,%%xmm2 \n"
1294 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1295 "pavgb %%xmm7,%%xmm6 \n"
1296 "lea 0x40(%0),%0 \n"
1297 "movdqa %%xmm0,%%xmm7 \n"
1298 "shufps $0x88,%%xmm1,%%xmm0 \n"
1299 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1300 "pavgb %%xmm7,%%xmm0 \n"
1301 "movdqa %%xmm2,%%xmm7 \n"
1302 "shufps $0x88,%%xmm6,%%xmm2 \n"
1303 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1304 "pavgb %%xmm7,%%xmm2 \n"
1305 "movdqa %%xmm0,%%xmm1 \n"
1306 "movdqa %%xmm2,%%xmm6 \n"
1307 "pmaddubsw %%xmm4,%%xmm0 \n"
1308 "pmaddubsw %%xmm4,%%xmm2 \n"
1309 "pmaddubsw %%xmm3,%%xmm1 \n"
1310 "pmaddubsw %%xmm3,%%xmm6 \n"
1311 "phaddw %%xmm2,%%xmm0 \n"
1312 "phaddw %%xmm6,%%xmm1 \n"
1313 "psraw $0x8,%%xmm0 \n"
1314 "psraw $0x8,%%xmm1 \n"
1315 "packsswb %%xmm1,%%xmm0 \n"
1316 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001317 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001318 "movlps %%xmm0,(%1) \n"
1319 "movhps %%xmm0,(%1,%2,1) \n"
1320 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001321 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001322 : "+r"(src_bgra0), // %0
1323 "+r"(dst_u), // %1
1324 "+r"(dst_v), // %2
1325 "+rm"(width) // %3
1326 : "r"(static_cast<intptr_t>(src_stride_bgra))
1327 : "memory", "cc"
1328#if defined(__SSE2__)
1329 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1330#endif
1331 );
1332}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001333
1334void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001335 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001336 "movdqa %4,%%xmm5 \n"
1337 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001338 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001339 "1: \n"
1340 "movdqa (%0),%%xmm0 \n"
1341 "movdqa 0x10(%0),%%xmm1 \n"
1342 "movdqa 0x20(%0),%%xmm2 \n"
1343 "movdqa 0x30(%0),%%xmm3 \n"
1344 "pmaddubsw %%xmm4,%%xmm0 \n"
1345 "pmaddubsw %%xmm4,%%xmm1 \n"
1346 "pmaddubsw %%xmm4,%%xmm2 \n"
1347 "pmaddubsw %%xmm4,%%xmm3 \n"
1348 "lea 0x40(%0),%0 \n"
1349 "phaddw %%xmm1,%%xmm0 \n"
1350 "phaddw %%xmm3,%%xmm2 \n"
1351 "psrlw $0x7,%%xmm0 \n"
1352 "psrlw $0x7,%%xmm2 \n"
1353 "packuswb %%xmm2,%%xmm0 \n"
1354 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001355 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001356 "movdqa %%xmm0,(%1) \n"
1357 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001358 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001359 : "+r"(src_abgr), // %0
1360 "+r"(dst_y), // %1
1361 "+r"(pix) // %2
1362 : "m"(kABGRToY), // %3
1363 "m"(kAddY16) // %4
1364 : "memory", "cc"
1365#if defined(__SSE2__)
1366 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1367#endif
1368 );
1369}
1370
1371void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001372 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001373 "movdqa %4,%%xmm5 \n"
1374 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001375 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001376 "1: \n"
1377 "movdqu (%0),%%xmm0 \n"
1378 "movdqu 0x10(%0),%%xmm1 \n"
1379 "movdqu 0x20(%0),%%xmm2 \n"
1380 "movdqu 0x30(%0),%%xmm3 \n"
1381 "pmaddubsw %%xmm4,%%xmm0 \n"
1382 "pmaddubsw %%xmm4,%%xmm1 \n"
1383 "pmaddubsw %%xmm4,%%xmm2 \n"
1384 "pmaddubsw %%xmm4,%%xmm3 \n"
1385 "lea 0x40(%0),%0 \n"
1386 "phaddw %%xmm1,%%xmm0 \n"
1387 "phaddw %%xmm3,%%xmm2 \n"
1388 "psrlw $0x7,%%xmm0 \n"
1389 "psrlw $0x7,%%xmm2 \n"
1390 "packuswb %%xmm2,%%xmm0 \n"
1391 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001392 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001393 "movdqu %%xmm0,(%1) \n"
1394 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001395 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001396 : "+r"(src_abgr), // %0
1397 "+r"(dst_y), // %1
1398 "+r"(pix) // %2
1399 : "m"(kABGRToY), // %3
1400 "m"(kAddY16) // %4
1401 : "memory", "cc"
1402#if defined(__SSE2__)
1403 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1404#endif
1405 );
1406}
1407
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001408void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1409 asm volatile (
1410 "movdqa %4,%%xmm5 \n"
1411 "movdqa %3,%%xmm4 \n"
1412 ".p2align 4 \n"
1413 "1: \n"
1414 "movdqa (%0),%%xmm0 \n"
1415 "movdqa 0x10(%0),%%xmm1 \n"
1416 "movdqa 0x20(%0),%%xmm2 \n"
1417 "movdqa 0x30(%0),%%xmm3 \n"
1418 "pmaddubsw %%xmm4,%%xmm0 \n"
1419 "pmaddubsw %%xmm4,%%xmm1 \n"
1420 "pmaddubsw %%xmm4,%%xmm2 \n"
1421 "pmaddubsw %%xmm4,%%xmm3 \n"
1422 "lea 0x40(%0),%0 \n"
1423 "phaddw %%xmm1,%%xmm0 \n"
1424 "phaddw %%xmm3,%%xmm2 \n"
1425 "psrlw $0x7,%%xmm0 \n"
1426 "psrlw $0x7,%%xmm2 \n"
1427 "packuswb %%xmm2,%%xmm0 \n"
1428 "paddb %%xmm5,%%xmm0 \n"
1429 "sub $0x10,%2 \n"
1430 "movdqa %%xmm0,(%1) \n"
1431 "lea 0x10(%1),%1 \n"
1432 "jg 1b \n"
1433 : "+r"(src_rgba), // %0
1434 "+r"(dst_y), // %1
1435 "+r"(pix) // %2
1436 : "m"(kRGBAToY), // %3
1437 "m"(kAddY16) // %4
1438 : "memory", "cc"
1439#if defined(__SSE2__)
1440 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1441#endif
1442 );
1443}
1444
1445void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1446 asm volatile (
1447 "movdqa %4,%%xmm5 \n"
1448 "movdqa %3,%%xmm4 \n"
1449 ".p2align 4 \n"
1450 "1: \n"
1451 "movdqu (%0),%%xmm0 \n"
1452 "movdqu 0x10(%0),%%xmm1 \n"
1453 "movdqu 0x20(%0),%%xmm2 \n"
1454 "movdqu 0x30(%0),%%xmm3 \n"
1455 "pmaddubsw %%xmm4,%%xmm0 \n"
1456 "pmaddubsw %%xmm4,%%xmm1 \n"
1457 "pmaddubsw %%xmm4,%%xmm2 \n"
1458 "pmaddubsw %%xmm4,%%xmm3 \n"
1459 "lea 0x40(%0),%0 \n"
1460 "phaddw %%xmm1,%%xmm0 \n"
1461 "phaddw %%xmm3,%%xmm2 \n"
1462 "psrlw $0x7,%%xmm0 \n"
1463 "psrlw $0x7,%%xmm2 \n"
1464 "packuswb %%xmm2,%%xmm0 \n"
1465 "paddb %%xmm5,%%xmm0 \n"
1466 "sub $0x10,%2 \n"
1467 "movdqu %%xmm0,(%1) \n"
1468 "lea 0x10(%1),%1 \n"
1469 "jg 1b \n"
1470 : "+r"(src_rgba), // %0
1471 "+r"(dst_y), // %1
1472 "+r"(pix) // %2
1473 : "m"(kRGBAToY), // %3
1474 "m"(kAddY16) // %4
1475 : "memory", "cc"
1476#if defined(__SSE2__)
1477 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1478#endif
1479 );
1480}
1481
fbarchard@google.com714050a2012-02-17 22:59:56 +00001482void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1483 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001484 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001485 "movdqa %0,%%xmm4 \n"
1486 "movdqa %1,%%xmm3 \n"
1487 "movdqa %2,%%xmm5 \n"
1488 :
1489 : "m"(kABGRToU), // %0
1490 "m"(kABGRToV), // %1
1491 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001492 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001493 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001494 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001495 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001496 "1: \n"
1497 "movdqa (%0),%%xmm0 \n"
1498 "movdqa 0x10(%0),%%xmm1 \n"
1499 "movdqa 0x20(%0),%%xmm2 \n"
1500 "movdqa 0x30(%0),%%xmm6 \n"
1501 "pavgb (%0,%4,1),%%xmm0 \n"
1502 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1503 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1504 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1505 "lea 0x40(%0),%0 \n"
1506 "movdqa %%xmm0,%%xmm7 \n"
1507 "shufps $0x88,%%xmm1,%%xmm0 \n"
1508 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1509 "pavgb %%xmm7,%%xmm0 \n"
1510 "movdqa %%xmm2,%%xmm7 \n"
1511 "shufps $0x88,%%xmm6,%%xmm2 \n"
1512 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1513 "pavgb %%xmm7,%%xmm2 \n"
1514 "movdqa %%xmm0,%%xmm1 \n"
1515 "movdqa %%xmm2,%%xmm6 \n"
1516 "pmaddubsw %%xmm4,%%xmm0 \n"
1517 "pmaddubsw %%xmm4,%%xmm2 \n"
1518 "pmaddubsw %%xmm3,%%xmm1 \n"
1519 "pmaddubsw %%xmm3,%%xmm6 \n"
1520 "phaddw %%xmm2,%%xmm0 \n"
1521 "phaddw %%xmm6,%%xmm1 \n"
1522 "psraw $0x8,%%xmm0 \n"
1523 "psraw $0x8,%%xmm1 \n"
1524 "packsswb %%xmm1,%%xmm0 \n"
1525 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001526 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001527 "movlps %%xmm0,(%1) \n"
1528 "movhps %%xmm0,(%1,%2,1) \n"
1529 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001530 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001531 : "+r"(src_abgr0), // %0
1532 "+r"(dst_u), // %1
1533 "+r"(dst_v), // %2
1534 "+rm"(width) // %3
1535 : "r"(static_cast<intptr_t>(src_stride_abgr))
1536 : "memory", "cc"
1537#if defined(__SSE2__)
1538 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1539#endif
1540 );
1541}
1542
1543void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1544 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001545 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001546 "movdqa %0,%%xmm4 \n"
1547 "movdqa %1,%%xmm3 \n"
1548 "movdqa %2,%%xmm5 \n"
1549 :
1550 : "m"(kABGRToU), // %0
1551 "m"(kABGRToV), // %1
1552 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001553 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001554 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001555 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001556 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001557 "1: \n"
1558 "movdqu (%0),%%xmm0 \n"
1559 "movdqu 0x10(%0),%%xmm1 \n"
1560 "movdqu 0x20(%0),%%xmm2 \n"
1561 "movdqu 0x30(%0),%%xmm6 \n"
1562 "movdqu (%0,%4,1),%%xmm7 \n"
1563 "pavgb %%xmm7,%%xmm0 \n"
1564 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1565 "pavgb %%xmm7,%%xmm1 \n"
1566 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1567 "pavgb %%xmm7,%%xmm2 \n"
1568 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1569 "pavgb %%xmm7,%%xmm6 \n"
1570 "lea 0x40(%0),%0 \n"
1571 "movdqa %%xmm0,%%xmm7 \n"
1572 "shufps $0x88,%%xmm1,%%xmm0 \n"
1573 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1574 "pavgb %%xmm7,%%xmm0 \n"
1575 "movdqa %%xmm2,%%xmm7 \n"
1576 "shufps $0x88,%%xmm6,%%xmm2 \n"
1577 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1578 "pavgb %%xmm7,%%xmm2 \n"
1579 "movdqa %%xmm0,%%xmm1 \n"
1580 "movdqa %%xmm2,%%xmm6 \n"
1581 "pmaddubsw %%xmm4,%%xmm0 \n"
1582 "pmaddubsw %%xmm4,%%xmm2 \n"
1583 "pmaddubsw %%xmm3,%%xmm1 \n"
1584 "pmaddubsw %%xmm3,%%xmm6 \n"
1585 "phaddw %%xmm2,%%xmm0 \n"
1586 "phaddw %%xmm6,%%xmm1 \n"
1587 "psraw $0x8,%%xmm0 \n"
1588 "psraw $0x8,%%xmm1 \n"
1589 "packsswb %%xmm1,%%xmm0 \n"
1590 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001591 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001592 "movlps %%xmm0,(%1) \n"
1593 "movhps %%xmm0,(%1,%2,1) \n"
1594 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001595 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001596 : "+r"(src_abgr0), // %0
1597 "+r"(dst_u), // %1
1598 "+r"(dst_v), // %2
1599 "+rm"(width) // %3
1600 : "r"(static_cast<intptr_t>(src_stride_abgr))
1601 : "memory", "cc"
1602#if defined(__SSE2__)
1603 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1604#endif
1605 );
1606}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001607
1608void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1609 uint8* dst_u, uint8* dst_v, int width) {
1610 asm volatile (
1611 "movdqa %0,%%xmm4 \n"
1612 "movdqa %1,%%xmm3 \n"
1613 "movdqa %2,%%xmm5 \n"
1614 :
1615 : "m"(kRGBAToU), // %0
1616 "m"(kRGBAToV), // %1
1617 "m"(kAddUV128) // %2
1618 );
1619 asm volatile (
1620 "sub %1,%2 \n"
1621 ".p2align 4 \n"
1622 "1: \n"
1623 "movdqa (%0),%%xmm0 \n"
1624 "movdqa 0x10(%0),%%xmm1 \n"
1625 "movdqa 0x20(%0),%%xmm2 \n"
1626 "movdqa 0x30(%0),%%xmm6 \n"
1627 "pavgb (%0,%4,1),%%xmm0 \n"
1628 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1629 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1630 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1631 "lea 0x40(%0),%0 \n"
1632 "movdqa %%xmm0,%%xmm7 \n"
1633 "shufps $0x88,%%xmm1,%%xmm0 \n"
1634 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1635 "pavgb %%xmm7,%%xmm0 \n"
1636 "movdqa %%xmm2,%%xmm7 \n"
1637 "shufps $0x88,%%xmm6,%%xmm2 \n"
1638 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1639 "pavgb %%xmm7,%%xmm2 \n"
1640 "movdqa %%xmm0,%%xmm1 \n"
1641 "movdqa %%xmm2,%%xmm6 \n"
1642 "pmaddubsw %%xmm4,%%xmm0 \n"
1643 "pmaddubsw %%xmm4,%%xmm2 \n"
1644 "pmaddubsw %%xmm3,%%xmm1 \n"
1645 "pmaddubsw %%xmm3,%%xmm6 \n"
1646 "phaddw %%xmm2,%%xmm0 \n"
1647 "phaddw %%xmm6,%%xmm1 \n"
1648 "psraw $0x8,%%xmm0 \n"
1649 "psraw $0x8,%%xmm1 \n"
1650 "packsswb %%xmm1,%%xmm0 \n"
1651 "paddb %%xmm5,%%xmm0 \n"
1652 "sub $0x10,%3 \n"
1653 "movlps %%xmm0,(%1) \n"
1654 "movhps %%xmm0,(%1,%2,1) \n"
1655 "lea 0x8(%1),%1 \n"
1656 "jg 1b \n"
1657 : "+r"(src_rgba0), // %0
1658 "+r"(dst_u), // %1
1659 "+r"(dst_v), // %2
1660 "+rm"(width) // %3
1661 : "r"(static_cast<intptr_t>(src_stride_rgba))
1662 : "memory", "cc"
1663#if defined(__SSE2__)
1664 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1665#endif
1666 );
1667}
1668
1669void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1670 uint8* dst_u, uint8* dst_v, int width) {
1671 asm volatile (
1672 "movdqa %0,%%xmm4 \n"
1673 "movdqa %1,%%xmm3 \n"
1674 "movdqa %2,%%xmm5 \n"
1675 :
1676 : "m"(kRGBAToU), // %0
1677 "m"(kRGBAToV), // %1
1678 "m"(kAddUV128) // %2
1679 );
1680 asm volatile (
1681 "sub %1,%2 \n"
1682 ".p2align 4 \n"
1683 "1: \n"
1684 "movdqu (%0),%%xmm0 \n"
1685 "movdqu 0x10(%0),%%xmm1 \n"
1686 "movdqu 0x20(%0),%%xmm2 \n"
1687 "movdqu 0x30(%0),%%xmm6 \n"
1688 "movdqu (%0,%4,1),%%xmm7 \n"
1689 "pavgb %%xmm7,%%xmm0 \n"
1690 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1691 "pavgb %%xmm7,%%xmm1 \n"
1692 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1693 "pavgb %%xmm7,%%xmm2 \n"
1694 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1695 "pavgb %%xmm7,%%xmm6 \n"
1696 "lea 0x40(%0),%0 \n"
1697 "movdqa %%xmm0,%%xmm7 \n"
1698 "shufps $0x88,%%xmm1,%%xmm0 \n"
1699 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1700 "pavgb %%xmm7,%%xmm0 \n"
1701 "movdqa %%xmm2,%%xmm7 \n"
1702 "shufps $0x88,%%xmm6,%%xmm2 \n"
1703 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1704 "pavgb %%xmm7,%%xmm2 \n"
1705 "movdqa %%xmm0,%%xmm1 \n"
1706 "movdqa %%xmm2,%%xmm6 \n"
1707 "pmaddubsw %%xmm4,%%xmm0 \n"
1708 "pmaddubsw %%xmm4,%%xmm2 \n"
1709 "pmaddubsw %%xmm3,%%xmm1 \n"
1710 "pmaddubsw %%xmm3,%%xmm6 \n"
1711 "phaddw %%xmm2,%%xmm0 \n"
1712 "phaddw %%xmm6,%%xmm1 \n"
1713 "psraw $0x8,%%xmm0 \n"
1714 "psraw $0x8,%%xmm1 \n"
1715 "packsswb %%xmm1,%%xmm0 \n"
1716 "paddb %%xmm5,%%xmm0 \n"
1717 "sub $0x10,%3 \n"
1718 "movlps %%xmm0,(%1) \n"
1719 "movhps %%xmm0,(%1,%2,1) \n"
1720 "lea 0x8(%1),%1 \n"
1721 "jg 1b \n"
1722 : "+r"(src_rgba0), // %0
1723 "+r"(dst_u), // %1
1724 "+r"(dst_v), // %2
1725 "+rm"(width) // %3
1726 : "r"(static_cast<intptr_t>(src_stride_rgba))
1727 : "memory", "cc"
1728#if defined(__SSE2__)
1729 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1730#endif
1731 );
1732}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001733#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001734
fbarchard@google.come214fe32012-06-04 23:47:11 +00001735#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients, scaled by 64.  Negative values and compound
// expressions are parenthesized so the macros expand safely inside larger
// expressions (e.g. `-BG` or `2 * BG`).
#define UB 127 /* 2.018 * 64 = 129 does not fit in int8, so clamp to 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001750
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001751struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001752 vec8 kUVToB; // 0
1753 vec8 kUVToG; // 16
1754 vec8 kUVToR; // 32
1755 vec16 kUVBiasB; // 48
1756 vec16 kUVBiasG; // 64
1757 vec16 kUVBiasR; // 80
1758 vec16 kYSub16; // 96
1759 vec16 kYToRgb; // 112
1760 vec8 kVUToB; // 128
1761 vec8 kVUToG; // 144
1762 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001763} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001764 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1765 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1766 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1767 { BB, BB, BB, BB, BB, BB, BB, BB },
1768 { BG, BG, BG, BG, BG, BG, BG, BG },
1769 { BR, BR, BR, BR, BR, BR, BR, BR },
1770 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001771 { YG, YG, YG, YG, YG, YG, YG, YG },
1772 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1773 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1774 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001775};
1776
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001777
// Read 8 U and 8 V from 444 and interleave them into 8 UV pairs in xmm0.
// %[v_buf] must already hold (v_buf - u_buf): V is addressed relative to U.
// Advances %[u_buf] by 8; clobbers xmm1.
#define READYUV444 \
    "movq (%[u_buf]),%%xmm0 \n" \
    "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x8(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 U and 4 V from 422, interleave, then duplicate each UV pair so
// xmm0 holds 8 UV pairs (one per output pixel).
// %[v_buf] must already hold (v_buf - u_buf): V is addressed relative to U.
// Advances %[u_buf] by 4; clobbers xmm1.
#define READYUV422 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x4(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Read 2 U and 2 V from 411, interleave, then duplicate each UV pair four
// times so xmm0 holds 8 UV pairs (one per output pixel).
// %[v_buf] must already hold (v_buf - u_buf): V is addressed relative to U.
// Advances %[u_buf] by 2; clobbers xmm1.
// NOTE(review): movd loads 4 bytes from each plane although only 2 are
// consumed per iteration; presumably callers guarantee the extra bytes are
// readable -- confirm.
#define READYUV411 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x2(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n"

// Read 4 already-interleaved chroma pairs from an NV12/NV21 chroma plane and
// duplicate each pair so xmm0 holds 8 pairs (one per output pixel).
// Advances %[uv_buf] by 8.
#define READNV12 \
    "movq (%[uv_buf]),%%xmm0 \n" \
    "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Convert 8 pixels: 8 UV and 8 Y.
// In:   xmm0 = 8 interleaved U,V byte pairs, xmm4 = zero,
//       %[y_buf] = luma pointer, %[kYuvConstants] = constant table.
// Out:  xmm0 = B, xmm1 = G, xmm2 = R as unsigned-saturated bytes.
// Also: advances %[y_buf] by 8 and clobbers xmm3.
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

// Convert 8 pixels: 8 VU and 8 Y.
// Same as YUVTORGB except that the chroma multiply uses the V,U-ordered
// coefficient rows at displacements 128/144/160.
// In:   xmm0 = 8 interleaved V,U byte pairs, xmm4 = zero,
//       %[y_buf] = luma pointer, %[kYuvConstants] = constant table.
// Out:  xmm0 = B, xmm1 = G, xmm2 = R as unsigned-saturated bytes.
// Also: advances %[y_buf] by 8 and clobbers xmm3.
#define YVUTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

1858void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001859 const uint8* u_buf,
1860 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001861 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001862 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001863 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001864 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001865 "pcmpeqb %%xmm5,%%xmm5 \n"
1866 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001867 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001868 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001869 READYUV444
1870 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001871 "punpcklbw %%xmm1,%%xmm0 \n"
1872 "punpcklbw %%xmm5,%%xmm2 \n"
1873 "movdqa %%xmm0,%%xmm1 \n"
1874 "punpcklwd %%xmm2,%%xmm0 \n"
1875 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001876 "movdqa %%xmm0,(%[dst_argb]) \n"
1877 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
1878 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001879 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001880 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001881 : [y_buf]"+r"(y_buf), // %[y_buf]
1882 [u_buf]"+r"(u_buf), // %[u_buf]
1883 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001884 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001885 [width]"+rm"(width) // %[width]
1886 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001887 : "memory", "cc"
1888#if defined(__SSE2__)
1889 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1890#endif
1891 );
1892}
1893
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001894void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1895 const uint8* u_buf,
1896 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001897 uint8* dst_rgb24,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001898 int width) {
1899// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
1900#ifdef __APPLE__
1901 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001902 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1903 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1904 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1905 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001906#endif
1907
1908 asm volatile (
1909#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001910 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1911 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001912#endif
1913 "sub %[u_buf],%[v_buf] \n"
1914 "pxor %%xmm4,%%xmm4 \n"
1915 ".p2align 4 \n"
1916 "1: \n"
1917 READYUV422
1918 YUVTORGB
1919 "punpcklbw %%xmm1,%%xmm0 \n"
1920 "punpcklbw %%xmm2,%%xmm2 \n"
1921 "movdqa %%xmm0,%%xmm1 \n"
1922 "punpcklwd %%xmm2,%%xmm0 \n"
1923 "punpckhwd %%xmm2,%%xmm1 \n"
1924 "pshufb %%xmm5,%%xmm0 \n"
1925 "pshufb %%xmm6,%%xmm1 \n"
1926 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001927 "movq %%xmm0,(%[dst_rgb24]) \n"
1928 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
1929 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001930 "sub $0x8,%[width] \n"
1931 "jg 1b \n"
1932 : [y_buf]"+r"(y_buf), // %[y_buf]
1933 [u_buf]"+r"(u_buf), // %[u_buf]
1934 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001935 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001936 [width]"+rm"(width) // %[width]
1937 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1938#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001939 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1940 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001941#endif
1942 : "memory", "cc"
1943#if defined(__SSE2__)
1944 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1945#endif
1946 );
1947}
1948
1949void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1950 const uint8* u_buf,
1951 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001952 uint8* dst_raw,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001953 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001954// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001955#ifdef __APPLE__
1956 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001957 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1958 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1959 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1960 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001961#endif
1962
1963 asm volatile (
1964#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001965 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1966 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001967#endif
1968 "sub %[u_buf],%[v_buf] \n"
1969 "pxor %%xmm4,%%xmm4 \n"
1970 ".p2align 4 \n"
1971 "1: \n"
1972 READYUV422
1973 YUVTORGB
1974 "punpcklbw %%xmm1,%%xmm0 \n"
1975 "punpcklbw %%xmm2,%%xmm2 \n"
1976 "movdqa %%xmm0,%%xmm1 \n"
1977 "punpcklwd %%xmm2,%%xmm0 \n"
1978 "punpckhwd %%xmm2,%%xmm1 \n"
1979 "pshufb %%xmm5,%%xmm0 \n"
1980 "pshufb %%xmm6,%%xmm1 \n"
1981 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001982 "movq %%xmm0,(%[dst_raw]) \n"
1983 "movdqu %%xmm1,0x8(%[dst_raw]) \n"
1984 "lea 0x18(%[dst_raw]),%[dst_raw] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001985 "sub $0x8,%[width] \n"
1986 "jg 1b \n"
1987 : [y_buf]"+r"(y_buf), // %[y_buf]
1988 [u_buf]"+r"(u_buf), // %[u_buf]
1989 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00001990 [dst_raw]"+r"(dst_raw), // %[dst_raw]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001991 [width]"+rm"(width) // %[width]
1992 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1993#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001994 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1995 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001996#endif
1997 : "memory", "cc"
1998#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001999 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002000#endif
2001 );
2002}
2003
fbarchard@google.come214fe32012-06-04 23:47:11 +00002004void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002005 const uint8* u_buf,
2006 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002007 uint8* dst_argb,
fbarchard@google.comdbcabea2012-10-29 21:20:25 +00002008 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002009 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002010 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002011 "pcmpeqb %%xmm5,%%xmm5 \n"
2012 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002013 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002014 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002015 READYUV422
2016 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002017 "punpcklbw %%xmm1,%%xmm0 \n"
2018 "punpcklbw %%xmm5,%%xmm2 \n"
2019 "movdqa %%xmm0,%%xmm1 \n"
2020 "punpcklwd %%xmm2,%%xmm0 \n"
2021 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002022 "movdqa %%xmm0,(%[dst_argb]) \n"
2023 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2024 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002025 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002026 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002027 : [y_buf]"+r"(y_buf), // %[y_buf]
2028 [u_buf]"+r"(u_buf), // %[u_buf]
2029 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002030 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002031 [width]"+rm"(width) // %[width]
2032 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002033 : "memory", "cc"
2034#if defined(__SSE2__)
2035 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2036#endif
2037 );
2038}
2039
2040void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
2041 const uint8* u_buf,
2042 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002043 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002044 int width) {
2045 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002046 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002047 "pcmpeqb %%xmm5,%%xmm5 \n"
2048 "pxor %%xmm4,%%xmm4 \n"
2049 ".p2align 4 \n"
2050 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002051 READYUV411
2052 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002053 "punpcklbw %%xmm1,%%xmm0 \n"
2054 "punpcklbw %%xmm5,%%xmm2 \n"
2055 "movdqa %%xmm0,%%xmm1 \n"
2056 "punpcklwd %%xmm2,%%xmm0 \n"
2057 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002058 "movdqa %%xmm0,(%[dst_argb]) \n"
2059 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2060 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002061 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002062 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002063 : [y_buf]"+r"(y_buf), // %[y_buf]
2064 [u_buf]"+r"(u_buf), // %[u_buf]
2065 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002066 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002067 [width]"+rm"(width) // %[width]
2068 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2069 : "memory", "cc"
2070#if defined(__SSE2__)
2071 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2072#endif
2073 );
2074}
2075
2076void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
2077 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002078 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002079 int width) {
2080 asm volatile (
2081 "pcmpeqb %%xmm5,%%xmm5 \n"
2082 "pxor %%xmm4,%%xmm4 \n"
2083 ".p2align 4 \n"
2084 "1: \n"
2085 READNV12
2086 YUVTORGB
2087 "punpcklbw %%xmm1,%%xmm0 \n"
2088 "punpcklbw %%xmm5,%%xmm2 \n"
2089 "movdqa %%xmm0,%%xmm1 \n"
2090 "punpcklwd %%xmm2,%%xmm0 \n"
2091 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002092 "movdqa %%xmm0,(%[dst_argb]) \n"
2093 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2094 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002095 "sub $0x8,%[width] \n"
2096 "jg 1b \n"
2097 : [y_buf]"+r"(y_buf), // %[y_buf]
2098 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002099 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002100 [width]"+rm"(width) // %[width]
2101 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2102 : "memory", "cc"
2103#if defined(__SSE2__)
2104 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2105#endif
2106 );
2107}
2108
2109void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002110 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002111 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002112 int width) {
2113 asm volatile (
2114 "pcmpeqb %%xmm5,%%xmm5 \n"
2115 "pxor %%xmm4,%%xmm4 \n"
2116 ".p2align 4 \n"
2117 "1: \n"
2118 READNV12
2119 YVUTORGB
2120 "punpcklbw %%xmm1,%%xmm0 \n"
2121 "punpcklbw %%xmm5,%%xmm2 \n"
2122 "movdqa %%xmm0,%%xmm1 \n"
2123 "punpcklwd %%xmm2,%%xmm0 \n"
2124 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002125 "movdqa %%xmm0,(%[dst_argb]) \n"
2126 "movdqa %%xmm1,0x10(%[dst_argb]) \n"
2127 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002128 "sub $0x8,%[width] \n"
2129 "jg 1b \n"
2130 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002131 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2132 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002133 [width]"+rm"(width) // %[width]
2134 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002135 : "memory", "cc"
2136#if defined(__SSE2__)
2137 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2138#endif
2139 );
2140}
2141
2142void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2143 const uint8* u_buf,
2144 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002145 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002146 int width) {
2147 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002148 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002149 "pcmpeqb %%xmm5,%%xmm5 \n"
2150 "pxor %%xmm4,%%xmm4 \n"
2151 ".p2align 4 \n"
2152 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002153 READYUV444
2154 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002155 "punpcklbw %%xmm1,%%xmm0 \n"
2156 "punpcklbw %%xmm5,%%xmm2 \n"
2157 "movdqa %%xmm0,%%xmm1 \n"
2158 "punpcklwd %%xmm2,%%xmm0 \n"
2159 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002160 "movdqu %%xmm0,(%[dst_argb]) \n"
2161 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2162 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002163 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002164 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002165 : [y_buf]"+r"(y_buf), // %[y_buf]
2166 [u_buf]"+r"(u_buf), // %[u_buf]
2167 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002168 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002169 [width]"+rm"(width) // %[width]
2170 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002171 : "memory", "cc"
2172#if defined(__SSE2__)
2173 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2174#endif
2175 );
2176}
2177
2178void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2179 const uint8* u_buf,
2180 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002181 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002182 int width) {
2183 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002184 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002185 "pcmpeqb %%xmm5,%%xmm5 \n"
2186 "pxor %%xmm4,%%xmm4 \n"
2187 ".p2align 4 \n"
2188 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002189 READYUV422
2190 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002191 "punpcklbw %%xmm1,%%xmm0 \n"
2192 "punpcklbw %%xmm5,%%xmm2 \n"
2193 "movdqa %%xmm0,%%xmm1 \n"
2194 "punpcklwd %%xmm2,%%xmm0 \n"
2195 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002196 "movdqu %%xmm0,(%[dst_argb]) \n"
2197 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2198 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002199 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002200 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002201 : [y_buf]"+r"(y_buf), // %[y_buf]
2202 [u_buf]"+r"(u_buf), // %[u_buf]
2203 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002204 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002205 [width]"+rm"(width) // %[width]
2206 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002207 : "memory", "cc"
2208#if defined(__SSE2__)
2209 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2210#endif
2211 );
2212}
2213
2214void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2215 const uint8* u_buf,
2216 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002217 uint8* dst_argb,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002218 int width) {
2219 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002220 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002221 "pcmpeqb %%xmm5,%%xmm5 \n"
2222 "pxor %%xmm4,%%xmm4 \n"
2223 ".p2align 4 \n"
2224 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002225 READYUV411
2226 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00002227 "punpcklbw %%xmm1,%%xmm0 \n"
2228 "punpcklbw %%xmm5,%%xmm2 \n"
2229 "movdqa %%xmm0,%%xmm1 \n"
2230 "punpcklwd %%xmm2,%%xmm0 \n"
2231 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002232 "movdqu %%xmm0,(%[dst_argb]) \n"
2233 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2234 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002235 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002236 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002237 : [y_buf]"+r"(y_buf), // %[y_buf]
2238 [u_buf]"+r"(u_buf), // %[u_buf]
2239 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002240 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002241 [width]"+rm"(width) // %[width]
2242 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2243 : "memory", "cc"
2244#if defined(__SSE2__)
2245 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2246#endif
2247 );
2248}
2249
2250void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2251 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002252 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002253 int width) {
2254 asm volatile (
2255 "pcmpeqb %%xmm5,%%xmm5 \n"
2256 "pxor %%xmm4,%%xmm4 \n"
2257 ".p2align 4 \n"
2258 "1: \n"
2259 READNV12
2260 YUVTORGB
2261 "punpcklbw %%xmm1,%%xmm0 \n"
2262 "punpcklbw %%xmm5,%%xmm2 \n"
2263 "movdqa %%xmm0,%%xmm1 \n"
2264 "punpcklwd %%xmm2,%%xmm0 \n"
2265 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002266 "movdqu %%xmm0,(%[dst_argb]) \n"
2267 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2268 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002269 "sub $0x8,%[width] \n"
2270 "jg 1b \n"
2271 : [y_buf]"+r"(y_buf), // %[y_buf]
2272 [uv_buf]"+r"(uv_buf), // %[uv_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002273 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002274 [width]"+rm"(width) // %[width]
2275 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2276 : "memory", "cc"
2277#if defined(__SSE2__)
2278 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2279#endif
2280 );
2281}
2282
2283void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002284 const uint8* uv_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002285 uint8* dst_argb,
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002286 int width) {
2287 asm volatile (
2288 "pcmpeqb %%xmm5,%%xmm5 \n"
2289 "pxor %%xmm4,%%xmm4 \n"
2290 ".p2align 4 \n"
2291 "1: \n"
2292 READNV12
2293 YVUTORGB
2294 "punpcklbw %%xmm1,%%xmm0 \n"
2295 "punpcklbw %%xmm5,%%xmm2 \n"
2296 "movdqa %%xmm0,%%xmm1 \n"
2297 "punpcklwd %%xmm2,%%xmm0 \n"
2298 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002299 "movdqu %%xmm0,(%[dst_argb]) \n"
2300 "movdqu %%xmm1,0x10(%[dst_argb]) \n"
2301 "lea 0x20(%[dst_argb]),%[dst_argb] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002302 "sub $0x8,%[width] \n"
2303 "jg 1b \n"
2304 : [y_buf]"+r"(y_buf), // %[y_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002305 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2306 [dst_argb]"+r"(dst_argb), // %[dst_argb]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002307 [width]"+rm"(width) // %[width]
2308 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002309 : "memory", "cc"
2310#if defined(__SSE2__)
2311 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2312#endif
2313 );
2314}
2315
2316void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2317 const uint8* u_buf,
2318 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002319 uint8* dst_bgra,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002320 int width) {
2321 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002322 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002323 "pcmpeqb %%xmm5,%%xmm5 \n"
2324 "pxor %%xmm4,%%xmm4 \n"
2325 ".p2align 4 \n"
2326 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002327 READYUV422
2328 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002329 "pcmpeqb %%xmm5,%%xmm5 \n"
2330 "punpcklbw %%xmm0,%%xmm1 \n"
2331 "punpcklbw %%xmm2,%%xmm5 \n"
2332 "movdqa %%xmm5,%%xmm0 \n"
2333 "punpcklwd %%xmm1,%%xmm5 \n"
2334 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002335 "movdqa %%xmm5,(%[dst_bgra]) \n"
2336 "movdqa %%xmm0,0x10(%[dst_bgra]) \n"
2337 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002338 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002339 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002340 : [y_buf]"+r"(y_buf), // %[y_buf]
2341 [u_buf]"+r"(u_buf), // %[u_buf]
2342 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002343 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002344 [width]"+rm"(width) // %[width]
2345 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002346 : "memory", "cc"
2347#if defined(__SSE2__)
2348 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2349#endif
2350 );
2351}
2352
fbarchard@google.come214fe32012-06-04 23:47:11 +00002353void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002354 const uint8* u_buf,
2355 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002356 uint8* dst_abgr,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002357 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002358 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002359 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002360 "pcmpeqb %%xmm5,%%xmm5 \n"
2361 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002362 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002363 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002364 READYUV422
2365 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002366 "punpcklbw %%xmm1,%%xmm2 \n"
2367 "punpcklbw %%xmm5,%%xmm0 \n"
2368 "movdqa %%xmm2,%%xmm1 \n"
2369 "punpcklwd %%xmm0,%%xmm2 \n"
2370 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002371 "movdqa %%xmm2,(%[dst_abgr]) \n"
2372 "movdqa %%xmm1,0x10(%[dst_abgr]) \n"
2373 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002374 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002375 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002376 : [y_buf]"+r"(y_buf), // %[y_buf]
2377 [u_buf]"+r"(u_buf), // %[u_buf]
2378 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002379 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002380 [width]"+rm"(width) // %[width]
2381 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002382 : "memory", "cc"
2383#if defined(__SSE2__)
2384 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2385#endif
2386 );
2387}
2388
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002389void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2390 const uint8* u_buf,
2391 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002392 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002393 int width) {
2394 asm volatile (
2395 "sub %[u_buf],%[v_buf] \n"
2396 "pcmpeqb %%xmm5,%%xmm5 \n"
2397 "pxor %%xmm4,%%xmm4 \n"
2398 ".p2align 4 \n"
2399 "1: \n"
2400 READYUV422
2401 YUVTORGB
2402 "pcmpeqb %%xmm5,%%xmm5 \n"
2403 "punpcklbw %%xmm2,%%xmm1 \n"
2404 "punpcklbw %%xmm0,%%xmm5 \n"
2405 "movdqa %%xmm5,%%xmm0 \n"
2406 "punpcklwd %%xmm1,%%xmm5 \n"
2407 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002408 "movdqa %%xmm5,(%[dst_rgba]) \n"
2409 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2410 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002411 "sub $0x8,%[width] \n"
2412 "jg 1b \n"
2413 : [y_buf]"+r"(y_buf), // %[y_buf]
2414 [u_buf]"+r"(u_buf), // %[u_buf]
2415 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002416 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002417 [width]"+rm"(width) // %[width]
2418 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2419 : "memory", "cc"
2420#if defined(__SSE2__)
2421 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2422#endif
2423 );
2424}
2425
fbarchard@google.come214fe32012-06-04 23:47:11 +00002426void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002427 const uint8* u_buf,
2428 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002429 uint8* dst_bgra,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002430 int width) {
2431 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002432 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002433 "pcmpeqb %%xmm5,%%xmm5 \n"
2434 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002435 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002436 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002437 READYUV422
2438 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002439 "pcmpeqb %%xmm5,%%xmm5 \n"
2440 "punpcklbw %%xmm0,%%xmm1 \n"
2441 "punpcklbw %%xmm2,%%xmm5 \n"
2442 "movdqa %%xmm5,%%xmm0 \n"
2443 "punpcklwd %%xmm1,%%xmm5 \n"
2444 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002445 "movdqu %%xmm5,(%[dst_bgra]) \n"
2446 "movdqu %%xmm0,0x10(%[dst_bgra]) \n"
2447 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002448 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002449 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002450 : [y_buf]"+r"(y_buf), // %[y_buf]
2451 [u_buf]"+r"(u_buf), // %[u_buf]
2452 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002453 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002454 [width]"+rm"(width) // %[width]
2455 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002456 : "memory", "cc"
2457#if defined(__SSE2__)
2458 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2459#endif
2460 );
2461}
2462
fbarchard@google.come214fe32012-06-04 23:47:11 +00002463void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002464 const uint8* u_buf,
2465 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002466 uint8* dst_abgr,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002467 int width) {
2468 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002469 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002470 "pcmpeqb %%xmm5,%%xmm5 \n"
2471 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002472 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002473 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002474 READYUV422
2475 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002476 "punpcklbw %%xmm1,%%xmm2 \n"
2477 "punpcklbw %%xmm5,%%xmm0 \n"
2478 "movdqa %%xmm2,%%xmm1 \n"
2479 "punpcklwd %%xmm0,%%xmm2 \n"
2480 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002481 "movdqu %%xmm2,(%[dst_abgr]) \n"
2482 "movdqu %%xmm1,0x10(%[dst_abgr]) \n"
2483 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002484 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002485 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002486 : [y_buf]"+r"(y_buf), // %[y_buf]
2487 [u_buf]"+r"(u_buf), // %[u_buf]
2488 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002489 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002490 [width]"+rm"(width) // %[width]
2491 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002492 : "memory", "cc"
2493#if defined(__SSE2__)
2494 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2495#endif
2496 );
2497}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002498
2499void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2500 const uint8* u_buf,
2501 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002502 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002503 int width) {
2504 asm volatile (
2505 "sub %[u_buf],%[v_buf] \n"
2506 "pcmpeqb %%xmm5,%%xmm5 \n"
2507 "pxor %%xmm4,%%xmm4 \n"
2508 ".p2align 4 \n"
2509 "1: \n"
2510 READYUV422
2511 YUVTORGB
2512 "pcmpeqb %%xmm5,%%xmm5 \n"
2513 "punpcklbw %%xmm2,%%xmm1 \n"
2514 "punpcklbw %%xmm0,%%xmm5 \n"
2515 "movdqa %%xmm5,%%xmm0 \n"
2516 "punpcklwd %%xmm1,%%xmm5 \n"
2517 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002518 "movdqa %%xmm5,(%[dst_rgba]) \n"
2519 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2520 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002521 "sub $0x8,%[width] \n"
2522 "jg 1b \n"
2523 : [y_buf]"+r"(y_buf), // %[y_buf]
2524 [u_buf]"+r"(u_buf), // %[u_buf]
2525 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002526 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002527 [width]"+rm"(width) // %[width]
2528 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2529 : "memory", "cc"
2530#if defined(__SSE2__)
2531 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2532#endif
2533 );
2534}
2535
fbarchard@google.come214fe32012-06-04 23:47:11 +00002536#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002537
2538#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002539void YToARGBRow_SSE2(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002540 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002541 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002542 asm volatile (
fbarchard@google.com30859f72012-11-02 09:51:29 +00002543 "pxor %%xmm5,%%xmm5 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002544 "pcmpeqb %%xmm4,%%xmm4 \n"
2545 "pslld $0x18,%%xmm4 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002546 "mov $0x00100010,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002547 "movd %%eax,%%xmm3 \n"
2548 "pshufd $0x0,%%xmm3,%%xmm3 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002549 "mov $0x004a004a,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002550 "movd %%eax,%%xmm2 \n"
2551 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002552 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002553 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002554 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002555 "movq (%0),%%xmm0 \n"
2556 "lea 0x8(%0),%0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002557 "punpcklbw %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002558 "psubusw %%xmm3,%%xmm0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002559 "pmullw %%xmm2,%%xmm0 \n"
2560 "psrlw $6, %%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002561 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002562
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002563 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002564 "punpcklbw %%xmm0,%%xmm0 \n"
2565 "movdqa %%xmm0,%%xmm1 \n"
2566 "punpcklwd %%xmm0,%%xmm0 \n"
2567 "punpckhwd %%xmm1,%%xmm1 \n"
2568 "por %%xmm4,%%xmm0 \n"
2569 "por %%xmm4,%%xmm1 \n"
2570 "movdqa %%xmm0,(%1) \n"
2571 "movdqa %%xmm1,16(%1) \n"
2572 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002573
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002574 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002575 "jg 1b \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002576 : "+r"(y_buf), // %0
2577 "+r"(dst_argb), // %1
2578 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002579 :
2580 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002581#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002582 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002583#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002584 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002585}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002586#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002587
fbarchard@google.com42831e02012-01-21 02:54:17 +00002588#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002589// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002590CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002591 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2592};
2593
fbarchard@google.com42831e02012-01-21 02:54:17 +00002594void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002595 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002596 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002597 "movdqa %3,%%xmm5 \n"
2598 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002599 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002600 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002601 "movdqa (%0,%2),%%xmm0 \n"
2602 "pshufb %%xmm5,%%xmm0 \n"
2603 "sub $0x10,%2 \n"
2604 "movdqa %%xmm0,(%1) \n"
2605 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002606 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002607 : "+r"(src), // %0
2608 "+r"(dst), // %1
2609 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002610 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002611 : "memory", "cc"
2612#if defined(__SSE2__)
2613 , "xmm0", "xmm5"
2614#endif
2615 );
2616}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002617#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002618
fbarchard@google.com42831e02012-01-21 02:54:17 +00002619#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002620void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002621 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002622 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002623 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002624 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002625 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002626 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002627 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002628 "psllw $0x8,%%xmm0 \n"
2629 "psrlw $0x8,%%xmm1 \n"
2630 "por %%xmm1,%%xmm0 \n"
2631 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2632 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2633 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2634 "sub $0x10,%2 \n"
2635 "movdqu %%xmm0,(%1) \n"
2636 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002637 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002638 : "+r"(src), // %0
2639 "+r"(dst), // %1
2640 "+r"(temp_width) // %2
2641 :
2642 : "memory", "cc"
2643#if defined(__SSE2__)
2644 , "xmm0", "xmm1"
2645#endif
2646 );
2647}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002648#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002649
fbarchard@google.com16a96642012-03-02 22:38:09 +00002650#ifdef HAS_MIRRORROW_UV_SSSE3
2651// Shuffle table for reversing the bytes of UV channels.
2652CONST uvec8 kShuffleMirrorUV = {
2653 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2654};
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002655void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
fbarchard@google.com16a96642012-03-02 22:38:09 +00002656 int width) {
2657 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002658 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002659 "movdqa %4,%%xmm1 \n"
2660 "lea -16(%0,%3,2),%0 \n"
2661 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002662 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002663 "1: \n"
2664 "movdqa (%0),%%xmm0 \n"
2665 "lea -16(%0),%0 \n"
2666 "pshufb %%xmm1,%%xmm0 \n"
2667 "sub $8,%3 \n"
2668 "movlpd %%xmm0,(%1) \n"
2669 "movhpd %%xmm0,(%1,%2) \n"
2670 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002671 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002672 : "+r"(src), // %0
2673 "+r"(dst_u), // %1
2674 "+r"(dst_v), // %2
2675 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002676 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002677 : "memory", "cc"
2678#if defined(__SSE2__)
2679 , "xmm0", "xmm1"
2680#endif
2681 );
2682}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002683#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002684
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002685#ifdef HAS_ARGBMIRRORROW_SSSE3
2686// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002687CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002688 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2689};
2690
2691void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2692 intptr_t temp_width = static_cast<intptr_t>(width);
2693 asm volatile (
2694 "movdqa %3,%%xmm5 \n"
2695 "lea -0x10(%0),%0 \n"
2696 ".p2align 4 \n"
2697 "1: \n"
2698 "movdqa (%0,%2,4),%%xmm0 \n"
2699 "pshufb %%xmm5,%%xmm0 \n"
2700 "sub $0x4,%2 \n"
2701 "movdqa %%xmm0,(%1) \n"
2702 "lea 0x10(%1),%1 \n"
2703 "jg 1b \n"
2704 : "+r"(src), // %0
2705 "+r"(dst), // %1
2706 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002707 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002708 : "memory", "cc"
2709#if defined(__SSE2__)
2710 , "xmm0", "xmm5"
2711#endif
2712 );
2713}
2714#endif // HAS_ARGBMIRRORROW_SSSE3
2715
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002716#ifdef HAS_SPLITUVROW_SSE2
2717void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002718 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002719 "pcmpeqb %%xmm5,%%xmm5 \n"
2720 "psrlw $0x8,%%xmm5 \n"
2721 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002722 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002723 "1: \n"
2724 "movdqa (%0),%%xmm0 \n"
2725 "movdqa 0x10(%0),%%xmm1 \n"
2726 "lea 0x20(%0),%0 \n"
2727 "movdqa %%xmm0,%%xmm2 \n"
2728 "movdqa %%xmm1,%%xmm3 \n"
2729 "pand %%xmm5,%%xmm0 \n"
2730 "pand %%xmm5,%%xmm1 \n"
2731 "packuswb %%xmm1,%%xmm0 \n"
2732 "psrlw $0x8,%%xmm2 \n"
2733 "psrlw $0x8,%%xmm3 \n"
2734 "packuswb %%xmm3,%%xmm2 \n"
2735 "movdqa %%xmm0,(%1) \n"
2736 "movdqa %%xmm2,(%1,%2) \n"
2737 "lea 0x10(%1),%1 \n"
2738 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002739 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002740 : "+r"(src_uv), // %0
2741 "+r"(dst_u), // %1
2742 "+r"(dst_v), // %2
2743 "+r"(pix) // %3
2744 :
2745 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002746#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002747 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002748#endif
2749 );
2750}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002751
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002752void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2753 int pix) {
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002754 asm volatile (
2755 "pcmpeqb %%xmm5,%%xmm5 \n"
2756 "psrlw $0x8,%%xmm5 \n"
2757 "sub %1,%2 \n"
2758 ".p2align 4 \n"
2759 "1: \n"
2760 "movdqu (%0),%%xmm0 \n"
2761 "movdqu 0x10(%0),%%xmm1 \n"
2762 "lea 0x20(%0),%0 \n"
2763 "movdqa %%xmm0,%%xmm2 \n"
2764 "movdqa %%xmm1,%%xmm3 \n"
2765 "pand %%xmm5,%%xmm0 \n"
2766 "pand %%xmm5,%%xmm1 \n"
2767 "packuswb %%xmm1,%%xmm0 \n"
2768 "psrlw $0x8,%%xmm2 \n"
2769 "psrlw $0x8,%%xmm3 \n"
2770 "packuswb %%xmm3,%%xmm2 \n"
2771 "movdqu %%xmm0,(%1) \n"
2772 "movdqu %%xmm2,(%1,%2) \n"
2773 "lea 0x10(%1),%1 \n"
2774 "sub $0x10,%3 \n"
2775 "jg 1b \n"
2776 : "+r"(src_uv), // %0
2777 "+r"(dst_u), // %1
2778 "+r"(dst_v), // %2
2779 "+r"(pix) // %3
2780 :
2781 : "memory", "cc"
2782#if defined(__SSE2__)
2783 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2784#endif
2785 );
2786}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002787#endif // HAS_SPLITUVROW_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002788
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002789#ifdef HAS_MERGEUVROW_SSE2
2790void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2791 int width) {
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002792 asm volatile (
2793 "sub %0,%1 \n"
2794 ".p2align 4 \n"
2795 "1: \n"
2796 "movdqa (%0),%%xmm0 \n"
2797 "movdqa (%0,%1,1),%%xmm1 \n"
2798 "lea 0x10(%0),%0 \n"
2799 "movdqa %%xmm0,%%xmm2 \n"
2800 "punpcklbw %%xmm1,%%xmm0 \n"
2801 "punpckhbw %%xmm1,%%xmm2 \n"
2802 "movdqa %%xmm0,(%2) \n"
2803 "movdqa %%xmm2,0x10(%2) \n"
2804 "lea 0x20(%2),%2 \n"
2805 "sub $0x10,%3 \n"
2806 "jg 1b \n"
2807 : "+r"(src_u), // %0
2808 "+r"(src_v), // %1
2809 "+r"(dst_uv), // %2
2810 "+r"(width) // %3
2811 :
2812 : "memory", "cc"
2813#if defined(__SSE2__)
2814 , "xmm0", "xmm1", "xmm2"
2815#endif
2816 );
2817}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002818
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002819void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2820 uint8* dst_uv, int width) {
fbarchard@google.come0d86482012-10-27 19:07:55 +00002821 asm volatile (
2822 "sub %0,%1 \n"
2823 ".p2align 4 \n"
2824 "1: \n"
2825 "movdqu (%0),%%xmm0 \n"
2826 "movdqu (%0,%1,1),%%xmm1 \n"
2827 "lea 0x10(%0),%0 \n"
2828 "movdqa %%xmm0,%%xmm2 \n"
2829 "punpcklbw %%xmm1,%%xmm0 \n"
2830 "punpckhbw %%xmm1,%%xmm2 \n"
2831 "movdqu %%xmm0,(%2) \n"
2832 "movdqu %%xmm2,0x10(%2) \n"
2833 "lea 0x20(%2),%2 \n"
2834 "sub $0x10,%3 \n"
2835 "jg 1b \n"
2836 : "+r"(src_u), // %0
2837 "+r"(src_v), // %1
2838 "+r"(dst_uv), // %2
2839 "+r"(width) // %3
2840 :
2841 : "memory", "cc"
2842#if defined(__SSE2__)
2843 , "xmm0", "xmm1", "xmm2"
2844#endif
2845 );
2846}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002847#endif // HAS_MERGEUVROW_SSE2
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002848
fbarchard@google.com19932f82012-02-16 22:19:14 +00002849#ifdef HAS_COPYROW_SSE2
2850void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002851 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002852 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002853 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002854 "1: \n"
2855 "movdqa (%0),%%xmm0 \n"
2856 "movdqa 0x10(%0),%%xmm1 \n"
2857 "movdqa %%xmm0,(%0,%1) \n"
2858 "movdqa %%xmm1,0x10(%0,%1) \n"
2859 "lea 0x20(%0),%0 \n"
2860 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002861 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002862 : "+r"(src), // %0
2863 "+r"(dst), // %1
2864 "+r"(count) // %2
2865 :
2866 : "memory", "cc"
2867#if defined(__SSE2__)
2868 , "xmm0", "xmm1"
2869#endif
2870 );
2871}
2872#endif // HAS_COPYROW_SSE2
2873
2874#ifdef HAS_COPYROW_X86
2875void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2876 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002877 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002878 "shr $0x2,%2 \n"
2879 "rep movsl \n"
2880 : "+S"(src), // %0
2881 "+D"(dst), // %1
2882 "+c"(width_tmp) // %2
2883 :
2884 : "memory", "cc"
2885 );
2886}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002887#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002888
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002889#ifdef HAS_SETROW_X86
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002890void SetRow_X86(uint8* dst, uint32 v32, int width) {
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002891 size_t width_tmp = static_cast<size_t>(width);
2892 asm volatile (
2893 "shr $0x2,%1 \n"
2894 "rep stosl \n"
2895 : "+D"(dst), // %0
2896 "+c"(width_tmp) // %1
2897 : "a"(v32) // %2
2898 : "memory", "cc");
2899}
2900
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002901void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002902 int dst_stride, int height) {
2903 for (int y = 0; y < height; ++y) {
2904 size_t width_tmp = static_cast<size_t>(width);
2905 uint32* d = reinterpret_cast<uint32*>(dst);
2906 asm volatile (
2907 "rep stosl \n"
2908 : "+D"(d), // %0
2909 "+c"(width_tmp) // %1
2910 : "a"(v32) // %2
2911 : "memory", "cc");
2912 dst += dst_stride;
2913 }
2914}
2915#endif // HAS_SETROW_X86
2916
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002917#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002918void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002919 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002920 "pcmpeqb %%xmm5,%%xmm5 \n"
2921 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002922 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002923 "1: \n"
2924 "movdqa (%0),%%xmm0 \n"
2925 "movdqa 0x10(%0),%%xmm1 \n"
2926 "lea 0x20(%0),%0 \n"
2927 "pand %%xmm5,%%xmm0 \n"
2928 "pand %%xmm5,%%xmm1 \n"
2929 "packuswb %%xmm1,%%xmm0 \n"
2930 "movdqa %%xmm0,(%1) \n"
2931 "lea 0x10(%1),%1 \n"
2932 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002933 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002934 : "+r"(src_yuy2), // %0
2935 "+r"(dst_y), // %1
2936 "+r"(pix) // %2
2937 :
2938 : "memory", "cc"
2939#if defined(__SSE2__)
2940 , "xmm0", "xmm1", "xmm5"
2941#endif
2942 );
2943}
2944
2945void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002946 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002947 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002948 "pcmpeqb %%xmm5,%%xmm5 \n"
2949 "psrlw $0x8,%%xmm5 \n"
2950 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002951 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002952 "1: \n"
2953 "movdqa (%0),%%xmm0 \n"
2954 "movdqa 0x10(%0),%%xmm1 \n"
2955 "movdqa (%0,%4,1),%%xmm2 \n"
2956 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2957 "lea 0x20(%0),%0 \n"
2958 "pavgb %%xmm2,%%xmm0 \n"
2959 "pavgb %%xmm3,%%xmm1 \n"
2960 "psrlw $0x8,%%xmm0 \n"
2961 "psrlw $0x8,%%xmm1 \n"
2962 "packuswb %%xmm1,%%xmm0 \n"
2963 "movdqa %%xmm0,%%xmm1 \n"
2964 "pand %%xmm5,%%xmm0 \n"
2965 "packuswb %%xmm0,%%xmm0 \n"
2966 "psrlw $0x8,%%xmm1 \n"
2967 "packuswb %%xmm1,%%xmm1 \n"
2968 "movq %%xmm0,(%1) \n"
2969 "movq %%xmm1,(%1,%2) \n"
2970 "lea 0x8(%1),%1 \n"
2971 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002972 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002973 : "+r"(src_yuy2), // %0
2974 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002975 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002976 "+r"(pix) // %3
2977 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2978 : "memory", "cc"
2979#if defined(__SSE2__)
2980 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2981#endif
2982 );
2983}
2984
fbarchard@google.comc704f782012-08-30 19:53:48 +00002985void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2986 uint8* dst_u, uint8* dst_v, int pix) {
2987 asm volatile (
2988 "pcmpeqb %%xmm5,%%xmm5 \n"
2989 "psrlw $0x8,%%xmm5 \n"
2990 "sub %1,%2 \n"
2991 ".p2align 4 \n"
2992 "1: \n"
2993 "movdqa (%0),%%xmm0 \n"
2994 "movdqa 0x10(%0),%%xmm1 \n"
2995 "lea 0x20(%0),%0 \n"
2996 "psrlw $0x8,%%xmm0 \n"
2997 "psrlw $0x8,%%xmm1 \n"
2998 "packuswb %%xmm1,%%xmm0 \n"
2999 "movdqa %%xmm0,%%xmm1 \n"
3000 "pand %%xmm5,%%xmm0 \n"
3001 "packuswb %%xmm0,%%xmm0 \n"
3002 "psrlw $0x8,%%xmm1 \n"
3003 "packuswb %%xmm1,%%xmm1 \n"
3004 "movq %%xmm0,(%1) \n"
3005 "movq %%xmm1,(%1,%2) \n"
3006 "lea 0x8(%1),%1 \n"
3007 "sub $0x10,%3 \n"
3008 "jg 1b \n"
3009 : "+r"(src_yuy2), // %0
3010 "+r"(dst_u), // %1
3011 "+r"(dst_v), // %2
3012 "+r"(pix) // %3
3013 :
3014 : "memory", "cc"
3015#if defined(__SSE2__)
3016 , "xmm0", "xmm1", "xmm5"
3017#endif
3018 );
3019}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00003020
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003021void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
3022 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003023 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003024 "pcmpeqb %%xmm5,%%xmm5 \n"
3025 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003026 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003027 "1: \n"
3028 "movdqu (%0),%%xmm0 \n"
3029 "movdqu 0x10(%0),%%xmm1 \n"
3030 "lea 0x20(%0),%0 \n"
3031 "pand %%xmm5,%%xmm0 \n"
3032 "pand %%xmm5,%%xmm1 \n"
3033 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003034 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003035 "movdqu %%xmm0,(%1) \n"
3036 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003037 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003038 : "+r"(src_yuy2), // %0
3039 "+r"(dst_y), // %1
3040 "+r"(pix) // %2
3041 :
3042 : "memory", "cc"
3043#if defined(__SSE2__)
3044 , "xmm0", "xmm1", "xmm5"
3045#endif
3046 );
3047}
3048
3049void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
3050 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00003051 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003052 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003053 "pcmpeqb %%xmm5,%%xmm5 \n"
3054 "psrlw $0x8,%%xmm5 \n"
3055 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003056 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003057 "1: \n"
3058 "movdqu (%0),%%xmm0 \n"
3059 "movdqu 0x10(%0),%%xmm1 \n"
3060 "movdqu (%0,%4,1),%%xmm2 \n"
3061 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3062 "lea 0x20(%0),%0 \n"
3063 "pavgb %%xmm2,%%xmm0 \n"
3064 "pavgb %%xmm3,%%xmm1 \n"
3065 "psrlw $0x8,%%xmm0 \n"
3066 "psrlw $0x8,%%xmm1 \n"
3067 "packuswb %%xmm1,%%xmm0 \n"
3068 "movdqa %%xmm0,%%xmm1 \n"
3069 "pand %%xmm5,%%xmm0 \n"
3070 "packuswb %%xmm0,%%xmm0 \n"
3071 "psrlw $0x8,%%xmm1 \n"
3072 "packuswb %%xmm1,%%xmm1 \n"
3073 "movq %%xmm0,(%1) \n"
3074 "movq %%xmm1,(%1,%2) \n"
3075 "lea 0x8(%1),%1 \n"
3076 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003077 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003078 : "+r"(src_yuy2), // %0
3079 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003080 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003081 "+r"(pix) // %3
3082 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
3083 : "memory", "cc"
3084#if defined(__SSE2__)
3085 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3086#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00003087 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003088}
3089
fbarchard@google.comc704f782012-08-30 19:53:48 +00003090void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
3091 uint8* dst_u, uint8* dst_v, int pix) {
3092 asm volatile (
3093 "pcmpeqb %%xmm5,%%xmm5 \n"
3094 "psrlw $0x8,%%xmm5 \n"
3095 "sub %1,%2 \n"
3096 ".p2align 4 \n"
3097 "1: \n"
3098 "movdqu (%0),%%xmm0 \n"
3099 "movdqu 0x10(%0),%%xmm1 \n"
3100 "lea 0x20(%0),%0 \n"
3101 "psrlw $0x8,%%xmm0 \n"
3102 "psrlw $0x8,%%xmm1 \n"
3103 "packuswb %%xmm1,%%xmm0 \n"
3104 "movdqa %%xmm0,%%xmm1 \n"
3105 "pand %%xmm5,%%xmm0 \n"
3106 "packuswb %%xmm0,%%xmm0 \n"
3107 "psrlw $0x8,%%xmm1 \n"
3108 "packuswb %%xmm1,%%xmm1 \n"
3109 "movq %%xmm0,(%1) \n"
3110 "movq %%xmm1,(%1,%2) \n"
3111 "lea 0x8(%1),%1 \n"
3112 "sub $0x10,%3 \n"
3113 "jg 1b \n"
3114 : "+r"(src_yuy2), // %0
3115 "+r"(dst_u), // %1
3116 "+r"(dst_v), // %2
3117 "+r"(pix) // %3
3118 :
3119 : "memory", "cc"
3120#if defined(__SSE2__)
3121 , "xmm0", "xmm1", "xmm5"
3122#endif
3123 );
3124}
3125
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003126void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003127 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003128 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003129 "1: \n"
3130 "movdqa (%0),%%xmm0 \n"
3131 "movdqa 0x10(%0),%%xmm1 \n"
3132 "lea 0x20(%0),%0 \n"
3133 "psrlw $0x8,%%xmm0 \n"
3134 "psrlw $0x8,%%xmm1 \n"
3135 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003136 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003137 "movdqa %%xmm0,(%1) \n"
3138 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003139 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003140 : "+r"(src_uyvy), // %0
3141 "+r"(dst_y), // %1
3142 "+r"(pix) // %2
3143 :
3144 : "memory", "cc"
3145#if defined(__SSE2__)
3146 , "xmm0", "xmm1"
3147#endif
3148 );
3149}
3150
3151void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003152 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003153 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003154 "pcmpeqb %%xmm5,%%xmm5 \n"
3155 "psrlw $0x8,%%xmm5 \n"
3156 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003157 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003158 "1: \n"
3159 "movdqa (%0),%%xmm0 \n"
3160 "movdqa 0x10(%0),%%xmm1 \n"
3161 "movdqa (%0,%4,1),%%xmm2 \n"
3162 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
3163 "lea 0x20(%0),%0 \n"
3164 "pavgb %%xmm2,%%xmm0 \n"
3165 "pavgb %%xmm3,%%xmm1 \n"
3166 "pand %%xmm5,%%xmm0 \n"
3167 "pand %%xmm5,%%xmm1 \n"
3168 "packuswb %%xmm1,%%xmm0 \n"
3169 "movdqa %%xmm0,%%xmm1 \n"
3170 "pand %%xmm5,%%xmm0 \n"
3171 "packuswb %%xmm0,%%xmm0 \n"
3172 "psrlw $0x8,%%xmm1 \n"
3173 "packuswb %%xmm1,%%xmm1 \n"
3174 "movq %%xmm0,(%1) \n"
3175 "movq %%xmm1,(%1,%2) \n"
3176 "lea 0x8(%1),%1 \n"
3177 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003178 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003179 : "+r"(src_uyvy), // %0
3180 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003181 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003182 "+r"(pix) // %3
3183 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3184 : "memory", "cc"
3185#if defined(__SSE2__)
3186 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3187#endif
3188 );
3189}
3190
fbarchard@google.comc704f782012-08-30 19:53:48 +00003191void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3192 uint8* dst_u, uint8* dst_v, int pix) {
3193 asm volatile (
3194 "pcmpeqb %%xmm5,%%xmm5 \n"
3195 "psrlw $0x8,%%xmm5 \n"
3196 "sub %1,%2 \n"
3197 ".p2align 4 \n"
3198 "1: \n"
3199 "movdqa (%0),%%xmm0 \n"
3200 "movdqa 0x10(%0),%%xmm1 \n"
3201 "lea 0x20(%0),%0 \n"
3202 "pand %%xmm5,%%xmm0 \n"
3203 "pand %%xmm5,%%xmm1 \n"
3204 "packuswb %%xmm1,%%xmm0 \n"
3205 "movdqa %%xmm0,%%xmm1 \n"
3206 "pand %%xmm5,%%xmm0 \n"
3207 "packuswb %%xmm0,%%xmm0 \n"
3208 "psrlw $0x8,%%xmm1 \n"
3209 "packuswb %%xmm1,%%xmm1 \n"
3210 "movq %%xmm0,(%1) \n"
3211 "movq %%xmm1,(%1,%2) \n"
3212 "lea 0x8(%1),%1 \n"
3213 "sub $0x10,%3 \n"
3214 "jg 1b \n"
3215 : "+r"(src_uyvy), // %0
3216 "+r"(dst_u), // %1
3217 "+r"(dst_v), // %2
3218 "+r"(pix) // %3
3219 :
3220 : "memory", "cc"
3221#if defined(__SSE2__)
3222 , "xmm0", "xmm1", "xmm5"
3223#endif
3224 );
3225}
3226
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003227void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3228 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003229 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003230 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003231 "1: \n"
3232 "movdqu (%0),%%xmm0 \n"
3233 "movdqu 0x10(%0),%%xmm1 \n"
3234 "lea 0x20(%0),%0 \n"
3235 "psrlw $0x8,%%xmm0 \n"
3236 "psrlw $0x8,%%xmm1 \n"
3237 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003238 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003239 "movdqu %%xmm0,(%1) \n"
3240 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003241 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003242 : "+r"(src_uyvy), // %0
3243 "+r"(dst_y), // %1
3244 "+r"(pix) // %2
3245 :
3246 : "memory", "cc"
3247#if defined(__SSE2__)
3248 , "xmm0", "xmm1"
3249#endif
3250 );
3251}
3252
3253void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003254 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003255 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003256 "pcmpeqb %%xmm5,%%xmm5 \n"
3257 "psrlw $0x8,%%xmm5 \n"
3258 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003259 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003260 "1: \n"
3261 "movdqu (%0),%%xmm0 \n"
3262 "movdqu 0x10(%0),%%xmm1 \n"
3263 "movdqu (%0,%4,1),%%xmm2 \n"
3264 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3265 "lea 0x20(%0),%0 \n"
3266 "pavgb %%xmm2,%%xmm0 \n"
3267 "pavgb %%xmm3,%%xmm1 \n"
3268 "pand %%xmm5,%%xmm0 \n"
3269 "pand %%xmm5,%%xmm1 \n"
3270 "packuswb %%xmm1,%%xmm0 \n"
3271 "movdqa %%xmm0,%%xmm1 \n"
3272 "pand %%xmm5,%%xmm0 \n"
3273 "packuswb %%xmm0,%%xmm0 \n"
3274 "psrlw $0x8,%%xmm1 \n"
3275 "packuswb %%xmm1,%%xmm1 \n"
3276 "movq %%xmm0,(%1) \n"
3277 "movq %%xmm1,(%1,%2) \n"
3278 "lea 0x8(%1),%1 \n"
3279 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003280 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003281 : "+r"(src_uyvy), // %0
3282 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003283 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003284 "+r"(pix) // %3
3285 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3286 : "memory", "cc"
3287#if defined(__SSE2__)
3288 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3289#endif
3290 );
3291}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003292
3293void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3294 uint8* dst_u, uint8* dst_v, int pix) {
3295 asm volatile (
3296 "pcmpeqb %%xmm5,%%xmm5 \n"
3297 "psrlw $0x8,%%xmm5 \n"
3298 "sub %1,%2 \n"
3299 ".p2align 4 \n"
3300 "1: \n"
3301 "movdqu (%0),%%xmm0 \n"
3302 "movdqu 0x10(%0),%%xmm1 \n"
3303 "lea 0x20(%0),%0 \n"
3304 "pand %%xmm5,%%xmm0 \n"
3305 "pand %%xmm5,%%xmm1 \n"
3306 "packuswb %%xmm1,%%xmm0 \n"
3307 "movdqa %%xmm0,%%xmm1 \n"
3308 "pand %%xmm5,%%xmm0 \n"
3309 "packuswb %%xmm0,%%xmm0 \n"
3310 "psrlw $0x8,%%xmm1 \n"
3311 "packuswb %%xmm1,%%xmm1 \n"
3312 "movq %%xmm0,(%1) \n"
3313 "movq %%xmm1,(%1,%2) \n"
3314 "lea 0x8(%1),%1 \n"
3315 "sub $0x10,%3 \n"
3316 "jg 1b \n"
3317 : "+r"(src_uyvy), // %0
3318 "+r"(dst_u), // %1
3319 "+r"(dst_v), // %2
3320 "+r"(pix) // %3
3321 :
3322 : "memory", "cc"
3323#if defined(__SSE2__)
3324 , "xmm0", "xmm1", "xmm5"
3325#endif
3326 );
3327}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003328#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003329
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003330#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003331// Blend 8 pixels at a time.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003332void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3333 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003334 asm volatile (
3335 "pcmpeqb %%xmm7,%%xmm7 \n"
3336 "psrlw $0xf,%%xmm7 \n"
3337 "pcmpeqb %%xmm6,%%xmm6 \n"
3338 "psrlw $0x8,%%xmm6 \n"
3339 "pcmpeqb %%xmm5,%%xmm5 \n"
3340 "psllw $0x8,%%xmm5 \n"
3341 "pcmpeqb %%xmm4,%%xmm4 \n"
3342 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003343 "sub $0x1,%3 \n"
3344 "je 91f \n"
3345 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003346
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003347 // 1 pixel loop until destination pointer is aligned.
3348 "10: \n"
3349 "test $0xf,%2 \n"
3350 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003351 "movd (%0),%%xmm3 \n"
3352 "lea 0x4(%0),%0 \n"
3353 "movdqa %%xmm3,%%xmm0 \n"
3354 "pxor %%xmm4,%%xmm3 \n"
3355 "movd (%1),%%xmm2 \n"
3356 "psrlw $0x8,%%xmm3 \n"
3357 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3358 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3359 "pand %%xmm6,%%xmm2 \n"
3360 "paddw %%xmm7,%%xmm3 \n"
3361 "pmullw %%xmm3,%%xmm2 \n"
3362 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003363 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003364 "psrlw $0x8,%%xmm1 \n"
3365 "por %%xmm4,%%xmm0 \n"
3366 "pmullw %%xmm3,%%xmm1 \n"
3367 "psrlw $0x8,%%xmm2 \n"
3368 "paddusb %%xmm2,%%xmm0 \n"
3369 "pand %%xmm5,%%xmm1 \n"
3370 "paddusb %%xmm1,%%xmm0 \n"
3371 "sub $0x1,%3 \n"
3372 "movd %%xmm0,(%2) \n"
3373 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003374 "jge 10b \n"
3375
3376 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00003377 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003378 "jl 49f \n"
3379
fbarchard@google.com794fe122012-06-15 01:05:01 +00003380 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003381 ".p2align 2 \n"
3382 "41: \n"
3383 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003384 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003385 "movdqa %%xmm3,%%xmm0 \n"
3386 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003387 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003388 "psrlw $0x8,%%xmm3 \n"
3389 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3390 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003391 "pand %%xmm6,%%xmm2 \n"
3392 "paddw %%xmm7,%%xmm3 \n"
3393 "pmullw %%xmm3,%%xmm2 \n"
3394 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003395 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003396 "psrlw $0x8,%%xmm1 \n"
3397 "por %%xmm4,%%xmm0 \n"
3398 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003399 "psrlw $0x8,%%xmm2 \n"
3400 "paddusb %%xmm2,%%xmm0 \n"
3401 "pand %%xmm5,%%xmm1 \n"
3402 "paddusb %%xmm1,%%xmm0 \n"
3403 "sub $0x4,%3 \n"
3404 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003405 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003406 "jge 41b \n"
3407
3408 "49: \n"
3409 "add $0x3,%3 \n"
3410 "jl 99f \n"
3411
fbarchard@google.com794fe122012-06-15 01:05:01 +00003412 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003413 "91: \n"
3414 "movd (%0),%%xmm3 \n"
3415 "lea 0x4(%0),%0 \n"
3416 "movdqa %%xmm3,%%xmm0 \n"
3417 "pxor %%xmm4,%%xmm3 \n"
3418 "movd (%1),%%xmm2 \n"
3419 "psrlw $0x8,%%xmm3 \n"
3420 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3421 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3422 "pand %%xmm6,%%xmm2 \n"
3423 "paddw %%xmm7,%%xmm3 \n"
3424 "pmullw %%xmm3,%%xmm2 \n"
3425 "movd (%1),%%xmm1 \n"
3426 "lea 0x4(%1),%1 \n"
3427 "psrlw $0x8,%%xmm1 \n"
3428 "por %%xmm4,%%xmm0 \n"
3429 "pmullw %%xmm3,%%xmm1 \n"
3430 "psrlw $0x8,%%xmm2 \n"
3431 "paddusb %%xmm2,%%xmm0 \n"
3432 "pand %%xmm5,%%xmm1 \n"
3433 "paddusb %%xmm1,%%xmm0 \n"
3434 "sub $0x1,%3 \n"
3435 "movd %%xmm0,(%2) \n"
3436 "lea 0x4(%2),%2 \n"
3437 "jge 91b \n"
3438 "99: \n"
3439 : "+r"(src_argb0), // %0
3440 "+r"(src_argb1), // %1
3441 "+r"(dst_argb), // %2
3442 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003443 :
3444 : "memory", "cc"
3445#if defined(__SSE2__)
3446 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3447#endif
3448 );
3449}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003450#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003451
fbarchard@google.com96af8702012-04-06 18:22:27 +00003452#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003453// Shuffle table for isolating alpha.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003454CONST uvec8 kShuffleAlpha = {
3455 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3456 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3457};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003458
3459// Blend 8 pixels at a time
3460// Shuffle table for reversing the bytes.
3461
3462// Same as SSE2, but replaces
3463// psrlw xmm3, 8 // alpha
3464// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3465// pshuflw xmm3, xmm3,0F5h
3466// with..
3467// pshufb xmm3, kShuffleAlpha // alpha
3468
3469void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3470 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003471 asm volatile (
3472 "pcmpeqb %%xmm7,%%xmm7 \n"
3473 "psrlw $0xf,%%xmm7 \n"
3474 "pcmpeqb %%xmm6,%%xmm6 \n"
3475 "psrlw $0x8,%%xmm6 \n"
3476 "pcmpeqb %%xmm5,%%xmm5 \n"
3477 "psllw $0x8,%%xmm5 \n"
3478 "pcmpeqb %%xmm4,%%xmm4 \n"
3479 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003480 "sub $0x1,%3 \n"
3481 "je 91f \n"
3482 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003483
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003484 // 1 pixel loop until destination pointer is aligned.
3485 "10: \n"
3486 "test $0xf,%2 \n"
3487 "je 19f \n"
3488 "movd (%0),%%xmm3 \n"
3489 "lea 0x4(%0),%0 \n"
3490 "movdqa %%xmm3,%%xmm0 \n"
3491 "pxor %%xmm4,%%xmm3 \n"
3492 "movd (%1),%%xmm2 \n"
3493 "pshufb %4,%%xmm3 \n"
3494 "pand %%xmm6,%%xmm2 \n"
3495 "paddw %%xmm7,%%xmm3 \n"
3496 "pmullw %%xmm3,%%xmm2 \n"
3497 "movd (%1),%%xmm1 \n"
3498 "lea 0x4(%1),%1 \n"
3499 "psrlw $0x8,%%xmm1 \n"
3500 "por %%xmm4,%%xmm0 \n"
3501 "pmullw %%xmm3,%%xmm1 \n"
3502 "psrlw $0x8,%%xmm2 \n"
3503 "paddusb %%xmm2,%%xmm0 \n"
3504 "pand %%xmm5,%%xmm1 \n"
3505 "paddusb %%xmm1,%%xmm0 \n"
3506 "sub $0x1,%3 \n"
3507 "movd %%xmm0,(%2) \n"
3508 "lea 0x4(%2),%2 \n"
3509 "jge 10b \n"
3510
3511 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00003512 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003513 "jl 49f \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003514 "test $0xf,%0 \n"
3515 "jne 41f \n"
3516 "test $0xf,%1 \n"
3517 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003518
fbarchard@google.com794fe122012-06-15 01:05:01 +00003519 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003520 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003521 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003522 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003523 "lea 0x10(%0),%0 \n"
3524 "movdqa %%xmm3,%%xmm0 \n"
3525 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003526 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003527 "pshufb %4,%%xmm3 \n"
3528 "pand %%xmm6,%%xmm2 \n"
3529 "paddw %%xmm7,%%xmm3 \n"
3530 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003531 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003532 "lea 0x10(%1),%1 \n"
3533 "psrlw $0x8,%%xmm1 \n"
3534 "por %%xmm4,%%xmm0 \n"
3535 "pmullw %%xmm3,%%xmm1 \n"
3536 "psrlw $0x8,%%xmm2 \n"
3537 "paddusb %%xmm2,%%xmm0 \n"
3538 "pand %%xmm5,%%xmm1 \n"
3539 "paddusb %%xmm1,%%xmm0 \n"
3540 "sub $0x4,%3 \n"
3541 "movdqa %%xmm0,(%2) \n"
3542 "lea 0x10(%2),%2 \n"
3543 "jge 40b \n"
3544 "jmp 49f \n"
3545
3546 // 4 pixel unaligned loop.
3547 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003548 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003549 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003550 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003551 "movdqa %%xmm3,%%xmm0 \n"
3552 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003553 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003554 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003555 "pand %%xmm6,%%xmm2 \n"
3556 "paddw %%xmm7,%%xmm3 \n"
3557 "pmullw %%xmm3,%%xmm2 \n"
3558 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003559 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003560 "psrlw $0x8,%%xmm1 \n"
3561 "por %%xmm4,%%xmm0 \n"
3562 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003563 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003564 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003565 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003566 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003567 "sub $0x4,%3 \n"
3568 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003569 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003570 "jge 41b \n"
3571
3572 "49: \n"
3573 "add $0x3,%3 \n"
3574 "jl 99f \n"
3575
fbarchard@google.com794fe122012-06-15 01:05:01 +00003576 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003577 "91: \n"
3578 "movd (%0),%%xmm3 \n"
3579 "lea 0x4(%0),%0 \n"
3580 "movdqa %%xmm3,%%xmm0 \n"
3581 "pxor %%xmm4,%%xmm3 \n"
3582 "movd (%1),%%xmm2 \n"
3583 "pshufb %4,%%xmm3 \n"
3584 "pand %%xmm6,%%xmm2 \n"
3585 "paddw %%xmm7,%%xmm3 \n"
3586 "pmullw %%xmm3,%%xmm2 \n"
3587 "movd (%1),%%xmm1 \n"
3588 "lea 0x4(%1),%1 \n"
3589 "psrlw $0x8,%%xmm1 \n"
3590 "por %%xmm4,%%xmm0 \n"
3591 "pmullw %%xmm3,%%xmm1 \n"
3592 "psrlw $0x8,%%xmm2 \n"
3593 "paddusb %%xmm2,%%xmm0 \n"
3594 "pand %%xmm5,%%xmm1 \n"
3595 "paddusb %%xmm1,%%xmm0 \n"
3596 "sub $0x1,%3 \n"
3597 "movd %%xmm0,(%2) \n"
3598 "lea 0x4(%2),%2 \n"
3599 "jge 91b \n"
3600 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003601 : "+r"(src_argb0), // %0
3602 "+r"(src_argb1), // %1
3603 "+r"(dst_argb), // %2
3604 "+r"(width) // %3
3605 : "m"(kShuffleAlpha) // %4
3606 : "memory", "cc"
3607#if defined(__SSE2__)
3608 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3609#endif
3610 );
3611}
3612#endif // HAS_ARGBBLENDROW_SSSE3
3613
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003614#ifdef HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003615// Attenuate 4 pixels at a time.
3616// aligned to 16 bytes
3617void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3618 asm volatile (
3619 "sub %0,%1 \n"
3620 "pcmpeqb %%xmm4,%%xmm4 \n"
3621 "pslld $0x18,%%xmm4 \n"
3622 "pcmpeqb %%xmm5,%%xmm5 \n"
3623 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003624
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003625 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003626 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003627 "1: \n"
3628 "movdqa (%0),%%xmm0 \n"
3629 "punpcklbw %%xmm0,%%xmm0 \n"
3630 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3631 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3632 "pmulhuw %%xmm2,%%xmm0 \n"
3633 "movdqa (%0),%%xmm1 \n"
3634 "punpckhbw %%xmm1,%%xmm1 \n"
3635 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3636 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3637 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003638 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003639 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003640 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003641 "psrlw $0x8,%%xmm1 \n"
3642 "packuswb %%xmm1,%%xmm0 \n"
3643 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003644 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003645 "sub $0x4,%2 \n"
3646 "movdqa %%xmm0,(%0,%1,1) \n"
3647 "lea 0x10(%0),%0 \n"
3648 "jg 1b \n"
3649 : "+r"(src_argb), // %0
3650 "+r"(dst_argb), // %1
3651 "+r"(width) // %2
3652 :
3653 : "memory", "cc"
3654#if defined(__SSE2__)
3655 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3656#endif
3657 );
3658}
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003659#endif // HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003660
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003661#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003662// Shuffle table duplicating alpha
3663CONST uvec8 kShuffleAlpha0 = {
3664 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3665};
3666CONST uvec8 kShuffleAlpha1 = {
3667 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3668 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3669};
3670// Attenuate 4 pixels at a time.
3671// aligned to 16 bytes
3672void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3673 asm volatile (
3674 "sub %0,%1 \n"
3675 "pcmpeqb %%xmm3,%%xmm3 \n"
3676 "pslld $0x18,%%xmm3 \n"
3677 "movdqa %3,%%xmm4 \n"
3678 "movdqa %4,%%xmm5 \n"
3679
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003680 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003681 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003682 "1: \n"
3683 "movdqa (%0),%%xmm0 \n"
3684 "pshufb %%xmm4,%%xmm0 \n"
3685 "movdqa (%0),%%xmm1 \n"
3686 "punpcklbw %%xmm1,%%xmm1 \n"
3687 "pmulhuw %%xmm1,%%xmm0 \n"
3688 "movdqa (%0),%%xmm1 \n"
3689 "pshufb %%xmm5,%%xmm1 \n"
3690 "movdqa (%0),%%xmm2 \n"
3691 "punpckhbw %%xmm2,%%xmm2 \n"
3692 "pmulhuw %%xmm2,%%xmm1 \n"
3693 "movdqa (%0),%%xmm2 \n"
3694 "pand %%xmm3,%%xmm2 \n"
3695 "psrlw $0x8,%%xmm0 \n"
3696 "psrlw $0x8,%%xmm1 \n"
3697 "packuswb %%xmm1,%%xmm0 \n"
3698 "por %%xmm2,%%xmm0 \n"
3699 "sub $0x4,%2 \n"
3700 "movdqa %%xmm0,(%0,%1,1) \n"
3701 "lea 0x10(%0),%0 \n"
3702 "jg 1b \n"
3703 : "+r"(src_argb), // %0
3704 "+r"(dst_argb), // %1
3705 "+r"(width) // %2
3706 : "m"(kShuffleAlpha0), // %3
3707 "m"(kShuffleAlpha1) // %4
3708 : "memory", "cc"
3709#if defined(__SSE2__)
3710 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3711#endif
3712 );
3713}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003714#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003715
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003716#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003717// Unattenuate 4 pixels at a time.
3718// aligned to 16 bytes
3719void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3720 int width) {
3721 uintptr_t alpha = 0;
3722 asm volatile (
3723 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003724
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003725 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003726 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003727 "1: \n"
3728 "movdqa (%0),%%xmm0 \n"
3729 "movzb 0x3(%0),%3 \n"
3730 "punpcklbw %%xmm0,%%xmm0 \n"
3731 "movd 0x0(%4,%3,4),%%xmm2 \n"
3732 "movzb 0x7(%0),%3 \n"
3733 "movd 0x0(%4,%3,4),%%xmm3 \n"
fbarchard@google.com787f8272013-02-21 00:08:56 +00003734 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3735 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003736 "movlhps %%xmm3,%%xmm2 \n"
3737 "pmulhuw %%xmm2,%%xmm0 \n"
3738 "movdqa (%0),%%xmm1 \n"
3739 "movzb 0xb(%0),%3 \n"
3740 "punpckhbw %%xmm1,%%xmm1 \n"
3741 "movd 0x0(%4,%3,4),%%xmm2 \n"
3742 "movzb 0xf(%0),%3 \n"
3743 "movd 0x0(%4,%3,4),%%xmm3 \n"
fbarchard@google.com787f8272013-02-21 00:08:56 +00003744 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3745 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003746 "movlhps %%xmm3,%%xmm2 \n"
3747 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003748 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003749 "sub $0x4,%2 \n"
3750 "movdqa %%xmm0,(%0,%1,1) \n"
3751 "lea 0x10(%0),%0 \n"
3752 "jg 1b \n"
3753 : "+r"(src_argb), // %0
3754 "+r"(dst_argb), // %1
3755 "+r"(width), // %2
3756 "+r"(alpha) // %3
3757 : "r"(fixed_invtbl8) // %4
3758 : "memory", "cc"
3759#if defined(__SSE2__)
3760 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3761#endif
3762 );
3763}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003764#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003765
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003766#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003767// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
fbarchard@google.com221e6022012-05-21 22:24:41 +00003768CONST vec8 kARGBToGray = {
3769 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3770};
3771
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003772// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003773void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003774 asm volatile (
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003775 "movdqa %3,%%xmm4 \n"
3776 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003777
3778 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003779 ".p2align 4 \n"
3780 "1: \n"
3781 "movdqa (%0),%%xmm0 \n"
3782 "movdqa 0x10(%0),%%xmm1 \n"
3783 "pmaddubsw %%xmm4,%%xmm0 \n"
3784 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003785 "phaddw %%xmm1,%%xmm0 \n"
3786 "psrlw $0x7,%%xmm0 \n"
3787 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003788 "movdqa (%0),%%xmm2 \n"
3789 "movdqa 0x10(%0),%%xmm3 \n"
3790 "psrld $0x18,%%xmm2 \n"
3791 "psrld $0x18,%%xmm3 \n"
3792 "packuswb %%xmm3,%%xmm2 \n"
3793 "packuswb %%xmm2,%%xmm2 \n"
3794 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003795 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003796 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003797 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003798 "punpcklwd %%xmm3,%%xmm0 \n"
3799 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003800 "sub $0x8,%2 \n"
3801 "movdqa %%xmm0,(%0,%1,1) \n"
3802 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003803 "lea 0x20(%0),%0 \n"
3804 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003805 : "+r"(src_argb), // %0
3806 "+r"(dst_argb), // %1
3807 "+r"(width) // %2
3808 : "m"(kARGBToGray) // %3
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003809 : "memory", "cc"
3810#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00003811 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003812#endif
3813 );
3814}
3815#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003816
3817#ifdef HAS_ARGBSEPIAROW_SSSE3
3818// b = (r * 35 + g * 68 + b * 17) >> 7
3819// g = (r * 45 + g * 88 + b * 22) >> 7
3820// r = (r * 50 + g * 98 + b * 24) >> 7
3821// Constant for ARGB color to sepia tone
3822CONST vec8 kARGBToSepiaB = {
3823 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3824};
3825
3826CONST vec8 kARGBToSepiaG = {
3827 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3828};
3829
3830CONST vec8 kARGBToSepiaR = {
3831 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3832};
3833
fbarchard@google.come442dc42012-06-18 17:37:09 +00003834// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003835void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3836 asm volatile (
3837 "movdqa %2,%%xmm2 \n"
3838 "movdqa %3,%%xmm3 \n"
3839 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003840
3841 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003842 ".p2align 4 \n"
3843 "1: \n"
3844 "movdqa (%0),%%xmm0 \n"
3845 "movdqa 0x10(%0),%%xmm6 \n"
3846 "pmaddubsw %%xmm2,%%xmm0 \n"
3847 "pmaddubsw %%xmm2,%%xmm6 \n"
3848 "phaddw %%xmm6,%%xmm0 \n"
3849 "psrlw $0x7,%%xmm0 \n"
3850 "packuswb %%xmm0,%%xmm0 \n"
3851 "movdqa (%0),%%xmm5 \n"
3852 "movdqa 0x10(%0),%%xmm1 \n"
3853 "pmaddubsw %%xmm3,%%xmm5 \n"
3854 "pmaddubsw %%xmm3,%%xmm1 \n"
3855 "phaddw %%xmm1,%%xmm5 \n"
3856 "psrlw $0x7,%%xmm5 \n"
3857 "packuswb %%xmm5,%%xmm5 \n"
3858 "punpcklbw %%xmm5,%%xmm0 \n"
3859 "movdqa (%0),%%xmm5 \n"
3860 "movdqa 0x10(%0),%%xmm1 \n"
3861 "pmaddubsw %%xmm4,%%xmm5 \n"
3862 "pmaddubsw %%xmm4,%%xmm1 \n"
3863 "phaddw %%xmm1,%%xmm5 \n"
3864 "psrlw $0x7,%%xmm5 \n"
3865 "packuswb %%xmm5,%%xmm5 \n"
3866 "movdqa (%0),%%xmm6 \n"
3867 "movdqa 0x10(%0),%%xmm1 \n"
3868 "psrld $0x18,%%xmm6 \n"
3869 "psrld $0x18,%%xmm1 \n"
3870 "packuswb %%xmm1,%%xmm6 \n"
3871 "packuswb %%xmm6,%%xmm6 \n"
3872 "punpcklbw %%xmm6,%%xmm5 \n"
3873 "movdqa %%xmm0,%%xmm1 \n"
3874 "punpcklwd %%xmm5,%%xmm0 \n"
3875 "punpckhwd %%xmm5,%%xmm1 \n"
3876 "sub $0x8,%1 \n"
3877 "movdqa %%xmm0,(%0) \n"
3878 "movdqa %%xmm1,0x10(%0) \n"
3879 "lea 0x20(%0),%0 \n"
3880 "jg 1b \n"
3881 : "+r"(dst_argb), // %0
3882 "+r"(width) // %1
3883 : "m"(kARGBToSepiaB), // %2
3884 "m"(kARGBToSepiaG), // %3
3885 "m"(kARGBToSepiaR) // %4
3886 : "memory", "cc"
3887#if defined(__SSE2__)
3888 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3889#endif
3890 );
3891}
3892#endif // HAS_ARGBSEPIAROW_SSSE3
3893
fbarchard@google.come442dc42012-06-18 17:37:09 +00003894#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3895// Tranform 8 ARGB pixels (32 bytes) with color matrix.
3896// Same as Sepia except matrix is provided.
3897void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3898 int width) {
3899 asm volatile (
3900 "movd (%2),%%xmm2 \n"
3901 "movd 0x4(%2),%%xmm3 \n"
3902 "movd 0x8(%2),%%xmm4 \n"
3903 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3904 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3905 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003906
3907 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003908 ".p2align 4 \n"
3909 "1: \n"
3910 "movdqa (%0),%%xmm0 \n"
3911 "movdqa 0x10(%0),%%xmm6 \n"
3912 "pmaddubsw %%xmm2,%%xmm0 \n"
3913 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003914 "movdqa (%0),%%xmm5 \n"
3915 "movdqa 0x10(%0),%%xmm1 \n"
3916 "pmaddubsw %%xmm3,%%xmm5 \n"
3917 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003918 "phaddsw %%xmm6,%%xmm0 \n"
3919 "phaddsw %%xmm1,%%xmm5 \n"
3920 "psraw $0x7,%%xmm0 \n"
3921 "psraw $0x7,%%xmm5 \n"
3922 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003923 "packuswb %%xmm5,%%xmm5 \n"
3924 "punpcklbw %%xmm5,%%xmm0 \n"
3925 "movdqa (%0),%%xmm5 \n"
3926 "movdqa 0x10(%0),%%xmm1 \n"
3927 "pmaddubsw %%xmm4,%%xmm5 \n"
3928 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003929 "phaddsw %%xmm1,%%xmm5 \n"
3930 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003931 "packuswb %%xmm5,%%xmm5 \n"
3932 "movdqa (%0),%%xmm6 \n"
3933 "movdqa 0x10(%0),%%xmm1 \n"
3934 "psrld $0x18,%%xmm6 \n"
3935 "psrld $0x18,%%xmm1 \n"
3936 "packuswb %%xmm1,%%xmm6 \n"
3937 "packuswb %%xmm6,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003938 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003939 "punpcklbw %%xmm6,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003940 "punpcklwd %%xmm5,%%xmm0 \n"
3941 "punpckhwd %%xmm5,%%xmm1 \n"
3942 "sub $0x8,%1 \n"
3943 "movdqa %%xmm0,(%0) \n"
3944 "movdqa %%xmm1,0x10(%0) \n"
3945 "lea 0x20(%0),%0 \n"
3946 "jg 1b \n"
3947 : "+r"(dst_argb), // %0
3948 "+r"(width) // %1
3949 : "r"(matrix_argb) // %2
3950 : "memory", "cc"
3951#if defined(__SSE2__)
3952 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3953#endif
3954 );
3955}
3956#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3957
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003958#ifdef HAS_ARGBQUANTIZEROW_SSE2
3959// Quantize 4 ARGB pixels (16 bytes).
3960// aligned to 16 bytes
3961void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3962 int interval_offset, int width) {
3963 asm volatile (
3964 "movd %2,%%xmm2 \n"
3965 "movd %3,%%xmm3 \n"
3966 "movd %4,%%xmm4 \n"
3967 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3968 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3969 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3970 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3971 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3972 "pshufd $0x44,%%xmm4,%%xmm4 \n"
3973 "pxor %%xmm5,%%xmm5 \n"
3974 "pcmpeqb %%xmm6,%%xmm6 \n"
3975 "pslld $0x18,%%xmm6 \n"
3976
3977 // 4 pixel loop.
3978 ".p2align 2 \n"
3979 "1: \n"
3980 "movdqa (%0),%%xmm0 \n"
3981 "punpcklbw %%xmm5,%%xmm0 \n"
3982 "pmulhuw %%xmm2,%%xmm0 \n"
3983 "movdqa (%0),%%xmm1 \n"
3984 "punpckhbw %%xmm5,%%xmm1 \n"
3985 "pmulhuw %%xmm2,%%xmm1 \n"
3986 "pmullw %%xmm3,%%xmm0 \n"
3987 "movdqa (%0),%%xmm7 \n"
3988 "pmullw %%xmm3,%%xmm1 \n"
3989 "pand %%xmm6,%%xmm7 \n"
3990 "paddw %%xmm4,%%xmm0 \n"
3991 "paddw %%xmm4,%%xmm1 \n"
3992 "packuswb %%xmm1,%%xmm0 \n"
3993 "por %%xmm7,%%xmm0 \n"
3994 "sub $0x4,%1 \n"
3995 "movdqa %%xmm0,(%0) \n"
3996 "lea 0x10(%0),%0 \n"
3997 "jg 1b \n"
3998 : "+r"(dst_argb), // %0
3999 "+r"(width) // %1
4000 : "r"(scale), // %2
4001 "r"(interval_size), // %3
4002 "r"(interval_offset) // %4
4003 : "memory", "cc"
4004#if defined(__SSE2__)
4005 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4006#endif
4007 );
4008}
4009#endif // HAS_ARGBQUANTIZEROW_SSE2
4010
fbarchard@google.comb94b1392012-12-03 20:36:40 +00004011#ifdef HAS_ARGBSHADEROW_SSE2
4012// Shade 4 pixels at a time by specified value.
4013// Aligned to 16 bytes.
4014void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4015 uint32 value) {
4016 asm volatile (
4017 "movd %3,%%xmm2 \n"
4018 "sub %0,%1 \n"
4019 "punpcklbw %%xmm2,%%xmm2 \n"
4020 "punpcklqdq %%xmm2,%%xmm2 \n"
4021
4022 // 4 pixel loop.
4023 ".p2align 2 \n"
4024 "1: \n"
4025 "movdqa (%0),%%xmm0 \n"
4026 "movdqa %%xmm0,%%xmm1 \n"
4027 "punpcklbw %%xmm0,%%xmm0 \n"
4028 "punpckhbw %%xmm1,%%xmm1 \n"
4029 "pmulhuw %%xmm2,%%xmm0 \n"
4030 "pmulhuw %%xmm2,%%xmm1 \n"
4031 "psrlw $0x8,%%xmm0 \n"
4032 "psrlw $0x8,%%xmm1 \n"
4033 "packuswb %%xmm1,%%xmm0 \n"
4034 "sub $0x4,%2 \n"
4035 "movdqa %%xmm0,(%0,%1,1) \n"
4036 "lea 0x10(%0),%0 \n"
4037 "jg 1b \n"
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004038 : "+r"(src_argb), // %0
4039 "+r"(dst_argb), // %1
4040 "+r"(width) // %2
4041 : "r"(value) // %3
fbarchard@google.comb94b1392012-12-03 20:36:40 +00004042 : "memory", "cc"
4043#if defined(__SSE2__)
4044 , "xmm0", "xmm1", "xmm2"
4045#endif
4046 );
4047}
4048#endif // HAS_ARGBSHADEROW_SSE2
4049
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004050#ifdef HAS_ARGBMULTIPLYROW_SSE2
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004051// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004052// Aligned to 16 bytes.
// Per byte: src_argb0 is widened by byte duplication (a * 0x0101), src_argb1
// is zero extended, and pmulhuw keeps the high 16 bits of the product.
//   src_argb0, src_argb1 - source rows, loaded with movdqa (16 byte aligned).
//   dst_argb             - destination row, stored with movdqa.
//   width                - pixel count; tested after each group of 4 pixels,
//                          so at least one group is always processed.
fbarchard@google.com8fa76342013-01-18 02:25:04 +00004053void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4054 uint8* dst_argb, int width) {
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004055 asm volatile (
// xmm5 = 0, used to zero extend src_argb1 bytes.  %1 and %2 are turned into
// offsets relative to %0 so a single pointer increment advances all rows.
4056 "pxor %%xmm5,%%xmm5 \n"
4057 "sub %0,%1 \n"
fbarchard@google.com8fa76342013-01-18 02:25:04 +00004058 "sub %0,%2 \n"
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004059
4060 // 4 pixel loop.
4061 ".p2align 4 \n"
4062 "1: \n"
4063 "movdqa (%0),%%xmm0 \n"
4064 "movdqa (%0,%1),%%xmm2 \n"
4065 "movdqa %%xmm0,%%xmm1 \n"
4066 "movdqa %%xmm2,%%xmm3 \n"
4067 "punpcklbw %%xmm0,%%xmm0 \n"
4068 "punpckhbw %%xmm1,%%xmm1 \n"
4069 "punpcklbw %%xmm5,%%xmm2 \n"
4070 "punpckhbw %%xmm5,%%xmm3 \n"
4071 "pmulhuw %%xmm2,%%xmm0 \n"
4072 "pmulhuw %%xmm3,%%xmm1 \n"
4073 "packuswb %%xmm1,%%xmm0 \n"
// Flags from this sub feed the jg below; movdqa and lea leave them intact.
fbarchard@google.com8fa76342013-01-18 02:25:04 +00004074 "sub $0x4,%3 \n"
4075 "movdqa %%xmm0,(%0,%2,1) \n"
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004076 "lea 0x10(%0),%0 \n"
4077 "jg 1b \n"
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004078 : "+r"(src_argb0), // %0
4079 "+r"(src_argb1), // %1
fbarchard@google.com8fa76342013-01-18 02:25:04 +00004080 "+r"(dst_argb), // %2
4081 "+r"(width) // %3
fbarchard@google.com8ec60332013-01-17 20:18:08 +00004082 :
4083 : "memory", "cc"
4084#if defined(__SSE2__)
4085 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4086#endif
4087 );
4088}
4089#endif // HAS_ARGBMULTIPLYROW_SSE2
4090
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004091#ifdef HAS_ARGBADDROW_SSE2
4092// Add 2 rows of ARGB pixels together, 4 pixels at a time.
4093// Aligned to 16 bytes.
// The addition is per byte with unsigned saturation (paddusb).
//   src_argb0, src_argb1 - source rows, loaded with movdqa (16 byte aligned).
//   dst_argb             - destination row, stored with movdqa.
//   width                - pixel count; tested after each group of 4 pixels,
//                          so at least one group is always processed.
4094void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4095 uint8* dst_argb, int width) {
4096 asm volatile (
// Convert %1 and %2 into offsets from %0 so only %0 needs incrementing.
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004097 "sub %0,%1 \n"
4098 "sub %0,%2 \n"
4099
4100 // 4 pixel loop.
4101 ".p2align 4 \n"
4102 "1: \n"
4103 "movdqa (%0),%%xmm0 \n"
4104 "movdqa (%0,%1),%%xmm1 \n"
4105 "paddusb %%xmm1,%%xmm0 \n"
// Flags from this sub feed the jg below; movdqa and lea leave them intact.
4106 "sub $0x4,%3 \n"
4107 "movdqa %%xmm0,(%0,%2,1) \n"
4108 "lea 0x10(%0),%0 \n"
4109 "jg 1b \n"
4110 : "+r"(src_argb0), // %0
4111 "+r"(src_argb1), // %1
4112 "+r"(dst_argb), // %2
4113 "+r"(width) // %3
4114 :
4115 : "memory", "cc"
4116#if defined(__SSE2__)
fbarchard@google.com573a8832013-01-24 23:08:12 +00004117 , "xmm0", "xmm1"
fbarchard@google.com83e1b172013-01-18 23:03:56 +00004118#endif
4119 );
4120}
4121#endif // HAS_ARGBADDROW_SSE2
4122
fbarchard@google.com573a8832013-01-24 23:08:12 +00004123#ifdef HAS_ARGBSUBTRACTROW_SSE2
4124// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
4125// Aligned to 16 bytes.
// Computes src_argb0 - src_argb1 per byte with unsigned saturation (psubusb),
// so results that would be negative become 0.
//   src_argb0, src_argb1 - source rows, loaded with movdqa (16 byte aligned).
//   dst_argb             - destination row, stored with movdqa.
//   width                - pixel count; tested after each group of 4 pixels,
//                          so at least one group is always processed.
4126void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4127 uint8* dst_argb, int width) {
4128 asm volatile (
// Convert %1 and %2 into offsets from %0 so only %0 needs incrementing.
4129 "sub %0,%1 \n"
4130 "sub %0,%2 \n"
4131
4132 // 4 pixel loop.
4133 ".p2align 4 \n"
4134 "1: \n"
4135 "movdqa (%0),%%xmm0 \n"
4136 "movdqa (%0,%1),%%xmm1 \n"
4137 "psubusb %%xmm1,%%xmm0 \n"
// Flags from this sub feed the jg below; movdqa and lea leave them intact.
4138 "sub $0x4,%3 \n"
4139 "movdqa %%xmm0,(%0,%2,1) \n"
4140 "lea 0x10(%0),%0 \n"
4141 "jg 1b \n"
4142 : "+r"(src_argb0), // %0
4143 "+r"(src_argb1), // %1
4144 "+r"(dst_argb), // %2
4145 "+r"(width) // %3
4146 :
4147 : "memory", "cc"
4148#if defined(__SSE2__)
4149 , "xmm0", "xmm1"
4150#endif
4151 );
4152}
4153#endif // HAS_ARGBSUBTRACTROW_SSE2
4154
fbarchard@google.com9d48df92013-03-24 20:12:25 +00004155#ifdef HAS_SOBELXROW_SSSE3
4156// SobelX as a matrix is
4157// -1 0 1
4158// -2 0 2
4159// -1 0 1
// For each output x the code forms d(row) = row[x] - row[x + 2] for the three
// source rows, sums d(y0) + 2 * d(y1) + d(y2), takes the absolute value
// (pabsw) and saturates to a byte.  The sign is opposite to the matrix above,
// which the absolute value makes irrelevant.
//   src_y0, src_y1, src_y2 - three source rows; 8 bytes are read at offset 0
//                            and at offset 2, so each group of 8 outputs reads
//                            10 source bytes per row.
//   dst_sobelx             - destination, 8 bytes written per iteration.
//   width                  - output count; tested after each group of 8, so
//                            at least one group is always processed.
// All loads and stores use movq, which has no alignment requirement.
4160void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
4161 const uint8* src_y2, uint8* dst_sobelx, int width) {
4162 asm volatile (
// %1..%3 become offsets from %0; xmm5 = 0 for zero extending bytes to words.
4163 "sub %0,%1 \n"
4164 "sub %0,%2 \n"
4165 "sub %0,%3 \n"
4166 "pxor %%xmm5,%%xmm5 \n"
4167
4168 // 8 pixel loop.
4169 ".p2align 4 \n"
4170 "1: \n"
// xmm0 = d(y0)
4171 "movq (%0),%%xmm0 \n"
4172 "movq 0x2(%0),%%xmm1 \n"
4173 "punpcklbw %%xmm5,%%xmm0 \n"
4174 "punpcklbw %%xmm5,%%xmm1 \n"
4175 "psubw %%xmm1,%%xmm0 \n"
// xmm1 = d(y1)
4176 "movq (%0,%1,1),%%xmm1 \n"
4177 "movq 0x2(%0,%1,1),%%xmm2 \n"
4178 "punpcklbw %%xmm5,%%xmm1 \n"
4179 "punpcklbw %%xmm5,%%xmm2 \n"
4180 "psubw %%xmm2,%%xmm1 \n"
// xmm2 = d(y2)
4181 "movq (%0,%2,1),%%xmm2 \n"
4182 "movq 0x2(%0,%2,1),%%xmm3 \n"
4183 "punpcklbw %%xmm5,%%xmm2 \n"
4184 "punpcklbw %%xmm5,%%xmm3 \n"
4185 "psubw %%xmm3,%%xmm2 \n"
// Middle row is added twice to give it weight 2.
4186 "paddw %%xmm2,%%xmm0 \n"
4187 "paddw %%xmm1,%%xmm0 \n"
4188 "paddw %%xmm1,%%xmm0 \n"
4189 "pabsw %%xmm0,%%xmm0 \n"
4190 "packuswb %%xmm0,%%xmm0 \n"
// Flags from this sub feed the jg below; movq and lea leave them intact.
4191 "sub $0x8,%4 \n"
4192 "movq %%xmm0,(%0,%3,1) \n"
4193 "lea 0x8(%0),%0 \n"
4194 "jg 1b \n"
4195 : "+r"(src_y0), // %0
4196 "+r"(src_y1), // %1
4197 "+r"(src_y2), // %2
4198 "+r"(dst_sobelx), // %3
4199 "+r"(width) // %4
4200 :
4201 : "memory", "cc"
4202#if defined(__SSE2__)
4203 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4204#endif
4205 );
4206}
4207#endif // HAS_SOBELXROW_SSSE3
4208
4209#ifdef HAS_SOBELYROW_SSSE3
4210// SobelY as a matrix is
4211// -1 -2 -1
4212// 0 0 0
4213// 1 2 1
// For each output x the code forms d(k) = src_y0[x + k] - src_y1[x + k] for
// k = 0, 1, 2, sums d(0) + 2 * d(1) + d(2), takes the absolute value (pabsw)
// and saturates to a byte.
//   src_y0, src_y1 - the two source rows that are differenced; 8 bytes are
//                    read at offsets 0, 1 and 2, so each group of 8 outputs
//                    reads 10 source bytes per row.
//   dst_sobely     - destination, 8 bytes written per iteration.
//   width          - output count; tested after each group of 8, so at least
//                    one group is always processed.
// All loads and stores use movq, which has no alignment requirement.
4214void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
4215 uint8* dst_sobely, int width) {
4216 asm volatile (
// %1 and %2 become offsets from %0; xmm5 = 0 for zero extending bytes.
4217 "sub %0,%1 \n"
4218 "sub %0,%2 \n"
4219 "pxor %%xmm5,%%xmm5 \n"
4220
4221 // 8 pixel loop.
4222 ".p2align 4 \n"
4223 "1: \n"
// xmm0 = d(0)
4224 "movq (%0),%%xmm0 \n"
4225 "movq (%0,%1,1),%%xmm1 \n"
4226 "punpcklbw %%xmm5,%%xmm0 \n"
4227 "punpcklbw %%xmm5,%%xmm1 \n"
4228 "psubw %%xmm1,%%xmm0 \n"
// xmm1 = d(1)
4229 "movq 0x1(%0),%%xmm1 \n"
4230 "movq 0x1(%0,%1,1),%%xmm2 \n"
4231 "punpcklbw %%xmm5,%%xmm1 \n"
4232 "punpcklbw %%xmm5,%%xmm2 \n"
4233 "psubw %%xmm2,%%xmm1 \n"
// xmm2 = d(2)
4234 "movq 0x2(%0),%%xmm2 \n"
4235 "movq 0x2(%0,%1,1),%%xmm3 \n"
4236 "punpcklbw %%xmm5,%%xmm2 \n"
4237 "punpcklbw %%xmm5,%%xmm3 \n"
4238 "psubw %%xmm3,%%xmm2 \n"
// Middle column is added twice to give it weight 2.
4239 "paddw %%xmm2,%%xmm0 \n"
4240 "paddw %%xmm1,%%xmm0 \n"
4241 "paddw %%xmm1,%%xmm0 \n"
4242 "pabsw %%xmm0,%%xmm0 \n"
4243 "packuswb %%xmm0,%%xmm0 \n"
// Flags from this sub feed the jg below; movq and lea leave them intact.
4244 "sub $0x8,%3 \n"
4245 "movq %%xmm0,(%0,%2,1) \n"
4246 "lea 0x8(%0),%0 \n"
4247 "jg 1b \n"
4248 : "+r"(src_y0), // %0
4249 "+r"(src_y1), // %1
4250 "+r"(dst_sobely), // %2
4251 "+r"(width) // %3
4252 :
4253 : "memory", "cc"
4254#if defined(__SSE2__)
4255 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4256#endif
4257 );
4258}
4259#endif // HAS_SOBELYROW_SSSE3
4260
4261#ifdef HAS_SOBELROW_SSE2
4262// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
4263// A = 255
4264// R = Sobel
4265// G = Sobel
4266// B = Sobel
// Sobel = saturating byte add (paddusb) of the two inputs.  Each result byte
// is replicated into all 4 bytes of an output pixel and the top byte is then
// forced to 0xff by OR-ing with 0xff000000.
//   src_sobelx, src_sobely - source rows, 16 bytes loaded per iteration with
//                            movdqa (16 byte aligned).
//   dst_argb               - destination, 64 bytes stored per iteration with
//                            movdqa (16 byte aligned).
//   width                  - pixel count; tested after each group of 16, so
//                            at least one group is always processed.
4267void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4268 uint8* dst_argb, int width) {
4269 asm volatile (
// %1 becomes an offset from %0.  xmm5 = 0xff000000 in every dword (alpha).
4270 "sub %0,%1 \n"
4271 "pcmpeqb %%xmm5,%%xmm5 \n"
4272 "pslld $0x18,%%xmm5 \n"
4273
4274 // 16 pixel loop.
4275 ".p2align 4 \n"
4276 "1: \n"
4277 "movdqa (%0),%%xmm0 \n"
4278 "movdqa (%0,%1,1),%%xmm1 \n"
4279 "lea 0x10(%0),%0 \n"
4280 "paddusb %%xmm1,%%xmm0 \n"
// Replicate bytes to words, then words to dwords: xmm1, xmm2, xmm3, xmm0 end
// up holding pixels 0-3, 4-7, 8-11 and 12-15 respectively.
4281 "movdqa %%xmm0,%%xmm2 \n"
4282 "punpcklbw %%xmm0,%%xmm2 \n"
4283 "punpckhbw %%xmm0,%%xmm0 \n"
4284 "movdqa %%xmm2,%%xmm1 \n"
4285 "punpcklwd %%xmm2,%%xmm1 \n"
4286 "punpckhwd %%xmm2,%%xmm2 \n"
4287 "por %%xmm5,%%xmm1 \n"
4288 "por %%xmm5,%%xmm2 \n"
4289 "movdqa %%xmm0,%%xmm3 \n"
4290 "punpcklwd %%xmm0,%%xmm3 \n"
4291 "punpckhwd %%xmm0,%%xmm0 \n"
4292 "por %%xmm5,%%xmm3 \n"
4293 "por %%xmm5,%%xmm0 \n"
// Flags from this sub feed the jg below; movdqa and lea leave them intact.
4294 "sub $0x10,%3 \n"
4295 "movdqa %%xmm1,(%2) \n"
4296 "movdqa %%xmm2,0x10(%2) \n"
4297 "movdqa %%xmm3,0x20(%2) \n"
4298 "movdqa %%xmm0,0x30(%2) \n"
4299 "lea 0x40(%2),%2 \n"
4300 "jg 1b \n"
4301 : "+r"(src_sobelx), // %0
4302 "+r"(src_sobely), // %1
4303 "+r"(dst_argb), // %2
4304 "+r"(width) // %3
4305 :
4306 : "memory", "cc"
4307#if defined(__SSE2__)
4308 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4309#endif
4310 );
4311}
4312#endif // HAS_SOBELROW_SSE2
4313
4314#ifdef HAS_SOBELXYROW_SSE2
4315// Mixes Sobel X, Sobel Y and Sobel into ARGB.
4316// A = 255
4317// R = Sobel X
4318// G = Sobel
4319// B = Sobel Y
// Sobel = saturating byte add (paddusb) of the two inputs.  The byte order
// written to memory for each pixel is: Sobel Y, Sobel, Sobel X, 0xff.
//   src_sobelx, src_sobely - source rows, 16 bytes loaded per iteration with
//                            movdqa (16 byte aligned).
//   dst_argb               - destination, 64 bytes stored per iteration with
//                            movdqa (16 byte aligned).
//   width                  - pixel count; tested after each group of 16, so
//                            at least one group is always processed.
4320void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4321 uint8* dst_argb, int width) {
4322 asm volatile (
// %1 becomes an offset from %0.  xmm5 = all ones, the source of alpha 0xff.
4323 "sub %0,%1 \n"
4324 "pcmpeqb %%xmm5,%%xmm5 \n"
4325
4326 // 16 pixel loop.
4327 ".p2align 4 \n"
4328 "1: \n"
4329 "movdqa (%0),%%xmm0 \n"
4330 "movdqa (%0,%1,1),%%xmm1 \n"
4331 "lea 0x10(%0),%0 \n"
// xmm2 = Sobel (x + y, saturated).
4332 "movdqa %%xmm0,%%xmm2 \n"
4333 "paddusb %%xmm1,%%xmm2 \n"
// xmm3 / xmm0 = (Sobel X, 0xff) byte pairs for the low / high 8 pixels.
4334 "movdqa %%xmm0,%%xmm3 \n"
4335 "punpcklbw %%xmm5,%%xmm3 \n"
4336 "punpckhbw %%xmm5,%%xmm0 \n"
// xmm4 / xmm1 = (Sobel Y, Sobel) byte pairs for the low / high 8 pixels.
4337 "movdqa %%xmm1,%%xmm4 \n"
4338 "punpcklbw %%xmm2,%%xmm4 \n"
4339 "punpckhbw %%xmm2,%%xmm1 \n"
// Interleave the word pairs into complete 4 byte pixels.
4340 "movdqa %%xmm4,%%xmm6 \n"
4341 "punpcklwd %%xmm3,%%xmm6 \n"
4342 "punpckhwd %%xmm3,%%xmm4 \n"
4343 "movdqa %%xmm1,%%xmm7 \n"
4344 "punpcklwd %%xmm0,%%xmm7 \n"
4345 "punpckhwd %%xmm0,%%xmm1 \n"
// Flags from this sub feed the jg below; movdqa and lea leave them intact.
4346 "sub $0x10,%3 \n"
4347 "movdqa %%xmm6,(%2) \n"
4348 "movdqa %%xmm4,0x10(%2) \n"
4349 "movdqa %%xmm7,0x20(%2) \n"
4350 "movdqa %%xmm1,0x30(%2) \n"
4351 "lea 0x40(%2),%2 \n"
4352 "jg 1b \n"
4353 : "+r"(src_sobelx), // %0
4354 "+r"(src_sobely), // %1
4355 "+r"(dst_argb), // %2
4356 "+r"(width) // %3
4357 :
4358 : "memory", "cc"
4359#if defined(__SSE2__)
4360 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4361#endif
4362 );
4363}
4364#endif // HAS_SOBELXYROW_SSE2
4365
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004366#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4367// Creates a table of cumulative sums where each value is a sum of all values
4368// above and to the left of the value, inclusive of the value.
// Each 4 byte source pixel is widened to four int32 values.  A running sum of
// the pixels seen so far in this row is kept in xmm0, and for every pixel the
// running sum plus the matching 4 int32 entries of previous_cumsum is written
// to cumsum.
//   row             - source bytes, 4 per pixel (read with movdqu / movd).
//   cumsum          - output, 4 int32 (16 bytes) per pixel.
//   previous_cumsum - 4 int32 per pixel that are added to the running sum.
//   width           - pixel count.  Groups of 4 use the fast loop when cumsum
//                     is 16 byte aligned; remaining pixels (or all of them if
//                     cumsum is unaligned) use the 1 pixel loop.
// NOTE(review): only cumsum is tested for alignment, yet the 4 pixel loop also
// reads previous_cumsum with movdqa - presumably callers keep both buffers
// equally aligned; confirm.
4369void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
fbarchard@google.com133adc42012-06-12 05:46:49 +00004370 const int32* previous_cumsum, int width) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004371 asm volatile (
// %2 becomes (previous_cumsum - cumsum) so (%1,%2,1) addresses the previous
// row.  xmm0 = running sum, xmm1 = 0 for zero extension.
4372 "sub %1,%2 \n"
4373 "pxor %%xmm0,%%xmm0 \n"
4374 "pxor %%xmm1,%%xmm1 \n"
// width is biased by -4 while in the 4 pixel loop; label 49 undoes the bias.
4375 "sub $0x4,%3 \n"
4376 "jl 49f \n"
4377 "test $0xf,%1 \n"
4378 "jne 49f \n"
4379
4380 // 4 pixel loop \n"
4381 ".p2align 2 \n"
4382 "40: \n"
// Widen 16 bytes into xmm2, xmm3, xmm4, xmm5 = pixels 0..3 as 4 x int32.
4383 "movdqu (%0),%%xmm2 \n"
4384 "lea 0x10(%0),%0 \n"
4385 "movdqa %%xmm2,%%xmm4 \n"
4386 "punpcklbw %%xmm1,%%xmm2 \n"
4387 "movdqa %%xmm2,%%xmm3 \n"
4388 "punpcklwd %%xmm1,%%xmm2 \n"
4389 "punpckhwd %%xmm1,%%xmm3 \n"
4390 "punpckhbw %%xmm1,%%xmm4 \n"
4391 "movdqa %%xmm4,%%xmm5 \n"
4392 "punpcklwd %%xmm1,%%xmm4 \n"
4393 "punpckhwd %%xmm1,%%xmm5 \n"
// For each pixel: add it to the running sum, then add the previous row entry.
4394 "paddd %%xmm2,%%xmm0 \n"
4395 "movdqa (%1,%2,1),%%xmm2 \n"
4396 "paddd %%xmm0,%%xmm2 \n"
4397 "paddd %%xmm3,%%xmm0 \n"
4398 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
4399 "paddd %%xmm0,%%xmm3 \n"
4400 "paddd %%xmm4,%%xmm0 \n"
4401 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
4402 "paddd %%xmm0,%%xmm4 \n"
4403 "paddd %%xmm5,%%xmm0 \n"
4404 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
4405 "paddd %%xmm0,%%xmm5 \n"
4406 "movdqa %%xmm2,(%1) \n"
4407 "movdqa %%xmm3,0x10(%1) \n"
4408 "movdqa %%xmm4,0x20(%1) \n"
4409 "movdqa %%xmm5,0x30(%1) \n"
4410 "lea 0x40(%1),%1 \n"
4411 "sub $0x4,%3 \n"
4412 "jge 40b \n"
4413
4414 "49: \n"
// width was biased by -4; adding 3 leaves (remaining - 1) for the jge loop.
4415 "add $0x3,%3 \n"
4416 "jl 19f \n"
4417
4418 // 1 pixel loop \n"
4419 ".p2align 2 \n"
4420 "10: \n"
4421 "movd (%0),%%xmm2 \n"
4422 "lea 0x4(%0),%0 \n"
fbarchard@google.comf38aefe2012-06-12 00:29:29 +00004423 "punpcklbw %%xmm1,%%xmm2 \n"
4424 "punpcklwd %%xmm1,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004425 "paddd %%xmm2,%%xmm0 \n"
4426 "movdqu (%1,%2,1),%%xmm2 \n"
4427 "paddd %%xmm0,%%xmm2 \n"
4428 "movdqu %%xmm2,(%1) \n"
4429 "lea 0x10(%1),%1 \n"
4430 "sub $0x1,%3 \n"
4431 "jge 10b \n"
4432
4433 "19: \n"
4434 : "+r"(row), // %0
4435 "+r"(cumsum), // %1
4436 "+r"(previous_cumsum), // %2
4437 "+r"(width) // %3
4438 :
4439 : "memory", "cc"
4440#if defined(__SSE2__)
4441 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4442#endif
4443 );
4444}
4445#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
4446
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00004447#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Converts cumulative sums into averaged bytes.
// For every output pixel (4 int32 in, 4 bytes out) the code evaluates
//   topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// and multiplies it by an approximate reciprocal of |area| (rcpss), then
// rounds with cvtps2dq and saturates to bytes.
//   topleft, botleft - int32 tables, read with movdqa / psubd / paddd memory
//                      operands, so they must be 16 byte aligned, including
//                      the entries at + width.
//   width            - offset, in int32 elements, to the right-hand entries.
//   area             - divisor; only its approximate reciprocal is used.
//   dst              - output bytes, 4 per pixel (movdqu / movd stores).
//   count            - number of output pixels; may be any value >= 0.
4448void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4449 int width, int area, uint8* dst,
4450 int count) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004451 asm volatile (
// xmm4 = approximate 1.0f / area, broadcast to all four lanes.
4452 "movd %5,%%xmm4 \n"
4453 "cvtdq2ps %%xmm4,%%xmm4 \n"
4454 "rcpss %%xmm4,%%xmm4 \n"
4455 "pshufd $0x0,%%xmm4,%%xmm4 \n"
// count is biased by -4 while in the 4 pixel loop; label 49 undoes the bias.
4456 "sub $0x4,%3 \n"
4457 "jl 49f \n"
4458
4459 // 4 pixel loop \n"
4460 ".p2align 2 \n"
4461 "40: \n"
4462 "movdqa (%0),%%xmm0 \n"
4463 "movdqa 0x10(%0),%%xmm1 \n"
4464 "movdqa 0x20(%0),%%xmm2 \n"
4465 "movdqa 0x30(%0),%%xmm3 \n"
// (%0,%4,4) is topleft + width int32 elements.
4466 "psubd (%0,%4,4),%%xmm0 \n"
4467 "psubd 0x10(%0,%4,4),%%xmm1 \n"
4468 "psubd 0x20(%0,%4,4),%%xmm2 \n"
4469 "psubd 0x30(%0,%4,4),%%xmm3 \n"
4470 "lea 0x40(%0),%0 \n"
4471 "psubd (%1),%%xmm0 \n"
4472 "psubd 0x10(%1),%%xmm1 \n"
4473 "psubd 0x20(%1),%%xmm2 \n"
4474 "psubd 0x30(%1),%%xmm3 \n"
4475 "paddd (%1,%4,4),%%xmm0 \n"
4476 "paddd 0x10(%1,%4,4),%%xmm1 \n"
4477 "paddd 0x20(%1,%4,4),%%xmm2 \n"
4478 "paddd 0x30(%1,%4,4),%%xmm3 \n"
4479 "lea 0x40(%1),%1 \n"
// Scale by 1 / area in float, convert back to int32 and pack to bytes.
4480 "cvtdq2ps %%xmm0,%%xmm0 \n"
4481 "cvtdq2ps %%xmm1,%%xmm1 \n"
4482 "mulps %%xmm4,%%xmm0 \n"
4483 "mulps %%xmm4,%%xmm1 \n"
4484 "cvtdq2ps %%xmm2,%%xmm2 \n"
4485 "cvtdq2ps %%xmm3,%%xmm3 \n"
4486 "mulps %%xmm4,%%xmm2 \n"
4487 "mulps %%xmm4,%%xmm3 \n"
4488 "cvtps2dq %%xmm0,%%xmm0 \n"
4489 "cvtps2dq %%xmm1,%%xmm1 \n"
4490 "cvtps2dq %%xmm2,%%xmm2 \n"
4491 "cvtps2dq %%xmm3,%%xmm3 \n"
4492 "packssdw %%xmm1,%%xmm0 \n"
4493 "packssdw %%xmm3,%%xmm2 \n"
4494 "packuswb %%xmm2,%%xmm0 \n"
4495 "movdqu %%xmm0,(%2) \n"
4496 "lea 0x10(%2),%2 \n"
4497 "sub $0x4,%3 \n"
4498 "jge 40b \n"
4499
4500 "49: \n"
// count was biased by -4; adding 3 leaves (remaining - 1) for the jge loop.
4501 "add $0x3,%3 \n"
4502 "jl 19f \n"
4503
4504 // 1 pixel loop \n"
4505 ".p2align 2 \n"
4506 "10: \n"
4507 "movdqa (%0),%%xmm0 \n"
4508 "psubd (%0,%4,4),%%xmm0 \n"
4509 "lea 0x10(%0),%0 \n"
4510 "psubd (%1),%%xmm0 \n"
4511 "paddd (%1,%4,4),%%xmm0 \n"
4512 "lea 0x10(%1),%1 \n"
4513 "cvtdq2ps %%xmm0,%%xmm0 \n"
4514 "mulps %%xmm4,%%xmm0 \n"
4515 "cvtps2dq %%xmm0,%%xmm0 \n"
4516 "packssdw %%xmm0,%%xmm0 \n"
4517 "packuswb %%xmm0,%%xmm0 \n"
4518 "movd %%xmm0,(%2) \n"
4519 "lea 0x4(%2),%2 \n"
4520 "sub $0x1,%3 \n"
4521 "jge 10b \n"
4522 "19: \n"
4523 : "+r"(topleft), // %0
4524 "+r"(botleft), // %1
4525 "+r"(dst), // %2
4526 "+rm"(count) // %3
4527 : "r"(static_cast<intptr_t>(width)), // %4
4528 "rm"(area) // %5
4529 : "memory", "cc"
4530#if defined(__SSE2__)
4531 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
4532#endif
4533 );
4534}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00004535#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004536
fbarchard@google.com73444402012-08-09 17:33:29 +00004537#ifdef HAS_ARGBAFFINEROW_SSE2
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004538// TODO(fbarchard): Find 64 bit way to avoid masking.
fbarchard@google.com73444402012-08-09 17:33:29 +00004539// Copy ARGB pixels from source image with slope to a row of destination.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004540// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00004541// an error if movq is used. movd %%xmm0,%1
// How it works: a float (x, y) source position is truncated to integers
// (cvttps2dq), packed to int16 (packssdw) and turned into a byte offset with
// pmaddwd against the word pair (4, src_argb_stride), i.e. x * 4 + y * stride.
// The 4 byte pixel at src_argb + offset is copied to the destination and the
// position is advanced by the per-pixel step.
//   src_argb        - base address of the source image.
//   src_argb_stride - bytes per source row; it is placed in a 16 bit lane, as
//                     are the integer x and y, so all three must fit int16.
//   dst_argb        - destination row (movq / movd stores, no alignment need).
//   src_dudv        - 4 floats: the first two are the starting (x, y), the
//                     last two are the (x, y) step per destination pixel.
//   width           - destination pixel count; may be any value >= 0.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004542
fbarchard@google.comfc7314e2012-09-27 02:17:51 +00004543LIBYUV_API
fbarchard@google.com73444402012-08-09 17:33:29 +00004544void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00004545 uint8* dst_argb, const float* src_dudv, int width) {
// Pointer-sized copies so the asm can use them as full-width registers:
// src_argb_stride_temp doubles as the first offset scratch, temp as the second.
fbarchard@google.com73444402012-08-09 17:33:29 +00004546 intptr_t src_argb_stride_temp = src_argb_stride;
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004547 intptr_t temp = 0;
fbarchard@google.com73444402012-08-09 17:33:29 +00004548 asm volatile (
// xmm2 = start (x, y), xmm7 = step (dx, dy).
4549 "movq (%3),%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004550 "movq 0x8(%3),%%xmm7 \n"
// Build the pmaddwd multiplier: low word 4 (bytes per pixel), high word stride.
fbarchard@google.com73444402012-08-09 17:33:29 +00004551 "shl $0x10,%1 \n"
4552 "add $0x4,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004553 "movd %1,%%xmm5 \n"
// width is biased by -4 while in the 4 pixel loop; label 49 undoes the bias.
4554 "sub $0x4,%4 \n"
4555 "jl 49f \n"
4556
// Set up positions for 4 pixels: xmm2 = pixels 0,1 and xmm3 = pixels 2,3;
// xmm4 = 4 * step, the advance per loop iteration.
4557 "pshufd $0x44,%%xmm7,%%xmm7 \n"
4558 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004559 "movdqa %%xmm2,%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004560 "addps %%xmm7,%%xmm0 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004561 "movlhps %%xmm0,%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004562 "movdqa %%xmm7,%%xmm4 \n"
4563 "addps %%xmm4,%%xmm4 \n"
4564 "movdqa %%xmm2,%%xmm3 \n"
4565 "addps %%xmm4,%%xmm3 \n"
4566 "addps %%xmm4,%%xmm4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004567
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004568 // 4 pixel loop \n"
4569 ".p2align 4 \n"
4570 "40: \n"
// xmm0 = four 32 bit byte offsets, one per pixel.
4571 "cvttps2dq %%xmm2,%%xmm0 \n"
4572 "cvttps2dq %%xmm3,%%xmm1 \n"
4573 "packssdw %%xmm1,%%xmm0 \n"
4574 "pmaddwd %%xmm5,%%xmm0 \n"
// 64 bit: one move fetches two offsets; the first is masked to its low 28
// bits, the second is taken from the upper 32 bits.
// 32 bit: offsets are extracted one at a time by rotating xmm0.
4575#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004576 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004577 "mov %1,%5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004578 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004579 "shr $32,%5 \n"
4580 "pshufd $0xEE,%%xmm0,%%xmm0 \n"
4581#else
4582 "movd %%xmm0,%1 \n"
4583 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4584 "movd %%xmm0,%5 \n"
4585 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4586#endif
// Fetch pixels 0 and 1 and store them as 8 bytes.
4587 "movd (%0,%1,1),%%xmm1 \n"
4588 "movd (%0,%5,1),%%xmm6 \n"
4589 "punpckldq %%xmm6,%%xmm1 \n"
4590 "addps %%xmm4,%%xmm2 \n"
4591 "movq %%xmm1,(%2) \n"
4592#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004593 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004594 "mov %1,%5 \n"
4595 "and $0x0fffffff,%1 \n"
4596 "shr $32,%5 \n"
4597#else
4598 "movd %%xmm0,%1 \n"
4599 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4600 "movd %%xmm0,%5 \n"
4601#endif
// Fetch pixels 2 and 3 and store them as the next 8 bytes.
fbarchard@google.com73444402012-08-09 17:33:29 +00004602 "movd (%0,%1,1),%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004603 "movd (%0,%5,1),%%xmm6 \n"
4604 "punpckldq %%xmm6,%%xmm0 \n"
4605 "addps %%xmm4,%%xmm3 \n"
// Flags from this sub feed the jge below; movq and lea leave them intact.
4606 "sub $0x4,%4 \n"
4607 "movq %%xmm0,0x08(%2) \n"
4608 "lea 0x10(%2),%2 \n"
4609 "jge 40b \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004610
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004611 "49: \n"
// width was biased by -4; adding 3 leaves (remaining - 1) for the jge loop.
4612 "add $0x3,%4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004613 "jl 19f \n"
4614
4615 // 1 pixel loop \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004616 ".p2align 4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004617 "10: \n"
// Only the low (x, y) lanes of xmm2, xmm7 and xmm5 are used here, so this
// loop is valid whether or not the 4 pixel set-up above was executed.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004618 "cvttps2dq %%xmm2,%%xmm0 \n"
4619 "packssdw %%xmm0,%%xmm0 \n"
4620 "pmaddwd %%xmm5,%%xmm0 \n"
4621 "addps %%xmm7,%%xmm2 \n"
4622 "movd %%xmm0,%1 \n"
4623#if defined(__x86_64__)
fbarchard@google.com73444402012-08-09 17:33:29 +00004624 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004625#endif
fbarchard@google.com73444402012-08-09 17:33:29 +00004626 "movd (%0,%1,1),%%xmm0 \n"
4627 "sub $0x1,%4 \n"
4628 "movd %%xmm0,(%2) \n"
4629 "lea 0x4(%2),%2 \n"
4630 "jge 10b \n"
4631 "19: \n"
4632 : "+r"(src_argb), // %0
4633 "+r"(src_argb_stride_temp), // %1
4634 "+r"(dst_argb), // %2
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00004635 "+r"(src_dudv), // %3
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004636 "+rm"(width), // %4
4637 "+r"(temp) // %5
fbarchard@google.com73444402012-08-09 17:33:29 +00004638 :
4639 : "memory", "cc"
4640#if defined(__SSE2__)
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004641 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
fbarchard@google.com73444402012-08-09 17:33:29 +00004642#endif
4643 );
4644}
4645#endif // HAS_ARGBAFFINEROW_SSE2
4646
fbarchard@google.comb5491752012-11-20 09:44:46 +00004647// Bilinear image filtering.
4648// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
// Blends the row at src_argb with the row at src_argb + src_stride.
// source_y_fraction is halved (f = fraction >> 1) and the general path
// computes (row0 * (128 - f) + row1 * f) >> 7 per byte with pmaddubsw.
// f == 0, 0x20, 0x40 and 0x60 are dispatched to cheaper copy / pavgb loops.
//   dst_argb          - destination, stored with movdqa (16 byte aligned).
//   src_argb          - first source row, loaded with movdqa.
//   src_stride        - byte distance from the first to the second source row;
//                       the second row is also loaded with movdqa.
//   dst_width         - decremented by 4 for every 16 bytes written; tested
//                       after each group, so one group is always processed.
//   source_y_fraction - weight of the second row.
//                       NOTE(review): presumably 0..255; confirm with callers.
4649void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004650 ptrdiff_t src_stride, int dst_width,
4651 int source_y_fraction) {
4652 asm volatile (
// %0 becomes (dst - src) so that %1 alone indexes source and destination.
4653 "sub %1,%0 \n"
4654 "shr %3 \n"
4655 "cmp $0x0,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004656 "je 100f \n"
4657 "cmp $0x20,%3 \n"
4658 "je 75f \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004659 "cmp $0x40,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004660 "je 50f \n"
4661 "cmp $0x60,%3 \n"
4662 "je 25f \n"
4663
// xmm5 = byte pairs (128 - f, f) replicated across the register.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004664 "movd %3,%%xmm0 \n"
4665 "neg %3 \n"
4666 "add $0x80,%3 \n"
4667 "movd %3,%%xmm5 \n"
4668 "punpcklbw %%xmm0,%%xmm5 \n"
4669 "punpcklwd %%xmm5,%%xmm5 \n"
4670 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004671
4672 // General purpose row blend.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004673 ".p2align 4 \n"
4674 "1: \n"
4675 "movdqa (%1),%%xmm0 \n"
4676 "movdqa (%1,%4,1),%%xmm2 \n"
4677 "movdqa %%xmm0,%%xmm1 \n"
// Interleave row0 / row1 bytes so pmaddubsw forms the weighted sum per pair.
4678 "punpcklbw %%xmm2,%%xmm0 \n"
4679 "punpckhbw %%xmm2,%%xmm1 \n"
4680 "pmaddubsw %%xmm5,%%xmm0 \n"
4681 "pmaddubsw %%xmm5,%%xmm1 \n"
4682 "psrlw $0x7,%%xmm0 \n"
4683 "psrlw $0x7,%%xmm1 \n"
4684 "packuswb %%xmm1,%%xmm0 \n"
4685 "sub $0x4,%2 \n"
4686 "movdqa %%xmm0,(%1,%0,1) \n"
4687 "lea 0x10(%1),%1 \n"
4688 "jg 1b \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004689 "jmp 99f \n"
4690
4691 // Blend 25 / 75.
// avg(avg(row0, row1), row1): the second row dominates.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004692 ".p2align 4 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004693 "25: \n"
4694 "movdqa (%1),%%xmm0 \n"
4695 "movdqa (%1,%4,1),%%xmm1 \n"
4696 "pavgb %%xmm1,%%xmm0 \n"
4697 "pavgb %%xmm1,%%xmm0 \n"
4698 "sub $0x4,%2 \n"
4699 "movdqa %%xmm0,(%1,%0,1) \n"
4700 "lea 0x10(%1),%1 \n"
4701 "jg 25b \n"
4702 "jmp 99f \n"
4703
4704 // Blend 50 / 50.
4705 ".p2align 4 \n"
4706 "50: \n"
4707 "movdqa (%1),%%xmm0 \n"
4708 "movdqa (%1,%4,1),%%xmm1 \n"
4709 "pavgb %%xmm1,%%xmm0 \n"
4710 "sub $0x4,%2 \n"
4711 "movdqa %%xmm0,(%1,%0,1) \n"
4712 "lea 0x10(%1),%1 \n"
4713 "jg 50b \n"
4714 "jmp 99f \n"
4715
4716 // Blend 75 / 25.
// avg(avg(row0, row1), row0): the first row dominates.
4717 ".p2align 4 \n"
4718 "75: \n"
4719 "movdqa (%1),%%xmm1 \n"
4720 "movdqa (%1,%4,1),%%xmm0 \n"
4721 "pavgb %%xmm1,%%xmm0 \n"
4722 "pavgb %%xmm1,%%xmm0 \n"
4723 "sub $0x4,%2 \n"
4724 "movdqa %%xmm0,(%1,%0,1) \n"
4725 "lea 0x10(%1),%1 \n"
4726 "jg 75b \n"
4727 "jmp 99f \n"
4728
4729 // Blend 100 / 0 - Copy row unchanged.
4730 ".p2align 4 \n"
4731 "100: \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004732 "movdqa (%1),%%xmm0 \n"
4733 "sub $0x4,%2 \n"
4734 "movdqa %%xmm0,(%1,%0,1) \n"
4735 "lea 0x10(%1),%1 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004736 "jg 100b \n"
4737
fbarchard@google.comb5491752012-11-20 09:44:46 +00004738 "99: \n"
fbarchard@google.comaf137b62013-02-05 22:42:56 +00004739 : "+r"(dst_argb), // %0
4740 "+r"(src_argb), // %1
fbarchard@google.comb5491752012-11-20 09:44:46 +00004741 "+r"(dst_width), // %2
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004742 "+r"(source_y_fraction) // %3
4743 : "r"(static_cast<intptr_t>(src_stride)) // %4
4744 : "memory", "cc"
4745#if defined(__SSE2__)
4746 , "xmm0", "xmm1", "xmm2", "xmm5"
4747#endif
4748 );
4749}
4750
fbarchard@google.comaf137b62013-02-05 22:42:56 +00004751// Bilinear image filtering.
4752// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
// SSE2 variant: blends the row at src_argb with the row at
// src_argb + src_stride.  source_y_fraction is halved (f = fraction >> 1).
// The general path widens both rows to 16 bits, scales (row1 - row0) by a
// factor derived from f using pmulhw and adds the result to row0.
// f == 0, 0x20, 0x40 and 0x60 are dispatched to cheaper copy / pavgb loops.
//   dst_argb          - destination, stored with movdqa (16 byte aligned).
//   src_argb          - first source row, loaded with movdqa.
//   src_stride        - byte distance from the first to the second source row;
//                       the second row is also loaded with movdqa.
//   dst_width         - decremented by 4 for every 16 bytes written; tested
//                       after each group, so one group is always processed.
//   source_y_fraction - weight of the second row.
//                       NOTE(review): presumably 0..255; confirm with callers.
4753void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
4754 ptrdiff_t src_stride, int dst_width,
4755 int source_y_fraction) {
4756 asm volatile (
// %0 becomes (dst - src) so that %1 alone indexes source and destination.
4757 "sub %1,%0 \n"
4758 "shr %3 \n"
4759 "cmp $0x0,%3 \n"
4760 "je 100f \n"
4761 "cmp $0x20,%3 \n"
4762 "je 75f \n"
4763 "cmp $0x40,%3 \n"
4764 "je 50f \n"
4765 "cmp $0x60,%3 \n"
4766 "je 25f \n"
4767
// xmm5 = 16 bit multiplier built from the byte pair (128 - f, f), broadcast
// to every word.  xmm4 = 0 for zero extending bytes to words.
4768 "movd %3,%%xmm0 \n"
4769 "neg %3 \n"
4770 "add $0x80,%3 \n"
4771 "movd %3,%%xmm5 \n"
4772 "punpcklbw %%xmm0,%%xmm5 \n"
4773 "punpcklwd %%xmm5,%%xmm5 \n"
4774 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4775 "pxor %%xmm4,%%xmm4 \n"
4776
4777 // General purpose row blend.
4778 ".p2align 4 \n"
4779 "1: \n"
4780 "movdqa (%1),%%xmm0 \n"
4781 "movdqa (%1,%4,1),%%xmm2 \n"
4782 "movdqa %%xmm0,%%xmm1 \n"
4783 "movdqa %%xmm2,%%xmm3 \n"
4784 "punpcklbw %%xmm4,%%xmm2 \n"
4785 "punpckhbw %%xmm4,%%xmm3 \n"
4786 "punpcklbw %%xmm4,%%xmm0 \n"
4787 "punpckhbw %%xmm4,%%xmm1 \n"
// xmm2 / xmm3 = 2 * (row1 - row0); pmulhw keeps the high 16 bits of the
// signed product with the multiplier.
4788 "psubw %%xmm0,%%xmm2 \n"
4789 "psubw %%xmm1,%%xmm3 \n"
4790 "paddw %%xmm2,%%xmm2 \n"
4791 "paddw %%xmm3,%%xmm3 \n"
4792 "pmulhw %%xmm5,%%xmm2 \n"
4793 "pmulhw %%xmm5,%%xmm3 \n"
4794 "paddw %%xmm2,%%xmm0 \n"
4795 "paddw %%xmm3,%%xmm1 \n"
4796 "packuswb %%xmm1,%%xmm0 \n"
4797 "sub $0x4,%2 \n"
4798 "movdqa %%xmm0,(%1,%0,1) \n"
4799 "lea 0x10(%1),%1 \n"
4800 "jg 1b \n"
4801 "jmp 99f \n"
4802
4803 // Blend 25 / 75.
// avg(avg(row0, row1), row1): the second row dominates.
4804 ".p2align 4 \n"
4805 "25: \n"
4806 "movdqa (%1),%%xmm0 \n"
4807 "movdqa (%1,%4,1),%%xmm1 \n"
4808 "pavgb %%xmm1,%%xmm0 \n"
4809 "pavgb %%xmm1,%%xmm0 \n"
4810 "sub $0x4,%2 \n"
4811 "movdqa %%xmm0,(%1,%0,1) \n"
4812 "lea 0x10(%1),%1 \n"
4813 "jg 25b \n"
4814 "jmp 99f \n"
4815
4816 // Blend 50 / 50.
4817 ".p2align 4 \n"
4818 "50: \n"
4819 "movdqa (%1),%%xmm0 \n"
4820 "movdqa (%1,%4,1),%%xmm1 \n"
4821 "pavgb %%xmm1,%%xmm0 \n"
4822 "sub $0x4,%2 \n"
4823 "movdqa %%xmm0,(%1,%0,1) \n"
4824 "lea 0x10(%1),%1 \n"
4825 "jg 50b \n"
4826 "jmp 99f \n"
4827
4828 // Blend 75 / 25.
// avg(avg(row0, row1), row0): the first row dominates.
4829 ".p2align 4 \n"
4830 "75: \n"
4831 "movdqa (%1),%%xmm1 \n"
4832 "movdqa (%1,%4,1),%%xmm0 \n"
4833 "pavgb %%xmm1,%%xmm0 \n"
4834 "pavgb %%xmm1,%%xmm0 \n"
4835 "sub $0x4,%2 \n"
4836 "movdqa %%xmm0,(%1,%0,1) \n"
4837 "lea 0x10(%1),%1 \n"
4838 "jg 75b \n"
4839 "jmp 99f \n"
4840
4841 // Blend 100 / 0 - Copy row unchanged.
4842 ".p2align 4 \n"
4843 "100: \n"
4844 "movdqa (%1),%%xmm0 \n"
4845 "sub $0x4,%2 \n"
4846 "movdqa %%xmm0,(%1,%0,1) \n"
4847 "lea 0x10(%1),%1 \n"
4848 "jg 100b \n"
4849
4850 "99: \n"
4851 : "+r"(dst_argb), // %0
4852 "+r"(src_argb), // %1
4853 "+r"(dst_width), // %2
4854 "+r"(source_y_fraction) // %3
4855 : "r"(static_cast<intptr_t>(src_stride)) // %4
4856 : "memory", "cc"
4857#if defined(__SSE2__)
4858 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4859#endif
4860 );
4861}
4862
// Averages a row with the row src_uv_stride bytes after it, byte by byte
// (pavgb), and writes the result to dst_uv.
//   src_uv        - first source row, loaded with movdqa (16 byte aligned);
//                   the second row is a pavgb memory operand, also aligned.
//   src_uv_stride - byte distance from the first to the second source row.
//   dst_uv        - destination, stored with movdqa (16 byte aligned).
//   pix           - byte count; tested after each group of 16, so at least
//                   one group is always processed.
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004863void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4864 uint8* dst_uv, int pix) {
4865 asm volatile (
// %1 becomes (dst - src) so that %0 alone indexes source and destination.
fbarchard@google.com10965432013-03-08 23:22:32 +00004866 "sub %0,%1 \n"
4867 ".p2align 4 \n"
4868 "1: \n"
4869 "movdqa (%0),%%xmm0 \n"
4870 "pavgb (%0,%3),%%xmm0 \n"
// Flags from this sub feed the jg below; movdqa and lea leave them intact.
4871 "sub $0x10,%2 \n"
4872 "movdqa %%xmm0,(%0,%1) \n"
4873 "lea 0x10(%0),%0 \n"
4874 "jg 1b \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004875 : "+r"(src_uv), // %0
4876 "+r"(dst_uv), // %1
4877 "+r"(pix) // %2
4878 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4879 : "memory", "cc"
4880#if defined(__SSE2__)
4881 , "xmm0"
4882#endif
4883 );
4884}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004885
// Picks one byte from each ARGB pixel according to |selector|.
// selector is broadcast to every dword of xmm5 and used as a pshufb control,
// so its 4 bytes are indices into a 16 byte (4 pixel) block.  Two blocks are
// shuffled per iteration and their low dwords combined into 8 output bytes.
//   src_argb  - source pixels, 32 bytes loaded per iteration with movdqa
//               (16 byte aligned).
//   dst_bayer - destination, 8 bytes stored per iteration with movq.
//   selector  - four pshufb byte indices.
//   pix       - pixel count; tested after each group of 8, so at least one
//               group is always processed.
// NOTE(review): operand %3 uses the "g" constraint, which also permits an
// immediate, but movd cannot take one - presumably never hit because selector
// is a run-time value; confirm, or tighten the constraint to "rm".
4886void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4887 uint32 selector, int pix) {
4888 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00004889 "movd %3,%%xmm5 \n"
4890 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004891 ".p2align 4 \n"
4892 "1: \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00004893 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004894 "movdqa 0x10(%0),%%xmm1 \n"
4895 "lea 0x20(%0),%0 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00004896 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004897 "pshufb %%xmm5,%%xmm1 \n"
// Low dword of each shuffled block holds its 4 selected bytes.
fbarchard@google.coma3be4702013-03-22 05:20:02 +00004898 "punpckldq %%xmm1,%%xmm0 \n"
// Flags from this sub feed the jg below; movq and lea leave them intact.
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004899 "sub $0x8,%2 \n"
4900 "movq %%xmm0,(%1) \n"
4901 "lea 0x8(%1),%1 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00004902 "jg 1b \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004903 : "+r"(src_argb), // %0
4904 "+r"(dst_bayer), // %1
4905 "+r"(pix) // %2
4906 : "g"(selector) // %3
4907 : "memory", "cc"
4908#if defined(__SSE2__)
fbarchard@google.come8df16b2013-03-22 04:47:14 +00004909 , "xmm0", "xmm1", "xmm5"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004910#endif
4911 );
4912}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004913
fbarchard@google.com10965432013-03-08 23:22:32 +00004914// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders bytes within each 16 byte block with pshufb, using the 16 byte
// control mask at |shuffler|.
//   src_argb - source pixels, loaded with movdqa (16 byte aligned).
//   dst_argb - destination pixels, stored with movdqa (16 byte aligned).
//   shuffler - 16 byte pshufb mask, loaded with movdqa (16 byte aligned).
//   pix      - pixel count; tested after each group of 8 (32 bytes), so at
//              least one group is always processed.
4915void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4916 const uint8* shuffler, int pix) {
4917 asm volatile (
4918 "movdqa (%3),%%xmm5 \n"
4919 ".p2align 4 \n"
4920 "1: \n"
4921 "movdqa (%0),%%xmm0 \n"
4922 "movdqa 0x10(%0),%%xmm1 \n"
4923 "lea 0x20(%0),%0 \n"
4924 "pshufb %%xmm5,%%xmm0 \n"
4925 "pshufb %%xmm5,%%xmm1 \n"
// Flags from this sub feed the jg below; movdqa and lea leave them intact.
4926 "sub $0x8,%2 \n"
4927 "movdqa %%xmm0,(%1) \n"
4928 "movdqa %%xmm1,0x10(%1) \n"
4929 "lea 0x20(%1),%1 \n"
4930 "jg 1b \n"
4931 : "+r"(src_argb), // %0
4932 "+r"(dst_argb), // %1
4933 "+r"(pix) // %2
4934 : "r"(shuffler) // %3
4935 : "memory", "cc"
4936#if defined(__SSE2__)
4937 , "xmm0", "xmm1", "xmm5"
4938#endif
4939 );
4940}
4941
// Same byte reordering as ARGBShuffleRow_SSSE3, but source and destination
// are accessed with movdqu and so need no particular alignment.
//   src_argb - source pixels (unaligned loads).
//   dst_argb - destination pixels (unaligned stores).
//   shuffler - 16 byte pshufb mask; still loaded with movdqa, so it must be
//              16 byte aligned.
//   pix      - pixel count; tested after each group of 8 (32 bytes), so at
//              least one group is always processed.
4942void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
4943 const uint8* shuffler, int pix) {
4944 asm volatile (
4945 "movdqa (%3),%%xmm5 \n"
4946 ".p2align 4 \n"
4947 "1: \n"
4948 "movdqu (%0),%%xmm0 \n"
4949 "movdqu 0x10(%0),%%xmm1 \n"
4950 "lea 0x20(%0),%0 \n"
4951 "pshufb %%xmm5,%%xmm0 \n"
4952 "pshufb %%xmm5,%%xmm1 \n"
// Flags from this sub feed the jg below; movdqu and lea leave them intact.
4953 "sub $0x8,%2 \n"
4954 "movdqu %%xmm0,(%1) \n"
4955 "movdqu %%xmm1,0x10(%1) \n"
4956 "lea 0x20(%1),%1 \n"
4957 "jg 1b \n"
4958 : "+r"(src_argb), // %0
4959 "+r"(dst_argb), // %1
4960 "+r"(pix) // %2
4961 : "r"(shuffler) // %3
4962 : "memory", "cc"
4963#if defined(__SSE2__)
4964 , "xmm0", "xmm1", "xmm5"
4965#endif
4966 );
4967}
4968
// Interleaves planar Y, U and V into packed output with byte order
// Y0 U0 Y1 V0 (one U and one V byte shared by each pair of Y bytes).
//   src_y     - Y row, 16 bytes loaded per iteration with movdqa (16 byte
//               aligned).
//   src_u     - U row, 8 bytes loaded per iteration with movq.
//   src_v     - V row, 8 bytes loaded per iteration with movq.
//   dst_frame - destination, 32 bytes stored per iteration with movdqa
//               (16 byte aligned).
//   width     - decremented by 16 per iteration and tested afterwards, so at
//               least one group of 16 Y bytes is always processed.
fbarchard@google.com9de88672012-10-12 06:23:33 +00004969void I422ToYUY2Row_SSE2(const uint8* src_y,
4970 const uint8* src_u,
4971 const uint8* src_v,
4972 uint8* dst_frame, int width) {
4973 asm volatile (
// %2 becomes (src_v - src_u) so that %1 alone indexes both chroma rows.
fbarchard@google.com10965432013-03-08 23:22:32 +00004974 "sub %1,%2 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00004975 ".p2align 4 \n"
4976 "1: \n"
// xmm2 = U0 V0 U1 V1 ...
4977 "movq (%1),%%xmm2 \n"
4978 "movq (%1,%2,1),%%xmm3 \n"
4979 "lea 0x8(%1),%1 \n"
4980 "punpcklbw %%xmm3,%%xmm2 \n"
// Interleave Y (first) with the UV pairs (second).
4981 "movdqa (%0),%%xmm0 \n"
4982 "lea 0x10(%0),%0 \n"
4983 "movdqa %%xmm0,%%xmm1 \n"
4984 "punpcklbw %%xmm2,%%xmm0 \n"
4985 "punpckhbw %%xmm2,%%xmm1 \n"
4986 "movdqa %%xmm0,(%3) \n"
4987 "movdqa %%xmm1,0x10(%3) \n"
4988 "lea 0x20(%3),%3 \n"
4989 "sub $0x10,%4 \n"
4990 "jg 1b \n"
4991 : "+r"(src_y), // %0
4992 "+r"(src_u), // %1
4993 "+r"(src_v), // %2
4994 "+r"(dst_frame), // %3
4995 "+rm"(width) // %4
4996 :
4997 : "memory", "cc"
4998#if defined(__SSE2__)
4999 , "xmm0", "xmm1", "xmm2", "xmm3"
5000#endif
5001 );
5002}
5003
// Interleaves planar Y, U and V into packed output with byte order
// U0 Y0 V0 Y1 (one U and one V byte shared by each pair of Y bytes).
//   src_y     - Y row, 16 bytes loaded per iteration with movdqa (16 byte
//               aligned).
//   src_u     - U row, 8 bytes loaded per iteration with movq.
//   src_v     - V row, 8 bytes loaded per iteration with movq.
//   dst_frame - destination, 32 bytes stored per iteration with movdqa
//               (16 byte aligned).
//   width     - decremented by 16 per iteration and tested afterwards, so at
//               least one group of 16 Y bytes is always processed.
5004void I422ToUYVYRow_SSE2(const uint8* src_y,
5005 const uint8* src_u,
5006 const uint8* src_v,
5007 uint8* dst_frame, int width) {
5008 asm volatile (
// %2 becomes (src_v - src_u) so that %1 alone indexes both chroma rows.
5009 "sub %1,%2 \n"
5010 ".p2align 4 \n"
5011 "1: \n"
// xmm2 = U0 V0 U1 V1 ...
5012 "movq (%1),%%xmm2 \n"
5013 "movq (%1,%2,1),%%xmm3 \n"
5014 "lea 0x8(%1),%1 \n"
5015 "punpcklbw %%xmm3,%%xmm2 \n"
// Interleave the UV pairs (first) with Y (second).
5016 "movdqa (%0),%%xmm0 \n"
5017 "movdqa %%xmm2,%%xmm1 \n"
5018 "lea 0x10(%0),%0 \n"
5019 "punpcklbw %%xmm0,%%xmm1 \n"
5020 "punpckhbw %%xmm0,%%xmm2 \n"
5021 "movdqa %%xmm1,(%3) \n"
5022 "movdqa %%xmm2,0x10(%3) \n"
5023 "lea 0x20(%3),%3 \n"
5024 "sub $0x10,%4 \n"
5025 "jg 1b \n"
5026 : "+r"(src_y), // %0
5027 "+r"(src_u), // %1
5028 "+r"(src_v), // %2
5029 "+r"(dst_frame), // %3
5030 "+rm"(width) // %4
5031 :
5032 : "memory", "cc"
5033#if defined(__SSE2__)
5034 , "xmm0", "xmm1", "xmm2", "xmm3"
5035#endif
5036 );
5037}
5038
fbarchard@google.com2d11d432012-02-16 02:50:39 +00005039#endif // defined(__x86_64__) || defined(__i386__)
5040
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00005041#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00005042} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00005043} // namespace libyuv
5044#endif