blob: 7ea47fbbe6121921bb1097687575cd562b6e6dbe [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "row.h"
12
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
// This module is for GCC x86 and x64
#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)

// On Apple toolchains the SIMD constants below get external linkage; elsewhere
// they are file-local.  NOTE(review): presumably works around a Mac
// assembler/relocation limitation with static data referenced from inline
// asm "m" constraints — confirm.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
28
#ifdef HAS_ARGBTOUVROW_SSSE3
// Fixed-point per-channel weights (B,G,R,A; A weight is 0) for computing U
// with pmaddubsw, repeated for four pixels.
CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Fixed-point per-channel weights for computing V.  Declared as signed vec8
// to match kARGBToU: the initializers are negative, and the previous uvec8
// declaration relied on implicit conversion of -18/-94 to unsigned.
CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Bias added after the multiply/add to recenter U and V around 128.
CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
#endif
43
#ifdef HAS_ARGBTOYROW_SSSE3

// Constant multiplication table for converting ARGB to I400.
// Per-channel weights (B,G,R,A; A weight is 0) used with pmaddubsw.
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// Offset of 16 added to every luma result (video-range Y).
CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// Shuffle table for converting RGB24 to ARGB.
// The 12u/13u/14u/15u slots pull the bytes that the caller later overwrites
// with 0xFF alpha (por with the alpha mask).
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB (RAW has R and B swapped
// relative to RGB24).
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ARGB to RGB24 (drops alpha).
// An index of 128 makes pshufb write zero to that byte.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW (drops alpha, swaps R/B).
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
85
// Expands a row of 8-bit luma into gray ARGB: each Y byte is replicated into
// B, G and R, and alpha is forced to 0xFF.  Processes 8 pixels per iteration;
// assumes pix is a positive multiple of 8 and dst_argb is 16-byte aligned
// (movdqa stores).
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // generate mask 0xff000000
    "pslld      $0x18,%%xmm5                   \n"
  "1:                                          \n"
    "movq       (%0),%%xmm0                    \n"  // read 8 Y bytes
    "lea        0x8(%0),%0                     \n"
    "punpcklbw  %%xmm0,%%xmm0                  \n"  // Y -> YY
    "movdqa     %%xmm0,%%xmm1                  \n"
    "punpcklwd  %%xmm0,%%xmm0                  \n"  // YY -> YYYY (pixels 0-3)
    "punpckhwd  %%xmm1,%%xmm1                  \n"  // YY -> YYYY (pixels 4-7)
    "por        %%xmm5,%%xmm0                  \n"  // set alpha bytes to 0xff
    "por        %%xmm5,%%xmm1                  \n"
    "movdqa     %%xmm0,(%1)                    \n"
    "movdqa     %%xmm1,0x10(%1)                \n"
    "lea        0x20(%1),%1                    \n"
    "sub        $0x8,%2                        \n"
    "ja         1b                             \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000114
// Reorders ABGR bytes into ARGB with one pshufb per 4 pixels.
// Assumes src_abgr and dst_argb are 16-byte aligned and pix is a positive
// multiple of 4.
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa     %3,%%xmm5                      \n"  // byte-reorder mask
  "1:                                          \n"
    "movdqa     (%0),%%xmm0                    \n"
    "lea        0x10(%0),%0                    \n"
    "pshufb     %%xmm5,%%xmm0                  \n"
    "movdqa     %%xmm0,(%1)                    \n"
    "lea        0x10(%1),%1                    \n"
    "sub        $0x4,%2                        \n"
    "ja         1b                             \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
136
// Reorders BGRA bytes into ARGB with one pshufb per 4 pixels.
// Assumes src_bgra and dst_argb are 16-byte aligned and pix is a positive
// multiple of 4.
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa     %3,%%xmm5                      \n"  // byte-reorder mask
  "1:                                          \n"
    "movdqa     (%0),%%xmm0                    \n"
    "lea        0x10(%0),%0                    \n"
    "pshufb     %%xmm5,%%xmm0                  \n"
    "movdqa     %%xmm0,(%1)                    \n"
    "lea        0x10(%1),%1                    \n"
    "sub        $0x4,%2                        \n"
    "ja         1b                             \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
158
// Converts 16 RGB24 pixels (48 bytes) to 16 ARGB pixels (64 bytes) per
// iteration, setting alpha to 0xFF.  Source loads are unaligned (movdqu);
// dst_argb must be 16-byte aligned.  Assumes pix is a positive multiple of 16.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // generate mask 0xff000000
    "pslld      $0x18,%%xmm5                   \n"
    "movdqa     %3,%%xmm4                      \n"  // RGB24 -> ARGB shuffle
  "1:                                          \n"
    "movdqu     (%0),%%xmm0                    \n"
    "movdqu     0x10(%0),%%xmm1                \n"
    "movdqu     0x20(%0),%%xmm3                \n"
    "lea        0x30(%0),%0                    \n"
    "movdqa     %%xmm3,%%xmm2                  \n"
    "palignr    $0x8,%%xmm1,%%xmm2             \n"  // align pixels 8-11
    "pshufb     %%xmm4,%%xmm2                  \n"
    "por        %%xmm5,%%xmm2                  \n"  // alpha = 0xff
    "palignr    $0xc,%%xmm0,%%xmm1             \n"  // align pixels 4-7
    "pshufb     %%xmm4,%%xmm0                  \n"
    "movdqa     %%xmm2,0x20(%1)                \n"
    "por        %%xmm5,%%xmm0                  \n"
    "pshufb     %%xmm4,%%xmm1                  \n"
    "movdqa     %%xmm0,(%1)                    \n"
    "por        %%xmm5,%%xmm1                  \n"
    "palignr    $0x4,%%xmm3,%%xmm3             \n"  // align pixels 12-15
    "pshufb     %%xmm4,%%xmm3                  \n"
    "movdqa     %%xmm1,0x10(%1)                \n"
    "por        %%xmm5,%%xmm3                  \n"
    "movdqa     %%xmm3,0x30(%1)                \n"
    "lea        0x40(%1),%1                    \n"
    "sub        $0x10,%2                       \n"
    "ja         1b                             \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
198
// Converts 16 RAW pixels (48 bytes, R/B swapped relative to RGB24) to 16 ARGB
// pixels per iteration, setting alpha to 0xFF.  Identical structure to
// RGB24ToARGBRow_SSSE3 except for the shuffle table.  Source loads are
// unaligned; dst_argb must be 16-byte aligned; pix a positive multiple of 16.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // generate mask 0xff000000
    "pslld      $0x18,%%xmm5                   \n"
    "movdqa     %3,%%xmm4                      \n"  // RAW -> ARGB shuffle
  "1:                                          \n"
    "movdqu     (%0),%%xmm0                    \n"
    "movdqu     0x10(%0),%%xmm1                \n"
    "movdqu     0x20(%0),%%xmm3                \n"
    "lea        0x30(%0),%0                    \n"
    "movdqa     %%xmm3,%%xmm2                  \n"
    "palignr    $0x8,%%xmm1,%%xmm2             \n"  // align pixels 8-11
    "pshufb     %%xmm4,%%xmm2                  \n"
    "por        %%xmm5,%%xmm2                  \n"  // alpha = 0xff
    "palignr    $0xc,%%xmm0,%%xmm1             \n"  // align pixels 4-7
    "pshufb     %%xmm4,%%xmm0                  \n"
    "movdqa     %%xmm2,0x20(%1)                \n"
    "por        %%xmm5,%%xmm0                  \n"
    "pshufb     %%xmm4,%%xmm1                  \n"
    "movdqa     %%xmm0,(%1)                    \n"
    "por        %%xmm5,%%xmm1                  \n"
    "palignr    $0x4,%%xmm3,%%xmm3             \n"  // align pixels 12-15
    "pshufb     %%xmm4,%%xmm3                  \n"
    "movdqa     %%xmm1,0x10(%1)                \n"
    "por        %%xmm5,%%xmm3                  \n"
    "movdqa     %%xmm3,0x30(%1)                \n"
    "lea        0x40(%1),%1                    \n"
    "sub        $0x10,%2                       \n"
    "ja         1b                             \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
238
// Converts 8 RGB565 pixels to ARGB per iteration.  The 5- and 6-bit channels
// are widened to 8 bits by pmulhuw with bit-replicating multipliers
// (0x0108 for 5-bit, 0x2008 for 6-bit), and alpha is forced to 0xFF.
// Source loads are unaligned; dst must be 16-byte aligned; assumes pix is a
// positive multiple of 8.
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov        $0x1080108,%%eax               \n"  // 5 -> 8 bit replicate
    "movd       %%eax,%%xmm5                   \n"
    "pshufd     $0x0,%%xmm5,%%xmm5             \n"
    "mov        $0x20082008,%%eax              \n"  // 6 -> 8 bit replicate
    "movd       %%eax,%%xmm6                   \n"
    "pshufd     $0x0,%%xmm6,%%xmm6             \n"
    "pcmpeqb    %%xmm3,%%xmm3                  \n"  // mask 0xf800 (red)
    "psllw      $0xb,%%xmm3                    \n"
    "pcmpeqb    %%xmm4,%%xmm4                  \n"  // mask 0x07e0 (green)
    "psllw      $0xa,%%xmm4                    \n"
    "psrlw      $0x5,%%xmm4                    \n"
    "pcmpeqb    %%xmm7,%%xmm7                  \n"  // 0xff00 alpha bytes
    "psllw      $0x8,%%xmm7                    \n"
    "sub        %0,%1                          \n"  // dst -= 2*src so stores
    "sub        %0,%1                          \n"  // can address (%1,%0,2)
  "1:                                          \n"
    "movdqu     (%0),%%xmm0                    \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "pand       %%xmm3,%%xmm1                  \n"  // isolate red
    "psllw      $0xb,%%xmm2                    \n"  // blue to top bits
    "pmulhuw    %%xmm5,%%xmm1                  \n"  // widen red
    "pmulhuw    %%xmm5,%%xmm2                  \n"  // widen blue
    "psllw      $0x8,%%xmm1                    \n"
    "por        %%xmm2,%%xmm1                  \n"  // R|B byte pairs
    "pand       %%xmm4,%%xmm0                  \n"  // isolate green
    "pmulhuw    %%xmm6,%%xmm0                  \n"  // widen green
    "por        %%xmm7,%%xmm0                  \n"  // A|G byte pairs
    "movdqa     %%xmm1,%%xmm2                  \n"
    "punpcklbw  %%xmm0,%%xmm1                  \n"  // interleave to BGRA
    "punpckhbw  %%xmm0,%%xmm2                  \n"
    "movdqa     %%xmm1,(%1,%0,2)               \n"
    "movdqa     %%xmm2,0x10(%1,%0,2)           \n"
    "lea        0x10(%0),%0                    \n"
    "sub        $0x8,%2                        \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
287
// Converts 8 ARGB1555 pixels to ARGB8888 per iteration.  The three 5-bit
// channels are widened by pmulhuw bit replication; the 1-bit alpha is
// sign-extended (psraw) to 0x00 or 0xFF.  Source loads are unaligned; dst
// must be 16-byte aligned; assumes pix is a positive multiple of 8.
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov        $0x1080108,%%eax               \n"  // 5 -> 8 bit replicate
    "movd       %%eax,%%xmm5                   \n"
    "pshufd     $0x0,%%xmm5,%%xmm5             \n"
    "mov        $0x42004200,%%eax              \n"  // green widen multiplier
    "movd       %%eax,%%xmm6                   \n"
    "pshufd     $0x0,%%xmm6,%%xmm6             \n"
    "pcmpeqb    %%xmm3,%%xmm3                  \n"  // mask 0xf800
    "psllw      $0xb,%%xmm3                    \n"
    "movdqa     %%xmm3,%%xmm4                  \n"  // mask 0x03e0 (green)
    "psrlw      $0x6,%%xmm4                    \n"
    "pcmpeqb    %%xmm7,%%xmm7                  \n"  // 0xff00 alpha byte mask
    "psllw      $0x8,%%xmm7                    \n"
    "sub        %0,%1                          \n"  // dst -= 2*src so stores
    "sub        %0,%1                          \n"  // can address (%1,%0,2)
  "1:                                          \n"
    "movdqu     (%0),%%xmm0                    \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "psllw      $0x1,%%xmm1                    \n"  // red into top 5 bits
    "psllw      $0xb,%%xmm2                    \n"  // blue into top 5 bits
    "pand       %%xmm3,%%xmm1                  \n"
    "pmulhuw    %%xmm5,%%xmm2                  \n"  // widen blue
    "pmulhuw    %%xmm5,%%xmm1                  \n"  // widen red
    "psllw      $0x8,%%xmm1                    \n"
    "por        %%xmm2,%%xmm1                  \n"  // R|B byte pairs
    "movdqa     %%xmm0,%%xmm2                  \n"
    "pand       %%xmm4,%%xmm0                  \n"
    "psraw      $0x8,%%xmm2                    \n"  // replicate alpha bit
    "pmulhuw    %%xmm6,%%xmm0                  \n"  // widen green
    "pand       %%xmm7,%%xmm2                  \n"
    "por        %%xmm2,%%xmm0                  \n"  // A|G byte pairs
    "movdqa     %%xmm1,%%xmm2                  \n"
    "punpcklbw  %%xmm0,%%xmm1                  \n"  // interleave to BGRA
    "punpckhbw  %%xmm0,%%xmm2                  \n"
    "movdqa     %%xmm1,(%1,%0,2)               \n"
    "movdqa     %%xmm2,0x10(%1,%0,2)           \n"
    "lea        0x10(%0),%0                    \n"
    "sub        $0x8,%2                        \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
339
// Converts 8 ARGB4444 pixels to ARGB8888 per iteration by replicating each
// 4-bit nibble into both halves of a byte.  Source loads are unaligned; dst
// must be 16-byte aligned; assumes pix is a positive multiple of 8.
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov        $0xf0f0f0f,%%eax               \n"  // low-nibble mask
    "movd       %%eax,%%xmm4                   \n"
    "pshufd     $0x0,%%xmm4,%%xmm4             \n"
    "movdqa     %%xmm4,%%xmm5                  \n"  // high-nibble mask
    "pslld      $0x4,%%xmm5                    \n"
    "sub        %0,%1                          \n"  // dst -= 2*src so stores
    "sub        %0,%1                          \n"  // can address (%1,%0,2)
  "1:                                          \n"
    "movdqu     (%0),%%xmm0                    \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "pand       %%xmm4,%%xmm0                  \n"  // low nibbles
    "pand       %%xmm5,%%xmm2                  \n"  // high nibbles
    "movdqa     %%xmm0,%%xmm1                  \n"
    "movdqa     %%xmm2,%%xmm3                  \n"
    "psllw      $0x4,%%xmm1                    \n"
    "psrlw      $0x4,%%xmm3                    \n"
    "por        %%xmm1,%%xmm0                  \n"  // nibble -> full byte
    "por        %%xmm3,%%xmm2                  \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "punpcklbw  %%xmm2,%%xmm0                  \n"  // interleave channels
    "punpckhbw  %%xmm2,%%xmm1                  \n"
    "movdqa     %%xmm0,(%1,%0,2)               \n"
    "movdqa     %%xmm1,0x10(%1,%0,2)           \n"
    "lea        0x10(%0),%0                    \n"
    "sub        $0x8,%2                        \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
378
// Packs 16 ARGB pixels (64 bytes) into 48 bytes of RGB24 per iteration:
// pshufb drops alpha within each 16-byte lane, then shifted ORs splice the
// 12-byte results into three contiguous stores.  Both src and dst must be
// 16-byte aligned; assumes pix is a positive multiple of 16.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa     %3,%%xmm6                      \n"  // ARGB -> RGB24 shuffle
  "1:                                          \n"
    "movdqa     (%0),%%xmm0                    \n"
    "movdqa     0x10(%0),%%xmm1                \n"
    "movdqa     0x20(%0),%%xmm2                \n"
    "movdqa     0x30(%0),%%xmm3                \n"
    "lea        0x40(%0),%0                    \n"
    "pshufb     %%xmm6,%%xmm0                  \n"  // 12 RGB bytes + 4 zeros
    "pshufb     %%xmm6,%%xmm1                  \n"
    "pshufb     %%xmm6,%%xmm2                  \n"
    "pshufb     %%xmm6,%%xmm3                  \n"
    "movdqa     %%xmm1,%%xmm4                  \n"
    "psrldq     $0x4,%%xmm1                    \n"  // splice lanes together
    "pslldq     $0xc,%%xmm4                    \n"
    "movdqa     %%xmm2,%%xmm5                  \n"
    "por        %%xmm4,%%xmm0                  \n"
    "pslldq     $0x8,%%xmm5                    \n"
    "movdqa     %%xmm0,(%1)                    \n"
    "por        %%xmm5,%%xmm1                  \n"
    "psrldq     $0x8,%%xmm2                    \n"
    "pslldq     $0x4,%%xmm3                    \n"
    "por        %%xmm3,%%xmm2                  \n"
    "movdqa     %%xmm1,0x10(%1)                \n"
    "movdqa     %%xmm2,0x20(%1)                \n"
    "lea        0x30(%1),%1                    \n"
    "sub        $0x10,%2                       \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
418
// Packs 16 ARGB pixels into 48 bytes of RAW (R/B swapped RGB24) per
// iteration.  Identical structure to ARGBToRGB24Row_SSSE3 except for the
// shuffle table.  Both src and dst must be 16-byte aligned; assumes pix is a
// positive multiple of 16.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa     %3,%%xmm6                      \n"  // ARGB -> RAW shuffle
  "1:                                          \n"
    "movdqa     (%0),%%xmm0                    \n"
    "movdqa     0x10(%0),%%xmm1                \n"
    "movdqa     0x20(%0),%%xmm2                \n"
    "movdqa     0x30(%0),%%xmm3                \n"
    "lea        0x40(%0),%0                    \n"
    "pshufb     %%xmm6,%%xmm0                  \n"  // 12 RAW bytes + 4 zeros
    "pshufb     %%xmm6,%%xmm1                  \n"
    "pshufb     %%xmm6,%%xmm2                  \n"
    "pshufb     %%xmm6,%%xmm3                  \n"
    "movdqa     %%xmm1,%%xmm4                  \n"
    "psrldq     $0x4,%%xmm1                    \n"  // splice lanes together
    "pslldq     $0xc,%%xmm4                    \n"
    "movdqa     %%xmm2,%%xmm5                  \n"
    "por        %%xmm4,%%xmm0                  \n"
    "pslldq     $0x8,%%xmm5                    \n"
    "movdqa     %%xmm0,(%1)                    \n"
    "por        %%xmm5,%%xmm1                  \n"
    "psrldq     $0x8,%%xmm2                    \n"
    "pslldq     $0x4,%%xmm3                    \n"
    "por        %%xmm3,%%xmm2                  \n"
    "movdqa     %%xmm1,0x10(%1)                \n"
    "movdqa     %%xmm2,0x20(%1)                \n"
    "lea        0x30(%1),%1                    \n"
    "sub        $0x10,%2                       \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
458
// Packs 4 ARGB pixels into 4 RGB565 values (8 bytes) per iteration by
// shifting each channel into place, masking, ORing and packssdw.
// src must be 16-byte aligned; assumes pix is a positive multiple of 4.
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb    %%xmm3,%%xmm3                  \n"  // 0x0000001f blue mask
    "psrld      $0x1b,%%xmm3                   \n"
    "pcmpeqb    %%xmm4,%%xmm4                  \n"  // 0x000007e0 green mask
    "psrld      $0x1a,%%xmm4                   \n"
    "pslld      $0x5,%%xmm4                    \n"
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // 0xfffff800 red mask
    "pslld      $0xb,%%xmm5                    \n"
  "1:                                          \n"
    "movdqa     (%0),%%xmm0                    \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "pslld      $0x8,%%xmm0                    \n"
    "psrld      $0x3,%%xmm1                    \n"  // blue to bits 4:0
    "psrld      $0x5,%%xmm2                    \n"  // green to bits 10:5
    "psrad      $0x10,%%xmm0                   \n"  // red to bits 15:11
    "pand       %%xmm3,%%xmm1                  \n"
    "pand       %%xmm4,%%xmm2                  \n"
    "pand       %%xmm5,%%xmm0                  \n"
    "por        %%xmm2,%%xmm1                  \n"
    "por        %%xmm1,%%xmm0                  \n"
    "packssdw   %%xmm0,%%xmm0                  \n"  // 4 dwords -> 4 words
    "lea        0x10(%0),%0                    \n"
    "movq       %%xmm0,(%1)                    \n"
    "lea        0x8(%1),%1                     \n"
    "sub        $0x4,%2                        \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
497
// Packs 4 ARGB pixels into 4 ARGB1555 values (8 bytes) per iteration:
// each channel truncated to 5 bits, alpha to its top bit.
// src must be 16-byte aligned; assumes pix is a positive multiple of 4.
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"  // 0x0000001f blue mask
    "psrld      $0x1b,%%xmm4                   \n"
    "movdqa     %%xmm4,%%xmm5                  \n"  // 0x000003e0 green mask
    "pslld      $0x5,%%xmm5                    \n"
    "movdqa     %%xmm4,%%xmm6                  \n"  // 0x00007c00 red mask
    "pslld      $0xa,%%xmm6                    \n"
    "pcmpeqb    %%xmm7,%%xmm7                  \n"  // 0xffff8000 alpha mask
    "pslld      $0xf,%%xmm7                    \n"
  "1:                                          \n"
    "movdqa     (%0),%%xmm0                    \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "movdqa     %%xmm0,%%xmm3                  \n"
    "psrad      $0x10,%%xmm0                   \n"  // alpha to bit 15
    "psrld      $0x3,%%xmm1                    \n"  // blue to bits 4:0
    "psrld      $0x6,%%xmm2                    \n"  // green to bits 9:5
    "psrld      $0x9,%%xmm3                    \n"  // red to bits 14:10
    "pand       %%xmm7,%%xmm0                  \n"
    "pand       %%xmm4,%%xmm1                  \n"
    "pand       %%xmm5,%%xmm2                  \n"
    "pand       %%xmm6,%%xmm3                  \n"
    "por        %%xmm1,%%xmm0                  \n"
    "por        %%xmm3,%%xmm2                  \n"
    "por        %%xmm2,%%xmm0                  \n"
    "packssdw   %%xmm0,%%xmm0                  \n"  // 4 dwords -> 4 words
    "lea        0x10(%0),%0                    \n"
    "movq       %%xmm0,(%1)                    \n"
    "lea        0x8(%1),%1                     \n"
    "sub        $0x4,%2                        \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
540
// Packs 4 ARGB pixels into 4 ARGB4444 values (8 bytes) per iteration by
// keeping the high nibble of each byte and merging adjacent bytes.
// src must be 16-byte aligned; assumes pix is a positive multiple of 4.
// NOTE(review): clobber list names xmm2 though it is unused — harmless.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"  // 0xf000 word mask
    "psllw      $0xc,%%xmm4                    \n"
    "movdqa     %%xmm4,%%xmm3                  \n"  // 0x00f0 word mask
    "psrlw      $0x8,%%xmm3                    \n"
  "1:                                          \n"
    "movdqa     (%0),%%xmm0                    \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "pand       %%xmm3,%%xmm0                  \n"  // low-byte high nibble
    "pand       %%xmm4,%%xmm1                  \n"  // high-byte high nibble
    "psrlq      $0x4,%%xmm0                    \n"
    "psrlq      $0x8,%%xmm1                    \n"
    "por        %%xmm1,%%xmm0                  \n"  // two nibbles per byte
    "packuswb   %%xmm0,%%xmm0                  \n"
    "lea        0x10(%0),%0                    \n"
    "movq       %%xmm0,(%1)                    \n"
    "lea        0x8(%1),%1                     \n"
    "sub        $0x4,%2                        \n"
    "ja         1b                             \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
571
572
// Converts 16 ARGB pixels to 16 luma (Y) bytes per iteration:
// pmaddubsw with kARGBToY, horizontal add, >>7, then +16 (kAddY16).
// Both src_argb and dst_y must be 16-byte aligned; assumes pix is a positive
// multiple of 16.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa     %4,%%xmm5                      \n"  // kAddY16
    "movdqa     %3,%%xmm4                      \n"  // kARGBToY
  "1:                                          \n"
    "movdqa     (%0),%%xmm0                    \n"
    "movdqa     0x10(%0),%%xmm1                \n"
    "movdqa     0x20(%0),%%xmm2                \n"
    "movdqa     0x30(%0),%%xmm3               \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"  // B*13 + G*65, R*33 + 0
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "lea        0x40(%0),%0                    \n"
    "phaddw     %%xmm1,%%xmm0                  \n"  // sum pairs per pixel
    "phaddw     %%xmm3,%%xmm2                  \n"
    "psrlw      $0x7,%%xmm0                    \n"  // drop fixed-point bits
    "psrlw      $0x7,%%xmm2                    \n"
    "packuswb   %%xmm2,%%xmm0                  \n"
    "paddb      %%xmm5,%%xmm0                  \n"  // + 16
    "movdqa     %%xmm0,(%1)                    \n"
    "lea        0x10(%1),%1                    \n"
    "sub        $0x10,%2                       \n"
    "ja         1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000608
// Unaligned variant of ARGBToYRow_SSSE3: identical math, but movdqu is used
// for the loads and the store, so src_argb and dst_y may be unaligned.
// Assumes pix is a positive multiple of 16.
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa     %4,%%xmm5                      \n"  // kAddY16
    "movdqa     %3,%%xmm4                      \n"  // kARGBToY
  "1:                                          \n"
    "movdqu     (%0),%%xmm0                    \n"
    "movdqu     0x10(%0),%%xmm1                \n"
    "movdqu     0x20(%0),%%xmm2                \n"
    "movdqu     0x30(%0),%%xmm3                \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"  // weighted channel sums
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "lea        0x40(%0),%0                    \n"
    "phaddw     %%xmm1,%%xmm0                  \n"
    "phaddw     %%xmm3,%%xmm2                  \n"
    "psrlw      $0x7,%%xmm0                    \n"
    "psrlw      $0x7,%%xmm2                    \n"
    "packuswb   %%xmm2,%%xmm0                  \n"
    "paddb      %%xmm5,%%xmm0                  \n"  // + 16
    "movdqu     %%xmm0,(%1)                    \n"
    "lea        0x10(%1),%1                    \n"
    "sub        $0x10,%2                       \n"
    "ja         1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000644#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +0000645
fbarchard@google.comb6149762011-11-07 21:58:52 +0000646#ifdef HAS_ARGBTOUVROW_SSSE3
// Converts 16 ARGB pixels (over two rows) to 8 U and 8 V samples per
// iteration: 2x2 box subsample via pavgb with the next row (src_stride_argb)
// and shufps, then pmaddubsw with the U/V coefficient tables and +128 bias.
// Buffers must be 16-byte aligned; assumes width is a positive multiple of 16.
// NOTE(review): the constants are loaded into xmm3/xmm4/xmm5 in a separate
// asm statement and assumed to survive into the second one — the compiler
// emits no xmm code between them, but this is fragile by design.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa     %0,%%xmm4                      \n"  // kARGBToU
    "movdqa     %1,%%xmm3                      \n"  // kARGBToV
    "movdqa     %2,%%xmm5                      \n"  // kAddUV128
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub        %1,%2                          \n"  // dst_v = dst_u offset
  "1:                                          \n"
    "movdqa     (%0),%%xmm0                    \n"
    "movdqa     0x10(%0),%%xmm1                \n"
    "movdqa     0x20(%0),%%xmm2                \n"
    "movdqa     0x30(%0),%%xmm6                \n"
    "pavgb      (%0,%4,1),%%xmm0               \n"  // average with next row
    "pavgb      0x10(%0,%4,1),%%xmm1           \n"
    "pavgb      0x20(%0,%4,1),%%xmm2           \n"
    "pavgb      0x30(%0,%4,1),%%xmm6           \n"
    "lea        0x40(%0),%0                    \n"
    "movdqa     %%xmm0,%%xmm7                  \n"
    "shufps     $0x88,%%xmm1,%%xmm0            \n"  // even pixels
    "shufps     $0xdd,%%xmm1,%%xmm7            \n"  // odd pixels
    "pavgb      %%xmm7,%%xmm0                  \n"  // horizontal average
    "movdqa     %%xmm2,%%xmm7                  \n"
    "shufps     $0x88,%%xmm6,%%xmm2            \n"
    "shufps     $0xdd,%%xmm6,%%xmm7            \n"
    "pavgb      %%xmm7,%%xmm2                  \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "movdqa     %%xmm2,%%xmm6                  \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"  // U terms
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm3,%%xmm1                  \n"  // V terms
    "pmaddubsw  %%xmm3,%%xmm6                  \n"
    "phaddw     %%xmm2,%%xmm0                  \n"
    "phaddw     %%xmm6,%%xmm1                  \n"
    "psraw      $0x8,%%xmm0                    \n"  // scale down (signed)
    "psraw      $0x8,%%xmm1                    \n"
    "packsswb   %%xmm1,%%xmm0                  \n"  // low 8 = U, high 8 = V
    "paddb      %%xmm5,%%xmm0                  \n"  // recenter at 128
    "movlps     %%xmm0,(%1)                    \n"
    "movhps     %%xmm0,(%1,%2,1)               \n"
    "lea        0x8(%1),%1                     \n"
    "sub        $0x10,%3                       \n"
    "ja         1b                             \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000710
// Unaligned variant of ARGBToUVRow_SSSE3: same math, but movdqu loads (the
// second-row values are staged through xmm7 because pavgb cannot take an
// unaligned memory operand).  Assumes width is a positive multiple of 16.
// NOTE(review): like the aligned version, relies on xmm3/xmm4/xmm5 surviving
// between the two asm statements.
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa     %0,%%xmm4                      \n"  // kARGBToU
    "movdqa     %1,%%xmm3                      \n"  // kARGBToV
    "movdqa     %2,%%xmm5                      \n"  // kAddUV128
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub        %1,%2                          \n"  // dst_v = dst_u offset
  "1:                                          \n"
    "movdqu     (%0),%%xmm0                    \n"
    "movdqu     0x10(%0),%%xmm1                \n"
    "movdqu     0x20(%0),%%xmm2                \n"
    "movdqu     0x30(%0),%%xmm6                \n"
    "movdqu     (%0,%4,1),%%xmm7               \n"  // average with next row
    "pavgb      %%xmm7,%%xmm0                  \n"
    "movdqu     0x10(%0,%4,1),%%xmm7           \n"
    "pavgb      %%xmm7,%%xmm1                  \n"
    "movdqu     0x20(%0,%4,1),%%xmm7           \n"
    "pavgb      %%xmm7,%%xmm2                  \n"
    "movdqu     0x30(%0,%4,1),%%xmm7           \n"
    "pavgb      %%xmm7,%%xmm6                  \n"
    "lea        0x40(%0),%0                    \n"
    "movdqa     %%xmm0,%%xmm7                  \n"
    "shufps     $0x88,%%xmm1,%%xmm0            \n"  // even pixels
    "shufps     $0xdd,%%xmm1,%%xmm7            \n"  // odd pixels
    "pavgb      %%xmm7,%%xmm0                  \n"  // horizontal average
    "movdqa     %%xmm2,%%xmm7                  \n"
    "shufps     $0x88,%%xmm6,%%xmm2            \n"
    "shufps     $0xdd,%%xmm6,%%xmm7            \n"
    "pavgb      %%xmm7,%%xmm2                  \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "movdqa     %%xmm2,%%xmm6                  \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"  // U terms
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm3,%%xmm1                  \n"  // V terms
    "pmaddubsw  %%xmm3,%%xmm6                  \n"
    "phaddw     %%xmm2,%%xmm0                  \n"
    "phaddw     %%xmm6,%%xmm1                  \n"
    "psraw      $0x8,%%xmm0                    \n"
    "psraw      $0x8,%%xmm1                    \n"
    "packsswb   %%xmm1,%%xmm0                  \n"  // low 8 = U, high 8 = V
    "paddb      %%xmm5,%%xmm0                  \n"  // recenter at 128
    "movlps     %%xmm0,(%1)                    \n"
    "movhps     %%xmm0,(%1,%2,1)               \n"
    "lea        0x8(%1),%1                     \n"
    "sub        $0x10,%3                       \n"
    "ja         1b                             \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000778#endif
779
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000780#ifdef HAS_I420TOARGBROW_SSSE3
// YUV -> RGB conversion coefficients in 6-bit fixed point (scaled by 64).
// NOTE(review): the min(63,...) comment below looks stale — 2.018*64 = 129
// overflows int8 and the value used is 127; confirm intent.
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias subtracted from the pmaddubsw results, compensating for U/V input
// bytes being centered at 128.
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

// The YUV row converters below are register-hungry; on 32-bit non-Apple
// builds force frame-pointer omission to free a register.  Apple and x86-64
// targets don't need the attribute.
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000801
// Packed YUV -> RGB conversion tables.  The inline asm addresses each member
// at a fixed byte offset from &kYuvConstants.kUVToB (0, 16, 32, 48, 64, 80,
// 96, 112), so the member order and types must not change.
struct {
  vec8 kUVToB;      // U/V coefficients for blue, interleaved U,V pairs.
  vec8 kUVToG;      // U/V coefficients for green.
  vec8 kUVToR;      // U/V coefficients for red.
  vec16 kUVBiasB;   // Bias correcting the +128 centering of U/V, per channel.
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;    // 16 subtracted from Y before scaling.
  vec16 kYToRgb;    // Y gain (1.164 in 6-bit fixed point).
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};
821
// Convert 8 pixels: 4 UV and 8 Y.
// Reads 4 U bytes from %1 and 4 V bytes from (%1,%2,1) (callers set
// %2 = v_buf - u_buf beforehand), interleaves and duplicates each U,V pair
// across two pixels, multiplies by the kUVToB/G/R tables at 0/16/32(%5) and
// subtracts the biases at 48/64/80(%5).  Then reads 8 Y bytes from %0,
// widens them (requires xmm4 == 0 on entry), applies (Y - 16) * YG from
// 96/112(%5), adds the result to each channel, shifts out the 6 fixed-point
// bits and packs to unsigned bytes.  On exit: xmm0 = B, xmm1 = G, xmm2 = R
// (8 pixels each, in the low 8 bytes).
#define YUVTORGB \
    "movd       (%1),%%xmm0                    \n" \
    "movd       (%1,%2,1),%%xmm1               \n" \
    "lea        0x4(%1),%1                     \n" \
    "punpcklbw  %%xmm1,%%xmm0                  \n" \
    "punpcklwd  %%xmm0,%%xmm0                  \n" \
    "movdqa     %%xmm0,%%xmm1                  \n" \
    "movdqa     %%xmm0,%%xmm2                  \n" \
    "pmaddubsw  (%5),%%xmm0                    \n" \
    "pmaddubsw  16(%5),%%xmm1                  \n" \
    "pmaddubsw  32(%5),%%xmm2                  \n" \
    "psubw      48(%5),%%xmm0                  \n" \
    "psubw      64(%5),%%xmm1                  \n" \
    "psubw      80(%5),%%xmm2                  \n" \
    "movq       (%0),%%xmm3                    \n" \
    "lea        0x8(%0),%0                     \n" \
    "punpcklbw  %%xmm4,%%xmm3                  \n" \
    "psubsw     96(%5),%%xmm3                  \n" \
    "pmullw     112(%5),%%xmm3                 \n" \
    "paddsw     %%xmm3,%%xmm0                  \n" \
    "paddsw     %%xmm3,%%xmm1                  \n" \
    "paddsw     %%xmm3,%%xmm2                  \n" \
    "psraw      $0x6,%%xmm0                    \n" \
    "psraw      $0x6,%%xmm1                    \n" \
    "psraw      $0x6,%%xmm2                    \n" \
    "packuswb   %%xmm0,%%xmm0                  \n" \
    "packuswb   %%xmm1,%%xmm1                  \n" \
    "packuswb   %%xmm2,%%xmm2                  \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000851
// Converts 8 I420 pixels (8 Y, 4 shared U, 4 shared V) to ARGB per iteration
// using YUVTORGB, interleaving B,G,R with alpha = 0xFF (xmm5).
// rgb_buf must be 16-byte aligned; assumes width is a positive multiple of 8.
void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub        %1,%2                          \n"  // %2 = v - u for YUVTORGB
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // alpha bytes 0xff
    "pxor       %%xmm4,%%xmm4                  \n"  // zero, required by macro
  "1:                                          \n"
  YUVTORGB
    "punpcklbw  %%xmm1,%%xmm0                  \n"  // B,G pairs
    "punpcklbw  %%xmm5,%%xmm2                  \n"  // R,A pairs
    "movdqa     %%xmm0,%%xmm1                  \n"
    "punpcklwd  %%xmm2,%%xmm0                  \n"  // BGRA pixels 0-3
    "punpckhwd  %%xmm2,%%xmm1                  \n"  // BGRA pixels 4-7
    "movdqa     %%xmm0,(%3)                    \n"
    "movdqa     %%xmm1,0x10(%3)                \n"
    "lea        0x20(%3),%3                    \n"
    "sub        $0x8,%4                        \n"
    "ja         1b                             \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
885
// Converts 8 I420 pixels to BGRA byte order per iteration using YUVTORGB.
// xmm5 is re-set to all-ones after the macro (the macro leaves it intact,
// but it is reloaded before use as the alpha source).
// rgb_buf must be 16-byte aligned; assumes width is a positive multiple of 8.
void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub        %1,%2                          \n"  // %2 = v - u for YUVTORGB
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // alpha bytes 0xff
    "pxor       %%xmm4,%%xmm4                  \n"  // zero, required by macro
  "1:                                          \n"
  YUVTORGB
    "pcmpeqb    %%xmm5,%%xmm5                  \n"
    "punpcklbw  %%xmm0,%%xmm1                  \n"  // G,B pairs
    "punpcklbw  %%xmm2,%%xmm5                  \n"  // A,R pairs
    "movdqa     %%xmm5,%%xmm0                  \n"
    "punpcklwd  %%xmm1,%%xmm5                  \n"  // ARGB-reversed pixels 0-3
    "punpckhwd  %%xmm1,%%xmm0                  \n"  // pixels 4-7
    "movdqa     %%xmm5,(%3)                    \n"
    "movdqa     %%xmm0,0x10(%3)                \n"
    "lea        0x20(%3),%3                    \n"
    "sub        $0x8,%4                        \n"
    "ja         1b                             \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
920
// Converts 8 I420 pixels to ABGR byte order per iteration using YUVTORGB:
// same as I420ToARGBRow_SSSE3 but with the R and B channels swapped in the
// interleave.  rgb_buf must be 16-byte aligned; assumes width is a positive
// multiple of 8.
void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub        %1,%2                          \n"  // %2 = v - u for YUVTORGB
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // alpha bytes 0xff
    "pxor       %%xmm4,%%xmm4                  \n"  // zero, required by macro
  "1:                                          \n"
  YUVTORGB
    "punpcklbw  %%xmm1,%%xmm2                  \n"  // R,G pairs
    "punpcklbw  %%xmm5,%%xmm0                  \n"  // B,A pairs
    "movdqa     %%xmm2,%%xmm1                  \n"
    "punpcklwd  %%xmm0,%%xmm2                  \n"  // RGBA pixels 0-3
    "punpckhwd  %%xmm0,%%xmm1                  \n"  // RGBA pixels 4-7
    "movdqa     %%xmm2,(%3)                    \n"
    "movdqa     %%xmm1,0x10(%3)                \n"
    "lea        0x20(%3),%3                    \n"
    "sub        $0x8,%4                        \n"
    "ja         1b                             \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
954
// Converts 4 I444 pixels (one U and one V per pixel — no subsampling, so no
// UV duplication step) to ARGB per iteration.  The body mirrors YUVTORGB but
// reads only 4 Y and omits the punpcklwd UV widening.
// rgb_buf must be 16-byte aligned; assumes width is a positive multiple of 4.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub        %1,%2                          \n"  // %2 = v - u
    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // alpha bytes 0xff
    "pxor       %%xmm4,%%xmm4                  \n"  // zero for unpack
  "1:                                          \n"
    "movd       (%1),%%xmm0                    \n"  // 4 U
    "movd       (%1,%2,1),%%xmm1               \n"  // 4 V
    "lea        0x4(%1),%1                     \n"
    "punpcklbw  %%xmm1,%%xmm0                  \n"  // U,V pairs
    "movdqa     %%xmm0,%%xmm1                  \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "pmaddubsw  (%5),%%xmm0                    \n"  // kUVToB
    "pmaddubsw  16(%5),%%xmm1                  \n"  // kUVToG
    "pmaddubsw  32(%5),%%xmm2                  \n"  // kUVToR
    "psubw      48(%5),%%xmm0                  \n"  // subtract UV biases
    "psubw      64(%5),%%xmm1                  \n"
    "psubw      80(%5),%%xmm2                  \n"
    "movd       (%0),%%xmm3                    \n"  // 4 Y
    "lea        0x4(%0),%0                     \n"
    "punpcklbw  %%xmm4,%%xmm3                  \n"
    "psubsw     96(%5),%%xmm3                  \n"  // (Y - 16)
    "pmullw     112(%5),%%xmm3                 \n"  // * YG
    "paddsw     %%xmm3,%%xmm0                  \n"
    "paddsw     %%xmm3,%%xmm1                  \n"
    "paddsw     %%xmm3,%%xmm2                  \n"
    "psraw      $0x6,%%xmm0                    \n"  // remove fixed point
    "psraw      $0x6,%%xmm1                    \n"
    "psraw      $0x6,%%xmm2                    \n"
    "packuswb   %%xmm0,%%xmm0                  \n"
    "packuswb   %%xmm1,%%xmm1                  \n"
    "packuswb   %%xmm2,%%xmm2                  \n"
    "punpcklbw  %%xmm1,%%xmm0                  \n"  // B,G pairs
    "punpcklbw  %%xmm5,%%xmm2                  \n"  // R,A pairs
    "punpcklwd  %%xmm2,%%xmm0                  \n"  // BGRA pixels 0-3
    "movdqa     %%xmm0,(%3)                    \n"
    "lea        0x10(%3),%3                    \n"
    "sub        $0x4,%4                        \n"
    "ja         1b                             \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1010#endif
1011
1012#ifdef HAS_YTOARGBROW_SSE2
// Expand a row of luma-only input into grey ARGB pixels, 8 per loop.
// Each output channel is roughly (Y - 16) * 1.164 (BT.601 range expansion),
// replicated into B, G and R, with alpha forced to 0xFF.
// Requires rgb_buf 16-byte aligned and width a multiple of 8.
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xFF000000 per dword (alpha mask)
    "mov       $0x10001000,%%eax               \n"
    "movd      %%eax,%%xmm3                    \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // xmm3 = 0x1000 (16 << 8) in every word: Y bias
    "mov       $0x012a012a,%%eax               \n"
    "movd      %%eax,%%xmm2                    \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"  // xmm2 = 0x012a (298 ~= 1.164 * 256) in every word
  "1:                                          \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq      (%0),%%xmm0                     \n"  // 8 Y bytes
    "lea       0x8(%0),%0                      \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate Y into both bytes of each word
    "psubusw   %%xmm3,%%xmm0                   \n"  // subtract bias; unsigned saturation clamps at 0
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of product: scaled grey value
    "packuswb  %%xmm0,%%xmm0                   \n"  // back to 8 bytes

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0                   \n"  // each grey byte doubled
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // first 4 pixels: grey in all colour bytes
    "punpckhwd %%xmm1,%%xmm1                   \n"  // last 4 pixels
    "por       %%xmm4,%%xmm0                   \n"  // force alpha to 0xFF
    "por       %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "movdqa    %%xmm1,16(%1)                   \n"
    "lea       32(%1),%1                       \n"  // advance dst by 8 pixels

    "sub       $0x8,%2                         \n"
    "ja        1b                              \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
fbarchard@google.comb6149762011-11-07 21:58:52 +00001057#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001058
frkoenig@google.come5185422011-11-07 23:07:57 +00001059#ifdef HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001060void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1061 SIMD_ALIGNED(uint8 row[kMaxStride]);
1062 ABGRToARGBRow_SSSE3(src_argb, row, pix);
1063 ARGBToYRow_SSSE3(row, dst_y, pix);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001064}
1065
fbarchard@google.comb6149762011-11-07 21:58:52 +00001066void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1067 SIMD_ALIGNED(uint8 row[kMaxStride]);
1068 BGRAToARGBRow_SSSE3(src_argb, row, pix);
1069 ARGBToYRow_SSSE3(row, dst_y, pix);
mikhal@webrtc.org43575c82011-10-12 18:49:21 +00001070}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +00001071
1072void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1073 SIMD_ALIGNED(uint8 row[kMaxStride]);
1074 ABGRToARGBRow_C(src_argb, row, pix);
1075 ARGBToYRow_SSSE3(row, dst_y, pix);
1076}
1077
1078void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1079 SIMD_ALIGNED(uint8 row[kMaxStride]);
1080 BGRAToARGBRow_C(src_argb, row, pix);
1081 ARGBToYRow_SSSE3(row, dst_y, pix);
1082}
frkoenig@google.come5185422011-11-07 23:07:57 +00001083#endif
mikhal@webrtc.org43575c82011-10-12 18:49:21 +00001084
fbarchard@google.comb6149762011-11-07 21:58:52 +00001085#ifdef HAS_ARGBTOUVROW_SSSE3
1086void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
1087 uint8* dst_u, uint8* dst_v, int pix) {
1088 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
1089 ABGRToARGBRow_SSSE3(src_argb, row, pix);
1090 ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
1091 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
mikhal@webrtc.org43575c82011-10-12 18:49:21 +00001092}
1093
fbarchard@google.comb6149762011-11-07 21:58:52 +00001094void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
1095 uint8* dst_u, uint8* dst_v, int pix) {
1096 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
1097 BGRAToARGBRow_SSSE3(src_argb, row, pix);
1098 BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
1099 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001100}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +00001101
1102void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
1103 uint8* dst_u, uint8* dst_v, int pix) {
1104 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
1105 ABGRToARGBRow_C(src_argb, row, pix);
1106 ABGRToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
1107 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
1108}
1109
1110void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
1111 uint8* dst_u, uint8* dst_v, int pix) {
1112 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
1113 BGRAToARGBRow_C(src_argb, row, pix);
1114 BGRAToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
1115 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
1116}
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001117#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +00001118
fbarchard@google.com42831e02012-01-21 02:54:17 +00001119#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001120
1121// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001122CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001123 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1124};
1125
// Reverse a row of bytes, 16 at a time, with a single pshufb per vector.
// Reads the source from the tail backwards and writes the destination
// forwards. Requires src and dst 16-byte aligned, width a multiple of 16.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);  // pointer-width so it can be used in addressing
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"  // byte-reversal shuffle control (kShuffleMirror)
    "lea       -0x10(%0),%0                    \n"  // bias src so (%0,%2) is the last 16 source bytes
  "1:                                          \n"
    "movdqa    (%0,%2),%%xmm0                  \n"  // load 16 bytes from the (shrinking) tail
    "pshufb    %%xmm5,%%xmm0                   \n"  // reverse their order
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "ja        1b                              \n"  // loops on the flags from the sub above
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
1148#endif
1149
fbarchard@google.com42831e02012-01-21 02:54:17 +00001150#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001151
// Reverse a row of bytes without SSSE3's pshufb: swap bytes within each
// 16-bit word via shifts, then reverse the words with pshuflw/pshufhw and
// swap the two 64-bit halves with pshufd. Uses unaligned loads/stores, so
// only width must be a multiple of 16.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);  // pointer-width so it can be used in addressing
  asm volatile (
    "lea       -0x10(%0),%0                    \n"  // bias src so (%0,%2) is the last 16 source bytes
  "1:                                          \n"
    "movdqu    (%0,%2),%%xmm0                  \n"  // load 16 bytes from the (shrinking) tail
    "movdqa    %%xmm0,%%xmm1                   \n"
    "psllw     $0x8,%%xmm0                     \n"  // low byte of each word -> high position
    "psrlw     $0x8,%%xmm1                     \n"  // high byte of each word -> low position
    "por       %%xmm1,%%xmm0                   \n"  // bytes swapped within every word
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"  // reverse the 4 words of the low half
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"  // reverse the 4 words of the high half
    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"  // swap the two halves
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "ja        1b                              \n"  // loops on the flags from the sub above
  : "+r"(src),        // %0
    "+r"(dst),        // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
1179#endif
1180
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001181#ifdef HAS_SPLITUV_SSE2
// De-interleave a packed UV row: even bytes go to dst_u, odd bytes to dst_v.
// Processes 16 U/V pairs (32 source bytes) per loop. Requires 16-byte
// aligned pointers and pix a multiple of 16.
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00FF per word (even-byte mask)
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u; V row stored at (%1,%2)
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"  // 32 interleaved UV bytes
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm2                   \n"  // keep copies for the odd-byte half
    "movdqa    %%xmm1,%%xmm3                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // isolate even bytes (U)
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 U bytes
    "psrlw     $0x8,%%xmm2                     \n"  // isolate odd bytes (V)
    "psrlw     $0x8,%%xmm3                     \n"
    "packuswb  %%xmm3,%%xmm2                   \n"  // 16 V bytes
    "movdqa    %%xmm0,(%1)                     \n"
    "movdqa    %%xmm2,(%1,%2)                  \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x10,%3                        \n"
    "ja        1b                              \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
1215#endif
1216
fbarchard@google.com19932f82012-02-16 22:19:14 +00001217#ifdef HAS_COPYROW_SSE2
// Copy count bytes, 32 per loop, with aligned SSE2 loads and stores.
// Requires src and dst 16-byte aligned and count a multiple of 32.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = dst - src; stores use (%0,%1) so only one pointer advances
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    %%xmm0,(%0,%1)                  \n"
    "movdqa    %%xmm1,0x10(%0,%1)              \n"
    "lea       0x20(%0),%0                     \n"
    "sub       $0x20,%2                        \n"
    "ja        1b                              \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(count)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
1239#endif // HAS_COPYROW_SSE2
1240
1241#ifdef HAS_COPYROW_X86
// Copy width bytes using rep movsl (4 bytes per step).
// Assumes width is a multiple of 4 — the shr discards the low two bits, so
// any remainder bytes are NOT copied.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = static_cast<size_t>(width);  // register-width counter for rcx/ecx
  asm volatile (
    "shr       $0x2,%2                         \n"  // byte count -> dword count
    "rep movsl                                 \n"  // copies via the string registers below
  : "+S"(src),       // %0 - source index register (esi/rsi)
    "+D"(dst),       // %1 - destination index register (edi/rdi)
    "+c"(width_tmp)  // %2 - count register (ecx/rcx)
  :
  : "memory", "cc"
  );
}
1254#endif
1255
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001256#ifdef HAS_YUY2TOYROW_SSE2
// Extract luma from a YUY2 row (byte order Y0 U0 Y1 V0 ...): luma is the
// even bytes, masked out and repacked. Produces 16 Y bytes per loop.
// Requires 16-byte aligned pointers and pix a multiple of 16.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00FF per word (even-byte mask)
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"  // 32 YUY2 bytes = 16 pixels
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep the Y bytes
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 Y bytes
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "ja        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
1282
1283void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
1284 uint8* dst_u, uint8* dst_y, int pix) {
1285 asm volatile (
1286 "pcmpeqb %%xmm5,%%xmm5 \n"
1287 "psrlw $0x8,%%xmm5 \n"
1288 "sub %1,%2 \n"
1289 "1: \n"
1290 "movdqa (%0),%%xmm0 \n"
1291 "movdqa 0x10(%0),%%xmm1 \n"
1292 "movdqa (%0,%4,1),%%xmm2 \n"
1293 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1294 "lea 0x20(%0),%0 \n"
1295 "pavgb %%xmm2,%%xmm0 \n"
1296 "pavgb %%xmm3,%%xmm1 \n"
1297 "psrlw $0x8,%%xmm0 \n"
1298 "psrlw $0x8,%%xmm1 \n"
1299 "packuswb %%xmm1,%%xmm0 \n"
1300 "movdqa %%xmm0,%%xmm1 \n"
1301 "pand %%xmm5,%%xmm0 \n"
1302 "packuswb %%xmm0,%%xmm0 \n"
1303 "psrlw $0x8,%%xmm1 \n"
1304 "packuswb %%xmm1,%%xmm1 \n"
1305 "movq %%xmm0,(%1) \n"
1306 "movq %%xmm1,(%1,%2) \n"
1307 "lea 0x8(%1),%1 \n"
1308 "sub $0x10,%3 \n"
1309 "ja 1b \n"
1310 : "+r"(src_yuy2), // %0
1311 "+r"(dst_u), // %1
1312 "+r"(dst_y), // %2
1313 "+r"(pix) // %3
1314 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1315 : "memory", "cc"
1316#if defined(__SSE2__)
1317 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1318#endif
1319 );
1320}
1321
// Unaligned variant of YUY2ToYRow_SSE2: identical even-byte extraction, but
// with movdqu loads/stores so no pointer alignment is required.
// pix must still be a multiple of 16.
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00FF per word (even-byte mask)
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"  // 32 YUY2 bytes = 16 pixels
    "movdqu    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "pand      %%xmm5,%%xmm0                   \n"  // keep the Y bytes
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 Y bytes
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "ja        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
1348
1349void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
1350 int stride_yuy2,
1351 uint8* dst_u, uint8* dst_y,
1352 int pix) {
1353 asm volatile (
1354 "pcmpeqb %%xmm5,%%xmm5 \n"
1355 "psrlw $0x8,%%xmm5 \n"
1356 "sub %1,%2 \n"
1357 "1: \n"
1358 "movdqu (%0),%%xmm0 \n"
1359 "movdqu 0x10(%0),%%xmm1 \n"
1360 "movdqu (%0,%4,1),%%xmm2 \n"
1361 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1362 "lea 0x20(%0),%0 \n"
1363 "pavgb %%xmm2,%%xmm0 \n"
1364 "pavgb %%xmm3,%%xmm1 \n"
1365 "psrlw $0x8,%%xmm0 \n"
1366 "psrlw $0x8,%%xmm1 \n"
1367 "packuswb %%xmm1,%%xmm0 \n"
1368 "movdqa %%xmm0,%%xmm1 \n"
1369 "pand %%xmm5,%%xmm0 \n"
1370 "packuswb %%xmm0,%%xmm0 \n"
1371 "psrlw $0x8,%%xmm1 \n"
1372 "packuswb %%xmm1,%%xmm1 \n"
1373 "movq %%xmm0,(%1) \n"
1374 "movq %%xmm1,(%1,%2) \n"
1375 "lea 0x8(%1),%1 \n"
1376 "sub $0x10,%3 \n"
1377 "ja 1b \n"
1378 : "+r"(src_yuy2), // %0
1379 "+r"(dst_u), // %1
1380 "+r"(dst_y), // %2
1381 "+r"(pix) // %3
1382 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1383 : "memory", "cc"
1384#if defined(__SSE2__)
1385 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1386#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001387 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001388}
1389
// Extract luma from a UYVY row (byte order U0 Y0 V0 Y1 ...): luma is the
// odd bytes, extracted with a word shift. Produces 16 Y bytes per loop.
// Requires 16-byte aligned pointers and pix a multiple of 16.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"  // 32 UYVY bytes = 16 pixels
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep the odd (Y) bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 Y bytes
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "ja        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
1413
1414void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
1415 uint8* dst_u, uint8* dst_y, int pix) {
1416 asm volatile (
1417 "pcmpeqb %%xmm5,%%xmm5 \n"
1418 "psrlw $0x8,%%xmm5 \n"
1419 "sub %1,%2 \n"
1420 "1: \n"
1421 "movdqa (%0),%%xmm0 \n"
1422 "movdqa 0x10(%0),%%xmm1 \n"
1423 "movdqa (%0,%4,1),%%xmm2 \n"
1424 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1425 "lea 0x20(%0),%0 \n"
1426 "pavgb %%xmm2,%%xmm0 \n"
1427 "pavgb %%xmm3,%%xmm1 \n"
1428 "pand %%xmm5,%%xmm0 \n"
1429 "pand %%xmm5,%%xmm1 \n"
1430 "packuswb %%xmm1,%%xmm0 \n"
1431 "movdqa %%xmm0,%%xmm1 \n"
1432 "pand %%xmm5,%%xmm0 \n"
1433 "packuswb %%xmm0,%%xmm0 \n"
1434 "psrlw $0x8,%%xmm1 \n"
1435 "packuswb %%xmm1,%%xmm1 \n"
1436 "movq %%xmm0,(%1) \n"
1437 "movq %%xmm1,(%1,%2) \n"
1438 "lea 0x8(%1),%1 \n"
1439 "sub $0x10,%3 \n"
1440 "ja 1b \n"
1441 : "+r"(src_uyvy), // %0
1442 "+r"(dst_u), // %1
1443 "+r"(dst_y), // %2
1444 "+r"(pix) // %3
1445 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
1446 : "memory", "cc"
1447#if defined(__SSE2__)
1448 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1449#endif
1450 );
1451}
1452
// Unaligned variant of UYVYToYRow_SSE2: identical odd-byte (luma) extraction
// but with movdqu loads/stores so no pointer alignment is required.
// pix must still be a multiple of 16.
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"  // 32 UYVY bytes = 16 pixels
    "movdqu    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep the odd (Y) bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 Y bytes
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "ja        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
1477
1478void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
1479 uint8* dst_u, uint8* dst_y, int pix) {
1480 asm volatile (
1481 "pcmpeqb %%xmm5,%%xmm5 \n"
1482 "psrlw $0x8,%%xmm5 \n"
1483 "sub %1,%2 \n"
1484 "1: \n"
1485 "movdqu (%0),%%xmm0 \n"
1486 "movdqu 0x10(%0),%%xmm1 \n"
1487 "movdqu (%0,%4,1),%%xmm2 \n"
1488 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1489 "lea 0x20(%0),%0 \n"
1490 "pavgb %%xmm2,%%xmm0 \n"
1491 "pavgb %%xmm3,%%xmm1 \n"
1492 "pand %%xmm5,%%xmm0 \n"
1493 "pand %%xmm5,%%xmm1 \n"
1494 "packuswb %%xmm1,%%xmm0 \n"
1495 "movdqa %%xmm0,%%xmm1 \n"
1496 "pand %%xmm5,%%xmm0 \n"
1497 "packuswb %%xmm0,%%xmm0 \n"
1498 "psrlw $0x8,%%xmm1 \n"
1499 "packuswb %%xmm1,%%xmm1 \n"
1500 "movq %%xmm0,(%1) \n"
1501 "movq %%xmm1,(%1,%2) \n"
1502 "lea 0x8(%1),%1 \n"
1503 "sub $0x10,%3 \n"
1504 "ja 1b \n"
1505 : "+r"(src_uyvy), // %0
1506 "+r"(dst_u), // %1
1507 "+r"(dst_y), // %2
1508 "+r"(pix) // %3
1509 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
1510 : "memory", "cc"
1511#if defined(__SSE2__)
1512 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1513#endif
1514 );
1515}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001516#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001517
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001518#endif // defined(__x86_64__) || defined(__i386__)
1519
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00001520#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001521} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00001522} // namespace libyuv
1523#endif