blob: e0148b7250eeb39d1a918ce7c23f72cf6d9b1b99 [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "row.h"
12
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000020#ifdef __APPLE__
21#define CONST
22#else
23#define CONST static const
24#endif
25
// ARGB -> U/V conversion constants for the SSSE3 pmaddubsw stage in
// ARGBToUVRow_SSSE3.  Each table repeats a {B, G, R, A} weight quad four
// times to fill one 128-bit register; the products are summed per pixel,
// arithmetic-shifted right by 8, packed, then re-centered with kAddUV128.
// NOTE(review): kARGBToU is declared vec8 (signed) while kARGBToV is uvec8
// (unsigned) despite holding negative initializers -- presumably both should
// be the signed vec8; confirm against the typedefs in row.h.
fbarchard@google.comb6149762011-11-07 21:58:52 +000026#ifdef HAS_ARGBTOUVROW_SSSE3
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000027CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000028 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 29};
 30
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000031CONST uvec8 kARGBToV = {
fbarchard@google.com2430e042011-11-11 21:57:06 +000032 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
fbarchard@google.comb6149762011-11-07 21:58:52 +000033};
fbarchard@google.com2430e042011-11-11 21:57:06 +000034
// Bias added with paddb after the signed >>8 stage to re-center U/V on 128.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000035CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000036 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
 37 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 38};
 39#endif
40
// Luma constants for ARGBToYRow_SSSE3: per-channel weights {13, 65, 33, 0}
// (B, G, R, A) applied with pmaddubsw; the kernel shifts the sums right by 7
// before packing, so the weights carry 7 fractional bits.
fbarchard@google.com228bdc22011-11-15 21:58:26 +000041#ifdef HAS_ARGBTOYROW_SSSE3
 42
 43// Constant multiplication table for converting ARGB to I400.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000044CONST vec8 kARGBToY = {
fbarchard@google.com228bdc22011-11-15 21:58:26 +000045 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 46};
 47
// Bias of +16 applied with paddb to the packed luma result.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000048CONST uvec8 kAddY16 = {
fbarchard@google.com228bdc22011-11-15 21:58:26 +000049 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 50};
51
// pshufb tables that remap source byte order to ARGB in one instruction.
// For the 3-byte formats (RGB24/RAW) the indices >= 12 select leftover high
// bytes of the register; the consuming kernels then OR in a 0xff000000 alpha
// mask, so those picks only matter for the alpha lane they overwrite.
fbarchard@google.comba1f5262012-01-12 19:22:41 +000052// Shuffle table for converting RGB24 to ARGB.
 53CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000054 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 55};
 56
 57// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000058CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000059 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 60};
 61
fbarchard@google.comb6149762011-11-07 21:58:52 +000062// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000063CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000064 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 65};
 66
 67// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000068CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000069 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 70};
71
// Expands 8-bit luma (I400) to ARGB: each Y byte is replicated into B, G and
// R via punpcklbw/punpcklwd, and alpha is forced to 0xff by OR-ing the
// 0xff000000 mask built in xmm5 (pcmpeqb + pslld 24).  Processes 8 pixels
// per iteration; the ja-terminated do-while assumes pix > 0 and a multiple
// of 8, and the movdqa stores assume dst_argb is 16-byte aligned.
 72void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +000073 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +000074 "pcmpeqb %%xmm5,%%xmm5 \n"
 75 "pslld $0x18,%%xmm5 \n"
 76 "1: \n"
 77 "movq (%0),%%xmm0 \n"
 78 "lea 0x8(%0),%0 \n"
 79 "punpcklbw %%xmm0,%%xmm0 \n"
 80 "movdqa %%xmm0,%%xmm1 \n"
 81 "punpcklwd %%xmm0,%%xmm0 \n"
 82 "punpckhwd %%xmm1,%%xmm1 \n"
 83 "por %%xmm5,%%xmm0 \n"
 84 "por %%xmm5,%%xmm1 \n"
 85 "movdqa %%xmm0,(%1) \n"
 86 "movdqa %%xmm1,0x10(%1) \n"
 87 "lea 0x20(%1),%1 \n"
 88 "sub $0x8,%2 \n"
 89 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 : "+r"(src_y), // %0
 91 "+r"(dst_argb), // %1
 92 "+r"(pix) // %2
 93 :
 94 : "memory", "cc"
 95#if defined(__SSE2__)
 96 , "xmm0", "xmm1", "xmm5"
 97#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +000098);
 99}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000100
// Reorders a row of ABGR pixels into ARGB with one pshufb
// (kShuffleMaskABGRToARGB) per 4 pixels.  movdqa load and store require
// src_abgr and dst_argb to be 16-byte aligned; the ja do-while requires
// pix > 0 and a multiple of 4.
 101void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000102 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000103 "movdqa %3,%%xmm5 \n"
 104 "1: \n"
 105 "movdqa (%0),%%xmm0 \n"
 106 "lea 0x10(%0),%0 \n"
 107 "pshufb %%xmm5,%%xmm0 \n"
 108 "movdqa %%xmm0,(%1) \n"
 109 "lea 0x10(%1),%1 \n"
 110 "sub $0x4,%2 \n"
 111 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000112 : "+r"(src_abgr), // %0
 113 "+r"(dst_argb), // %1
 114 "+r"(pix) // %2
 115 : "m"(kShuffleMaskABGRToARGB) // %3
 116 : "memory", "cc"
 117#if defined(__SSE2__)
 118 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000119#endif
 120
fbarchard@google.comb6149762011-11-07 21:58:52 +0000121);
 122}
123
// Reorders a row of BGRA pixels into ARGB with one pshufb
// (kShuffleMaskBGRAToARGB) per 4 pixels.  Same contract as the ABGR variant:
// 16-byte aligned src/dst, pix > 0 and a multiple of 4.
 124void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000125 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000126 "movdqa %3,%%xmm5 \n"
 127 "1: \n"
 128 "movdqa (%0),%%xmm0 \n"
 129 "lea 0x10(%0),%0 \n"
 130 "pshufb %%xmm5,%%xmm0 \n"
 131 "movdqa %%xmm0,(%1) \n"
 132 "lea 0x10(%1),%1 \n"
 133 "sub $0x4,%2 \n"
 134 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000135 : "+r"(src_bgra), // %0
 136 "+r"(dst_argb), // %1
 137 "+r"(pix) // %2
 138 : "m"(kShuffleMaskBGRAToARGB) // %3
 139 : "memory", "cc"
 140#if defined(__SSE2__)
 141 , "xmm0", "xmm5"
 142#endif
 143);
 144}
145
// Converts 16 RGB24 pixels (48 source bytes) to ARGB (64 output bytes) per
// iteration: three unaligned loads are re-sliced into 12-byte pixel groups
// with palignr, expanded to 4 bytes/pixel by the kShuffleMaskRGB24ToARGB
// pshufb, and OR'd with the 0xff000000 alpha mask held in xmm5.  Source may
// be unaligned (movdqu); dst_argb must be 16-byte aligned (movdqa stores);
// pix must be > 0 and a multiple of 16.
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000146void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000147 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000148 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
 149 "pslld $0x18,%%xmm5 \n"
 150 "movdqa %3,%%xmm4 \n"
 151 "1: \n"
 152 "movdqu (%0),%%xmm0 \n"
 153 "movdqu 0x10(%0),%%xmm1 \n"
 154 "movdqu 0x20(%0),%%xmm3 \n"
 155 "lea 0x30(%0),%0 \n"
 156 "movdqa %%xmm3,%%xmm2 \n"
 157 "palignr $0x8,%%xmm1,%%xmm2 \n"
 158 "pshufb %%xmm4,%%xmm2 \n"
 159 "por %%xmm5,%%xmm2 \n"
 160 "palignr $0xc,%%xmm0,%%xmm1 \n"
 161 "pshufb %%xmm4,%%xmm0 \n"
 162 "movdqa %%xmm2,0x20(%1) \n"
 163 "por %%xmm5,%%xmm0 \n"
 164 "pshufb %%xmm4,%%xmm1 \n"
 165 "movdqa %%xmm0,(%1) \n"
 166 "por %%xmm5,%%xmm1 \n"
 167 "palignr $0x4,%%xmm3,%%xmm3 \n"
 168 "pshufb %%xmm4,%%xmm3 \n"
 169 "movdqa %%xmm1,0x10(%1) \n"
 170 "por %%xmm5,%%xmm3 \n"
 171 "movdqa %%xmm3,0x30(%1) \n"
 172 "lea 0x40(%1),%1 \n"
 173 "sub $0x10,%2 \n"
 174 "ja 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000175 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000176 "+r"(dst_argb), // %1
 177 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000178 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000179 : "memory", "cc"
 180#if defined(__SSE2__)
 181 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 182#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000183);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000184}
185
// Identical structure to RGB24ToARGBRow_SSSE3 but uses kShuffleMaskRAWToARGB,
// which also swaps the R/B byte order while expanding RAW (3 bytes/pixel) to
// ARGB.  Unaligned source OK; dst_argb 16-byte aligned; pix > 0 and a
// multiple of 16.
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000186void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000187 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000188 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
 189 "pslld $0x18,%%xmm5 \n"
 190 "movdqa %3,%%xmm4 \n"
 191 "1: \n"
 192 "movdqu (%0),%%xmm0 \n"
 193 "movdqu 0x10(%0),%%xmm1 \n"
 194 "movdqu 0x20(%0),%%xmm3 \n"
 195 "lea 0x30(%0),%0 \n"
 196 "movdqa %%xmm3,%%xmm2 \n"
 197 "palignr $0x8,%%xmm1,%%xmm2 \n"
 198 "pshufb %%xmm4,%%xmm2 \n"
 199 "por %%xmm5,%%xmm2 \n"
 200 "palignr $0xc,%%xmm0,%%xmm1 \n"
 201 "pshufb %%xmm4,%%xmm0 \n"
 202 "movdqa %%xmm2,0x20(%1) \n"
 203 "por %%xmm5,%%xmm0 \n"
 204 "pshufb %%xmm4,%%xmm1 \n"
 205 "movdqa %%xmm0,(%1) \n"
 206 "por %%xmm5,%%xmm1 \n"
 207 "palignr $0x4,%%xmm3,%%xmm3 \n"
 208 "pshufb %%xmm4,%%xmm3 \n"
 209 "movdqa %%xmm1,0x10(%1) \n"
 210 "por %%xmm5,%%xmm3 \n"
 211 "movdqa %%xmm3,0x30(%1) \n"
 212 "lea 0x40(%1),%1 \n"
 213 "sub $0x10,%2 \n"
 214 "ja 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000215 : "+r"(src_raw), // %0
 216 "+r"(dst_argb), // %1
 217 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000218 : "m"(kShuffleMaskRAWToARGB) // %3
 219 : "memory", "cc"
 220#if defined(__SSE2__)
 221 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 222#endif
 223);
 224}
225
// Computes luma for 16 ARGB pixels per iteration: pmaddubsw with kARGBToY
// (weights 13/65/33 for B/G/R), horizontal sums with phaddw, >>7, pack to
// bytes, then +16 bias (kAddY16) via paddb.  All loads/stores are movdqa,
// so src_argb and dst_y must be 16-byte aligned; pix must be > 0 and a
// multiple of 16.
 226void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000227 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000228 "movdqa %4,%%xmm5 \n"
 229 "movdqa %3,%%xmm4 \n"
 230 "1: \n"
 231 "movdqa (%0),%%xmm0 \n"
 232 "movdqa 0x10(%0),%%xmm1 \n"
 233 "movdqa 0x20(%0),%%xmm2 \n"
 234 "movdqa 0x30(%0),%%xmm3 \n"
 235 "pmaddubsw %%xmm4,%%xmm0 \n"
 236 "pmaddubsw %%xmm4,%%xmm1 \n"
 237 "pmaddubsw %%xmm4,%%xmm2 \n"
 238 "pmaddubsw %%xmm4,%%xmm3 \n"
 239 "lea 0x40(%0),%0 \n"
 240 "phaddw %%xmm1,%%xmm0 \n"
 241 "phaddw %%xmm3,%%xmm2 \n"
 242 "psrlw $0x7,%%xmm0 \n"
 243 "psrlw $0x7,%%xmm2 \n"
 244 "packuswb %%xmm2,%%xmm0 \n"
 245 "paddb %%xmm5,%%xmm0 \n"
 246 "movdqa %%xmm0,(%1) \n"
 247 "lea 0x10(%1),%1 \n"
 248 "sub $0x10,%2 \n"
 249 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000250 : "+r"(src_argb), // %0
 251 "+r"(dst_y), // %1
 252 "+r"(pix) // %2
 253 : "m"(kARGBToY), // %3
 254 "m"(kAddY16) // %4
 255 : "memory", "cc"
 256#if defined(__SSE2__)
 257 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 258#endif
 259
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000260);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000261}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000262
// Same computation as ARGBToYRow_SSSE3 but with movdqu loads/stores, so
// neither src_argb nor dst_y needs alignment.  pix must still be > 0 and a
// multiple of 16.
 263void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 264 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000265 "movdqa %4,%%xmm5 \n"
 266 "movdqa %3,%%xmm4 \n"
 267 "1: \n"
 268 "movdqu (%0),%%xmm0 \n"
 269 "movdqu 0x10(%0),%%xmm1 \n"
 270 "movdqu 0x20(%0),%%xmm2 \n"
 271 "movdqu 0x30(%0),%%xmm3 \n"
 272 "pmaddubsw %%xmm4,%%xmm0 \n"
 273 "pmaddubsw %%xmm4,%%xmm1 \n"
 274 "pmaddubsw %%xmm4,%%xmm2 \n"
 275 "pmaddubsw %%xmm4,%%xmm3 \n"
 276 "lea 0x40(%0),%0 \n"
 277 "phaddw %%xmm1,%%xmm0 \n"
 278 "phaddw %%xmm3,%%xmm2 \n"
 279 "psrlw $0x7,%%xmm0 \n"
 280 "psrlw $0x7,%%xmm2 \n"
 281 "packuswb %%xmm2,%%xmm0 \n"
 282 "paddb %%xmm5,%%xmm0 \n"
 283 "movdqu %%xmm0,(%1) \n"
 284 "lea 0x10(%1),%1 \n"
 285 "sub $0x10,%2 \n"
 286 "ja 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000287 : "+r"(src_argb), // %0
 288 "+r"(dst_y), // %1
 289 "+r"(pix) // %2
 290 : "m"(kARGBToY), // %3
 291 "m"(kAddY16) // %4
 292 : "memory", "cc"
 293#if defined(__SSE2__)
 294 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 295#endif
 296
 297);
 298}
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000299#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +0000300
fbarchard@google.comb6149762011-11-07 21:58:52 +0000301#ifdef HAS_ARGBTOUVROW_SSSE3
// Computes one row of U and V from two ARGB rows (src_argb0 and
// src_argb0 + src_stride_argb), 2x2-subsampled: vertically averaged with
// pavgb, horizontally paired via shufps+pavgb, then transformed with
// kARGBToU/kARGBToV and re-centered with kAddUV128.  Writes width/2 bytes to
// each of dst_u and dst_v.  movdqa accesses require 16-byte alignment of
// both rows; width must be > 0 and a multiple of 16.
// NOTE(review): the constants are loaded into xmm3/xmm4/xmm5 by the first
// asm statement and consumed by the second; this relies on the compiler not
// touching those registers between the two statements -- fragile, kept as-is.
 302void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 303 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000304 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000305 "movdqa %0,%%xmm4 \n"
 306 "movdqa %1,%%xmm3 \n"
 307 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000308 :
 309 : "m"(kARGBToU), // %0
 310 "m"(kARGBToV), // %1
 311 "m"(kAddUV128) // %2
 312 :
 313#if defined(__SSE2__)
 314 "xmm3", "xmm4", "xmm5"
 315#endif
 316 );
// "sub %1,%2" turns %2 into (dst_v - dst_u) so one index register addresses
// both planes: movlps writes U at (%1) and movhps writes V at (%1,%2,1).
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000317 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000318 "sub %1,%2 \n"
 319 "1: \n"
 320 "movdqa (%0),%%xmm0 \n"
 321 "movdqa 0x10(%0),%%xmm1 \n"
 322 "movdqa 0x20(%0),%%xmm2 \n"
 323 "movdqa 0x30(%0),%%xmm6 \n"
 324 "pavgb (%0,%4,1),%%xmm0 \n"
 325 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
 326 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
 327 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
 328 "lea 0x40(%0),%0 \n"
 329 "movdqa %%xmm0,%%xmm7 \n"
 330 "shufps $0x88,%%xmm1,%%xmm0 \n"
 331 "shufps $0xdd,%%xmm1,%%xmm7 \n"
 332 "pavgb %%xmm7,%%xmm0 \n"
 333 "movdqa %%xmm2,%%xmm7 \n"
 334 "shufps $0x88,%%xmm6,%%xmm2 \n"
 335 "shufps $0xdd,%%xmm6,%%xmm7 \n"
 336 "pavgb %%xmm7,%%xmm2 \n"
 337 "movdqa %%xmm0,%%xmm1 \n"
 338 "movdqa %%xmm2,%%xmm6 \n"
 339 "pmaddubsw %%xmm4,%%xmm0 \n"
 340 "pmaddubsw %%xmm4,%%xmm2 \n"
 341 "pmaddubsw %%xmm3,%%xmm1 \n"
 342 "pmaddubsw %%xmm3,%%xmm6 \n"
 343 "phaddw %%xmm2,%%xmm0 \n"
 344 "phaddw %%xmm6,%%xmm1 \n"
 345 "psraw $0x8,%%xmm0 \n"
 346 "psraw $0x8,%%xmm1 \n"
 347 "packsswb %%xmm1,%%xmm0 \n"
 348 "paddb %%xmm5,%%xmm0 \n"
 349 "movlps %%xmm0,(%1) \n"
 350 "movhps %%xmm0,(%1,%2,1) \n"
 351 "lea 0x8(%1),%1 \n"
 352 "sub $0x10,%3 \n"
 353 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000354 : "+r"(src_argb0), // %0
 355 "+r"(dst_u), // %1
 356 "+r"(dst_v), // %2
 357 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000358 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000359 : "memory", "cc"
 360#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000361 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000362#endif
 363);
 364}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000365
// Unaligned variant of ARGBToUVRow_SSSE3: movdqu loads staged through xmm7
// replace the aligned pavgb-with-memory-operand forms.  Same subsampling,
// transform, and output contract (width/2 bytes each to dst_u/dst_v; width
// > 0 and a multiple of 16).
// NOTE(review): shares the fragile two-asm-statement register handoff of the
// aligned version (xmm3/xmm4/xmm5 assumed preserved between statements).
 366void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 367 uint8* dst_u, uint8* dst_v, int width) {
 368 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000369 "movdqa %0,%%xmm4 \n"
 370 "movdqa %1,%%xmm3 \n"
 371 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000372 :
 373 : "m"(kARGBToU), // %0
 374 "m"(kARGBToV), // %1
 375 "m"(kAddUV128) // %2
 376 :
 377#if defined(__SSE2__)
 378 "xmm3", "xmm4", "xmm5"
 379#endif
 380 );
// As in the aligned version, %2 becomes (dst_v - dst_u) so U and V are
// written through the same base register.
 381 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000382 "sub %1,%2 \n"
 383 "1: \n"
 384 "movdqu (%0),%%xmm0 \n"
 385 "movdqu 0x10(%0),%%xmm1 \n"
 386 "movdqu 0x20(%0),%%xmm2 \n"
 387 "movdqu 0x30(%0),%%xmm6 \n"
 388 "movdqu (%0,%4,1),%%xmm7 \n"
 389 "pavgb %%xmm7,%%xmm0 \n"
 390 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
 391 "pavgb %%xmm7,%%xmm1 \n"
 392 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
 393 "pavgb %%xmm7,%%xmm2 \n"
 394 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
 395 "pavgb %%xmm7,%%xmm6 \n"
 396 "lea 0x40(%0),%0 \n"
 397 "movdqa %%xmm0,%%xmm7 \n"
 398 "shufps $0x88,%%xmm1,%%xmm0 \n"
 399 "shufps $0xdd,%%xmm1,%%xmm7 \n"
 400 "pavgb %%xmm7,%%xmm0 \n"
 401 "movdqa %%xmm2,%%xmm7 \n"
 402 "shufps $0x88,%%xmm6,%%xmm2 \n"
 403 "shufps $0xdd,%%xmm6,%%xmm7 \n"
 404 "pavgb %%xmm7,%%xmm2 \n"
 405 "movdqa %%xmm0,%%xmm1 \n"
 406 "movdqa %%xmm2,%%xmm6 \n"
 407 "pmaddubsw %%xmm4,%%xmm0 \n"
 408 "pmaddubsw %%xmm4,%%xmm2 \n"
 409 "pmaddubsw %%xmm3,%%xmm1 \n"
 410 "pmaddubsw %%xmm3,%%xmm6 \n"
 411 "phaddw %%xmm2,%%xmm0 \n"
 412 "phaddw %%xmm6,%%xmm1 \n"
 413 "psraw $0x8,%%xmm0 \n"
 414 "psraw $0x8,%%xmm1 \n"
 415 "packsswb %%xmm1,%%xmm0 \n"
 416 "paddb %%xmm5,%%xmm0 \n"
 417 "movlps %%xmm0,(%1) \n"
 418 "movhps %%xmm0,(%1,%2,1) \n"
 419 "lea 0x8(%1),%1 \n"
 420 "sub $0x10,%3 \n"
 421 "ja 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000422 : "+r"(src_argb0), // %0
 423 "+r"(dst_u), // %1
 424 "+r"(dst_v), // %2
 425 "+rm"(width) // %3
 426 : "r"(static_cast<intptr_t>(src_stride_argb))
 427 : "memory", "cc"
 428#if defined(__SSE2__)
 429 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
 430#endif
 431);
 432}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000433#endif
434
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000435#ifdef HAS_I420TOARGBROW_SSSE3
// Fixed-point YUV->RGB coefficients, scaled by 64 (6 fractional bits); they
// feed the pmaddubsw/pmullw stages of YUVTORGB via kYuvConstants below.
// NOTE(review): UB is 127 but its comment reads min(63, ...) which would be
// 63 -- the comment and the clamp disagree; confirm the intended clamp.
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000436#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
 437#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
 438#define UR 0
 439
 440#define VB 0
 441#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
 442#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
 443
// Bias terms subtracted (psubw) to cancel the +128 offset of U/V samples.
// NOTE(review): macro bodies are unparenthesized (UB * 128 + VB * 128) --
// safe for the current brace-initializer uses, but fragile elsewhere.
 444// Bias
 445#define BB UB * 128 + VB * 128
 446#define BG UG * 128 + VG * 128
 447#define BR UR * 128 + VR * 128
 448
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000449#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000450
// OMITFP frees the frame pointer for the register-hungry YUV asm on 32-bit
// GCC builds; it expands to nothing on Apple and x86_64 targets.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000451#if defined(__APPLE__) || defined(__x86_64__)
 452#define OMITFP
 453#else
 454#define OMITFP __attribute__((optimize("omit-frame-pointer")))
 455#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000456
// All YUV->RGB constants aggregated into one aligned struct so the asm can
// address them at fixed byte offsets from a single base pointer (%5 in
// YUVTORGB): 0 = kUVToB, 16 = kUVToG, 32 = kUVToR, 48/64/80 = B/G/R biases,
// 96 = kYSub16, 112 = kYToRgb.  Field order must match those offsets.
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000457struct {
 458 vec8 kUVToB;
 459 vec8 kUVToG;
 460 vec8 kUVToR;
 461 vec16 kUVBiasB;
 462 vec16 kUVBiasG;
 463 vec16 kUVBiasR;
 464 vec16 kYSub16;
 465 vec16 kYToRgb;
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000466} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000467 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
 468 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
 469 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
 470 { BB, BB, BB, BB, BB, BB, BB, BB },
 471 { BG, BG, BG, BG, BG, BG, BG, BG },
 472 { BR, BR, BR, BR, BR, BR, BR, BR },
 473 { 16, 16, 16, 16, 16, 16, 16, 16 },
 474 { YG, YG, YG, YG, YG, YG, YG, YG }
 475};
476
// Shared asm fragment: converts 8 pixels (4 U/V pairs + 8 Y) to packed
// B/G/R bytes left in xmm0/xmm1/xmm2.  Register contract set up by callers:
// %0 = y_buf, %1 = u_buf, %2 = (v_buf - u_buf) after the caller's
// "sub %1,%2", %5 = &kYuvConstants.kUVToB (fixed offsets, see struct above),
// and xmm4 must be zero on entry (caller's pxor) for the punpcklbw widening.
// Chroma is doubled with punpcklwd to cover 2 pixels per U/V sample.
 477// Convert 8 pixels
fbarchard@google.comb6149762011-11-07 21:58:52 +0000478#define YUVTORGB \
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000479 "movd (%1),%%xmm0 \n" \
 480 "movd (%1,%2,1),%%xmm1 \n" \
 481 "lea 0x4(%1),%1 \n" \
 482 "punpcklbw %%xmm1,%%xmm0 \n" \
 483 "punpcklwd %%xmm0,%%xmm0 \n" \
 484 "movdqa %%xmm0,%%xmm1 \n" \
 485 "movdqa %%xmm0,%%xmm2 \n" \
 486 "pmaddubsw (%5),%%xmm0 \n" \
 487 "pmaddubsw 16(%5),%%xmm1 \n" \
 488 "pmaddubsw 32(%5),%%xmm2 \n" \
 489 "psubw 48(%5),%%xmm0 \n" \
 490 "psubw 64(%5),%%xmm1 \n" \
 491 "psubw 80(%5),%%xmm2 \n" \
 492 "movq (%0),%%xmm3 \n" \
 493 "lea 0x8(%0),%0 \n" \
 494 "punpcklbw %%xmm4,%%xmm3 \n" \
 495 "psubsw 96(%5),%%xmm3 \n" \
 496 "pmullw 112(%5),%%xmm3 \n" \
 497 "paddsw %%xmm3,%%xmm0 \n" \
 498 "paddsw %%xmm3,%%xmm1 \n" \
 499 "paddsw %%xmm3,%%xmm2 \n" \
 500 "psraw $0x6,%%xmm0 \n" \
 501 "psraw $0x6,%%xmm1 \n" \
 502 "psraw $0x6,%%xmm2 \n" \
 503 "packuswb %%xmm0,%%xmm0 \n" \
 504 "packuswb %%xmm1,%%xmm1 \n" \
 505 "packuswb %%xmm2,%%xmm2 \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000506
// Converts one row of I420 (2x subsampled chroma) to ARGB, 8 pixels per
// iteration via YUVTORGB, then interleaves B/G/R with an all-ones alpha
// (xmm5 from pcmpeqb).  rgb_buf must be 16-byte aligned (movdqa stores);
// width must be > 0 and a multiple of 8.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000507void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, // rdi
 508 const uint8* u_buf, // rsi
 509 const uint8* v_buf, // rdx
fbarchard@google.comb6149762011-11-07 21:58:52 +0000510 uint8* rgb_buf, // rcx
 511 int width) { // r8
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000512 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000513 "sub %1,%2 \n"
 514 "pcmpeqb %%xmm5,%%xmm5 \n"
 515 "pxor %%xmm4,%%xmm4 \n"
 516
 517 "1: \n"
 518 YUVTORGB
 519 "punpcklbw %%xmm1,%%xmm0 \n"
 520 "punpcklbw %%xmm5,%%xmm2 \n"
 521 "movdqa %%xmm0,%%xmm1 \n"
 522 "punpcklwd %%xmm2,%%xmm0 \n"
 523 "punpckhwd %%xmm2,%%xmm1 \n"
 524 "movdqa %%xmm0,(%3) \n"
 525 "movdqa %%xmm1,0x10(%3) \n"
 526 "lea 0x20(%3),%3 \n"
 527 "sub $0x8,%4 \n"
 528 "ja 1b \n"
 529 : "+r"(y_buf), // %0
 530 "+r"(u_buf), // %1
 531 "+r"(v_buf), // %2
 532 "+r"(rgb_buf), // %3
 533 "+rm"(width) // %4
 534 : "r"(&kYuvConstants.kUVToB) // %5
 535 : "memory", "cc"
 536#if defined(__SSE2__)
 537 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 538#endif
 539 );
 540}
541
// Same conversion as I420ToARGBRow_SSSE3 but interleaves the channels in
// BGRA byte order; xmm5 is re-filled with 0xff after YUVTORGB (which uses
// xmm0-3) before serving as the alpha lane.  rgb_buf 16-byte aligned;
// width > 0 and a multiple of 8.
 542void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, // rdi
 543 const uint8* u_buf, // rsi
 544 const uint8* v_buf, // rdx
 545 uint8* rgb_buf, // rcx
 546 int width) { // r8
 547 asm volatile (
 548 "sub %1,%2 \n"
 549 "pcmpeqb %%xmm5,%%xmm5 \n"
 550 "pxor %%xmm4,%%xmm4 \n"
 551
 552 "1: \n"
 553 YUVTORGB
 554 "pcmpeqb %%xmm5,%%xmm5 \n"
 555 "punpcklbw %%xmm0,%%xmm1 \n"
 556 "punpcklbw %%xmm2,%%xmm5 \n"
 557 "movdqa %%xmm5,%%xmm0 \n"
 558 "punpcklwd %%xmm1,%%xmm5 \n"
 559 "punpckhwd %%xmm1,%%xmm0 \n"
 560 "movdqa %%xmm5,(%3) \n"
 561 "movdqa %%xmm0,0x10(%3) \n"
 562 "lea 0x20(%3),%3 \n"
 563 "sub $0x8,%4 \n"
 564 "ja 1b \n"
 565 : "+r"(y_buf), // %0
 566 "+r"(u_buf), // %1
 567 "+r"(v_buf), // %2
 568 "+r"(rgb_buf), // %3
 569 "+rm"(width) // %4
 570 : "r"(&kYuvConstants.kUVToB) // %5
 571 : "memory", "cc"
 572#if defined(__SSE2__)
 573 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 574#endif
 575 );
 576}
577
// Same conversion as I420ToARGBRow_SSSE3 but interleaves the channels in
// ABGR byte order (R/B swapped in the punpck sequence).  rgb_buf 16-byte
// aligned; width > 0 and a multiple of 8.
 578void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, // rdi
 579 const uint8* u_buf, // rsi
 580 const uint8* v_buf, // rdx
 581 uint8* rgb_buf, // rcx
 582 int width) { // r8
 583 asm volatile (
 584 "sub %1,%2 \n"
 585 "pcmpeqb %%xmm5,%%xmm5 \n"
 586 "pxor %%xmm4,%%xmm4 \n"
 587
 588 "1: \n"
 589 YUVTORGB
 590 "punpcklbw %%xmm1,%%xmm2 \n"
 591 "punpcklbw %%xmm5,%%xmm0 \n"
 592 "movdqa %%xmm2,%%xmm1 \n"
 593 "punpcklwd %%xmm0,%%xmm2 \n"
 594 "punpckhwd %%xmm0,%%xmm1 \n"
 595 "movdqa %%xmm2,(%3) \n"
 596 "movdqa %%xmm1,0x10(%3) \n"
 597 "lea 0x20(%3),%3 \n"
 598 "sub $0x8,%4 \n"
 599 "ja 1b \n"
 600 : "+r"(y_buf), // %0
 601 "+r"(u_buf), // %1
 602 "+r"(v_buf), // %2
 603 "+r"(rgb_buf), // %3
 604 "+rm"(width) // %4
 605 : "r"(&kYuvConstants.kUVToB) // %5
 606 : "memory", "cc"
 607#if defined(__SSE2__)
 608 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 609#endif
 610 );
 611}
612
// I444 (full-resolution chroma) to ARGB: inlines the YUVTORGB math but
// without the punpcklwd chroma doubling, so each U/V byte maps to exactly
// one pixel -- 4 pixels per iteration (movd loads of 4 Y, 4 U, 4 V).
// rgb_buf must be 16-byte aligned; width > 0 and a multiple of 4.
 613void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
 614 const uint8* u_buf, // rsi
 615 const uint8* v_buf, // rdx
 616 uint8* rgb_buf, // rcx
 617 int width) { // r8
 618 asm volatile (
 619 "sub %1,%2 \n"
 620 "pcmpeqb %%xmm5,%%xmm5 \n"
 621 "pxor %%xmm4,%%xmm4 \n"
 622
 623 "1: \n"
 624 "movd (%1),%%xmm0 \n"
 625 "movd (%1,%2,1),%%xmm1 \n"
 626 "lea 0x4(%1),%1 \n"
 627 "punpcklbw %%xmm1,%%xmm0 \n"
 628 "movdqa %%xmm0,%%xmm1 \n"
 629 "movdqa %%xmm0,%%xmm2 \n"
 630 "pmaddubsw (%5),%%xmm0 \n"
 631 "pmaddubsw 16(%5),%%xmm1 \n"
 632 "pmaddubsw 32(%5),%%xmm2 \n"
 633 "psubw 48(%5),%%xmm0 \n"
 634 "psubw 64(%5),%%xmm1 \n"
 635 "psubw 80(%5),%%xmm2 \n"
 636 "movd (%0),%%xmm3 \n"
 637 "lea 0x4(%0),%0 \n"
 638 "punpcklbw %%xmm4,%%xmm3 \n"
 639 "psubsw 96(%5),%%xmm3 \n"
 640 "pmullw 112(%5),%%xmm3 \n"
 641 "paddsw %%xmm3,%%xmm0 \n"
 642 "paddsw %%xmm3,%%xmm1 \n"
 643 "paddsw %%xmm3,%%xmm2 \n"
 644 "psraw $0x6,%%xmm0 \n"
 645 "psraw $0x6,%%xmm1 \n"
 646 "psraw $0x6,%%xmm2 \n"
 647 "packuswb %%xmm0,%%xmm0 \n"
 648 "packuswb %%xmm1,%%xmm1 \n"
 649 "packuswb %%xmm2,%%xmm2 \n"
 650 "punpcklbw %%xmm1,%%xmm0 \n"
 651 "punpcklbw %%xmm5,%%xmm2 \n"
 652 "punpcklwd %%xmm2,%%xmm0 \n"
 653 "movdqa %%xmm0,(%3) \n"
 654 "lea 0x10(%3),%3 \n"
 655 "sub $0x4,%4 \n"
 656 "ja 1b \n"
 657 : "+r"(y_buf), // %0
 658 "+r"(u_buf), // %1
 659 "+r"(v_buf), // %2
 660 "+r"(rgb_buf), // %3
 661 "+rm"(width) // %4
 662 : "r"(&kYuvConstants.kUVToB) // %5
 663 : "memory", "cc"
 664#if defined(__SSE2__)
 665 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 666#endif
 667 );
 668}
669#endif
670
671#ifdef HAS_YTOARGBROW_SSE2
672
// Luma-only to ARGB: computes (y - 16) * 1.164 in fixed point -- Y is
// duplicated into both bytes of each 16-bit lane (punpcklbw x,x), 0x1000
// (16 << 8) is subtracted with unsigned saturation, and pmulhuw by 0x012a
// (298 ~= 1.164 * 256) recovers the scaled value -- then the gray byte is
// replicated to B/G/R with alpha forced via the 0xff000000 mask in xmm4.
// 8 pixels per iteration; rgb_buf must be 16-byte aligned; width > 0 and a
// multiple of 8.  Clobbers eax for the immediate broadcasts.
 673void YToARGBRow_SSE2(const uint8* y_buf, // rdi
 674 uint8* rgb_buf, // rcx
 675 int width) { // r8
 676 asm volatile (
 677 "pcmpeqb %%xmm4,%%xmm4 \n"
 678 "pslld $0x18,%%xmm4 \n"
 679 "mov $0x10001000,%%eax \n"
 680 "movd %%eax,%%xmm3 \n"
 681 "pshufd $0x0,%%xmm3,%%xmm3 \n"
 682 "mov $0x012a012a,%%eax \n"
 683 "movd %%eax,%%xmm2 \n"
 684 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000685
 686 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000687 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000688 "movq (%0),%%xmm0 \n"
 689 "lea 0x8(%0),%0 \n"
 690 "punpcklbw %%xmm0,%%xmm0 \n"
 691 "psubusw %%xmm3,%%xmm0 \n"
 692 "pmulhuw %%xmm2,%%xmm0 \n"
 693 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000694
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000695 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000696 "punpcklbw %%xmm0,%%xmm0 \n"
 697 "movdqa %%xmm0,%%xmm1 \n"
 698 "punpcklwd %%xmm0,%%xmm0 \n"
 699 "punpckhwd %%xmm1,%%xmm1 \n"
 700 "por %%xmm4,%%xmm0 \n"
 701 "por %%xmm4,%%xmm1 \n"
 702 "movdqa %%xmm0,(%1) \n"
 703 "movdqa %%xmm1,16(%1) \n"
 704 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000705
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000706 "sub $0x8,%2 \n"
 707 "ja 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000708 : "+r"(y_buf), // %0
 709 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +0000710 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000711 :
 712 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000713#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000714 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000715#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000716 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000717}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000718#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000719
frkoenig@google.come5185422011-11-07 23:07:57 +0000720#ifdef HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000721void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
722 SIMD_ALIGNED(uint8 row[kMaxStride]);
723 ABGRToARGBRow_SSSE3(src_argb, row, pix);
724 ARGBToYRow_SSSE3(row, dst_y, pix);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000725}
726
fbarchard@google.comb6149762011-11-07 21:58:52 +0000727void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
728 SIMD_ALIGNED(uint8 row[kMaxStride]);
729 BGRAToARGBRow_SSSE3(src_argb, row, pix);
730 ARGBToYRow_SSSE3(row, dst_y, pix);
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000731}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000732
733void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
734 SIMD_ALIGNED(uint8 row[kMaxStride]);
735 ABGRToARGBRow_C(src_argb, row, pix);
736 ARGBToYRow_SSSE3(row, dst_y, pix);
737}
738
739void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
740 SIMD_ALIGNED(uint8 row[kMaxStride]);
741 BGRAToARGBRow_C(src_argb, row, pix);
742 ARGBToYRow_SSSE3(row, dst_y, pix);
743}
frkoenig@google.come5185422011-11-07 23:07:57 +0000744#endif
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000745
fbarchard@google.comb6149762011-11-07 21:58:52 +0000746#ifdef HAS_ARGBTOUVROW_SSSE3
747void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
748 uint8* dst_u, uint8* dst_v, int pix) {
749 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
750 ABGRToARGBRow_SSSE3(src_argb, row, pix);
751 ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
752 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000753}
754
fbarchard@google.comb6149762011-11-07 21:58:52 +0000755void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
756 uint8* dst_u, uint8* dst_v, int pix) {
757 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
758 BGRAToARGBRow_SSSE3(src_argb, row, pix);
759 BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
760 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000761}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000762
763void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
764 uint8* dst_u, uint8* dst_v, int pix) {
765 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
766 ABGRToARGBRow_C(src_argb, row, pix);
767 ABGRToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
768 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
769}
770
771void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
772 uint8* dst_u, uint8* dst_v, int pix) {
773 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
774 BGRAToARGBRow_C(src_argb, row, pix);
775 BGRAToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
776 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
777}
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000778#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000779
fbarchard@google.com42831e02012-01-21 02:54:17 +0000780#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +0000781
// pshufb control that reverses all 16 bytes of a register (index 15 first),
// used by MirrorRow_SSSE3.
 782// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +0000783CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +0000784 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 785};
786
// Copies width bytes from src to dst in reverse order: src is rebased to
// src - 16 so that indexing by the remaining count walks the source from its
// end, and each 16-byte chunk is byte-reversed with pshufb (kShuffleMirror).
// movdqa requires src + width and dst to be 16-byte aligned; width must be
// > 0 and a multiple of 16 (ja loop).
fbarchard@google.com42831e02012-01-21 02:54:17 +0000787void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +0000788 intptr_t temp_width = static_cast<intptr_t>(width);
 789 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000790 "movdqa %3,%%xmm5 \n"
 791 "lea -0x10(%0),%0 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000792 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000793 "movdqa (%0,%2),%%xmm0 \n"
 794 "pshufb %%xmm5,%%xmm0 \n"
 795 "sub $0x10,%2 \n"
 796 "movdqa %%xmm0,(%1) \n"
 797 "lea 0x10(%1),%1 \n"
 798 "ja 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +0000799 : "+r"(src), // %0
 800 "+r"(dst), // %1
 801 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +0000802 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +0000803 : "memory", "cc"
 804#if defined(__SSE2__)
 805 , "xmm0", "xmm5"
 806#endif
 807 );
 808}
809#endif
810
fbarchard@google.com42831e02012-01-21 02:54:17 +0000811#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000812
// SSE2 fallback mirror (no pshufb): reverses each 16-byte chunk by swapping
// bytes within words (psllw/psrlw/por), then reversing words within each
// qword (pshuflw/pshufhw $0x1b) and swapping the qwords (pshufd $0x4e).
// Uses movdqu throughout, so no alignment requirement; width must be > 0
// and a multiple of 16.
fbarchard@google.com42831e02012-01-21 02:54:17 +0000813void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000814 intptr_t temp_width = static_cast<intptr_t>(width);
 815 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000816 "lea -0x10(%0),%0 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000817 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000818 "movdqu (%0,%2),%%xmm0 \n"
 819 "movdqu %%xmm0,%%xmm1 \n"
 820 "psllw $0x8,%%xmm0 \n"
 821 "psrlw $0x8,%%xmm1 \n"
 822 "por %%xmm1,%%xmm0 \n"
 823 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
 824 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
 825 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
 826 "sub $0x10,%2 \n"
 827 "movdqu %%xmm0,(%1) \n"
 828 "lea 0x10(%1),%1 \n"
 829 "ja 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000830 : "+r"(src), // %0
 831 "+r"(dst), // %1
 832 "+r"(temp_width) // %2
 833 :
 834 : "memory", "cc"
 835#if defined(__SSE2__)
 836 , "xmm0", "xmm1"
 837#endif
 838 );
 839}
840#endif
841
fbarchard@google.comb95dbf22012-02-11 01:18:30 +0000842#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the luma plane from YUY2 (Y0 U Y1 V ...): masks the even bytes
// with 0x00ff (xmm5) and packs 32 source bytes to 16 Y bytes per iteration.
// movdqa requires src_yuy2 and dst_y to be 16-byte aligned; pix must be > 0
// and a multiple of 16.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000843void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
 844 asm volatile (
 845 "pcmpeqb %%xmm5,%%xmm5 \n"
 846 "psrlw $0x8,%%xmm5 \n"
 847 "1: \n"
 848 "movdqa (%0),%%xmm0 \n"
 849 "movdqa 0x10(%0),%%xmm1 \n"
 850 "lea 0x20(%0),%0 \n"
 851 "pand %%xmm5,%%xmm0 \n"
 852 "pand %%xmm5,%%xmm1 \n"
 853 "packuswb %%xmm1,%%xmm0 \n"
 854 "movdqa %%xmm0,(%1) \n"
 855 "lea 0x10(%1),%1 \n"
 856 "sub $0x10,%2 \n"
 857 "ja 1b \n"
 858 : "+r"(src_yuy2), // %0
 859 "+r"(dst_y), // %1
 860 "+r"(pix) // %2
 861 :
 862 : "memory", "cc"
 863#if defined(__SSE2__)
 864 , "xmm0", "xmm1", "xmm5"
 865#endif
 866 );
 867}
868
// Extracts subsampled U and V from two YUY2 rows: averages the rows with
// pavgb, shifts out the luma bytes (psrlw 8) leaving interleaved U/V, then
// splits them -- even chroma bytes (U) to (%1) and odd chroma bytes (V) to
// (%1,%2), where %2 was rewritten to the inter-plane offset by "sub %1,%2".
// Writes pix/2 bytes per plane; movdqa requires both rows 16-byte aligned;
// pix must be > 0 and a multiple of 16.
// NOTE(review): the fourth parameter is named dst_y but receives the V
// samples (movq %%xmm1,(%1,%2)) -- presumably it should be named dst_v.
 869void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
 870 uint8* dst_u, uint8* dst_y, int pix) {
 871 asm volatile (
 872 "pcmpeqb %%xmm5,%%xmm5 \n"
 873 "psrlw $0x8,%%xmm5 \n"
 874 "sub %1,%2 \n"
 875 "1: \n"
 876 "movdqa (%0),%%xmm0 \n"
 877 "movdqa 0x10(%0),%%xmm1 \n"
 878 "movdqa (%0,%4,1),%%xmm2 \n"
 879 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
 880 "lea 0x20(%0),%0 \n"
 881 "pavgb %%xmm2,%%xmm0 \n"
 882 "pavgb %%xmm3,%%xmm1 \n"
 883 "psrlw $0x8,%%xmm0 \n"
 884 "psrlw $0x8,%%xmm1 \n"
 885 "packuswb %%xmm1,%%xmm0 \n"
 886 "movdqa %%xmm0,%%xmm1 \n"
 887 "pand %%xmm5,%%xmm0 \n"
 888 "packuswb %%xmm0,%%xmm0 \n"
 889 "psrlw $0x8,%%xmm1 \n"
 890 "packuswb %%xmm1,%%xmm1 \n"
 891 "movq %%xmm0,(%1) \n"
 892 "movq %%xmm1,(%1,%2) \n"
 893 "lea 0x8(%1),%1 \n"
 894 "sub $0x10,%3 \n"
 895 "ja 1b \n"
 896 : "+r"(src_yuy2), // %0
 897 "+r"(dst_u), // %1
 898 "+r"(dst_y), // %2
 899 "+r"(pix) // %3
 900 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
 901 : "memory", "cc"
 902#if defined(__SSE2__)
 903 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
 904#endif
 905 );
 906}
907
// Unaligned (movdqu) variant of YUY2ToYRow_SSE2: extracts the Y plane
// from a YUY2 row without requiring 16-byte alignment of src or dst.
// Same word-mask + packuswb scheme; 16 pixels per iteration.
// Assumes pix is a positive multiple of 16 -- TODO confirm at call sites.
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones
    "psrlw $0x8,%%xmm5 \n"      // xmm5 = 0x00ff mask in every word
  "1: \n"
    "movdqu (%0),%%xmm0 \n"      // unaligned load, 8 pixels
    "movdqu 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"      // keep even bytes = Y
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"  // 16 Y bytes
    "movdqu %%xmm0,(%1) \n"      // unaligned store
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
934
// Unaligned (movdqu) variant of YUY2ToUVRow_SSE2: averages two YUY2 rows
// and deinterleaves their chroma into U and V planes, with no alignment
// requirement on the source pointer.
// NOTE(review): as in the aligned version, dst_y actually receives the
// V plane (see the sub %1,%2 / movq (%1,%2) pairing); later naming is dst_v.
// Assumes pix is a positive multiple of 16 -- TODO confirm at call sites.
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_y,
                                int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones
    "psrlw $0x8,%%xmm5 \n"      // xmm5 = 0x00ff mask in every word
    "sub %1,%2 \n"              // %2 = dst_v - dst_u; one lea advances both
  "1: \n"
    "movdqu (%0),%%xmm0 \n"           // row 0 (unaligned)
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu (%0,%4,1),%%xmm2 \n"      // row 1 (src + stride)
    "movdqu 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"          // average the two rows
    "pavgb %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"            // keep odd bytes = interleaved U,V
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"       // xmm0 = U0 V0 U1 V1 ...
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"           // even bytes = U
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"            // odd bytes = V
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"             // store 8 U
    "movq %%xmm1,(%1,%2) \n"          // store 8 V
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"
    "ja 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_y),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
975
// Extracts the luma (Y) plane from one row of packed UYVY pixels.
// UYVY stores Y in the odd bytes of each 16-bit word, so psrlw $8 shifts
// each Y sample into the low byte; packuswb compacts two registers into
// 16 contiguous Y bytes per iteration.
// Uses movdqa: src_uyvy and dst_y must be 16-byte aligned.
// Assumes pix is a positive multiple of 16 -- TODO confirm at call sites.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
  "1: \n"
    "movdqa (%0),%%xmm0 \n"      // load 8 UYVY pixels (16 bytes)
    "movdqa 0x10(%0),%%xmm1 \n"  // load next 8 pixels
    "lea 0x20(%0),%0 \n"
    "psrlw $0x8,%%xmm0 \n"       // shift odd (Y) bytes into low bytes
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"  // 16 Y bytes
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"            // consumed 16 pixels
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
999
// Produces one row of U and one row of V from two adjacent UYVY rows:
// pixels are averaged with the row at src_uyvy + stride_uyvy (pavgb),
// then chroma -- the even bytes in UYVY -- is masked out with pand and
// deinterleaved into separate U and V planes.
// NOTE(review): despite its name, dst_y here receives the V plane --
// "sub %1,%2" makes %2 the offset (dst_y - dst_u) and the V bytes are
// stored at (%1,%2); later libyuv calls this dst_v.
// Uses movdqa: all pointers must be 16-byte aligned.
// Assumes pix is a positive multiple of 16 -- TODO confirm at call sites.
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones
    "psrlw $0x8,%%xmm5 \n"      // xmm5 = 0x00ff mask in every word
    "sub %1,%2 \n"              // %2 = dst_v - dst_u; one lea advances both
  "1: \n"
    "movdqa (%0),%%xmm0 \n"           // row 0: 8 pixels
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%4,1),%%xmm2 \n"      // row 1 (src + stride)
    "movdqa 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"          // average the two rows
    "pavgb %%xmm3,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"           // keep even bytes = interleaved U,V
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"       // xmm0 = U0 V0 U1 V1 ...
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"           // even bytes = U
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"            // odd bytes = V
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"             // store 8 U
    "movq %%xmm1,(%1,%2) \n"          // store 8 V
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"                 // consumed 16 pixels
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_y),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
1038
// Unaligned (movdqu) variant of UYVYToYRow_SSE2: extracts the Y plane
// (odd bytes of UYVY) without requiring 16-byte alignment of src or dst.
// Assumes pix is a positive multiple of 16 -- TODO confirm at call sites.
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
  "1: \n"
    "movdqu (%0),%%xmm0 \n"      // unaligned load, 8 pixels
    "movdqu 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "psrlw $0x8,%%xmm0 \n"       // shift odd (Y) bytes into low bytes
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"  // 16 Y bytes
    "movdqu %%xmm0,(%1) \n"      // unaligned store
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
1063
// Unaligned (movdqu) variant of UYVYToUVRow_SSE2: averages two UYVY rows
// and deinterleaves their chroma (even bytes) into U and V planes, with
// no alignment requirement on the source pointer.
// NOTE(review): as in the aligned version, dst_y actually receives the
// V plane (see the sub %1,%2 / movq (%1,%2) pairing); later naming is dst_v.
// Assumes pix is a positive multiple of 16 -- TODO confirm at call sites.
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones
    "psrlw $0x8,%%xmm5 \n"      // xmm5 = 0x00ff mask in every word
    "sub %1,%2 \n"              // %2 = dst_v - dst_u; one lea advances both
  "1: \n"
    "movdqu (%0),%%xmm0 \n"           // row 0 (unaligned)
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu (%0,%4,1),%%xmm2 \n"      // row 1 (src + stride)
    "movdqu 0x10(%0,%4,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"          // average the two rows
    "pavgb %%xmm3,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"           // keep even bytes = interleaved U,V
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"       // xmm0 = U0 V0 U1 V1 ...
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"           // even bytes = U
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"            // odd bytes = V
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0,(%1) \n"             // store 8 U
    "movq %%xmm1,(%1,%2) \n"          // store 8 V
    "lea 0x8(%1),%1 \n"
    "sub $0x10,%3 \n"                 // consumed 16 pixels
    "ja 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_y),     // %2
    "+r"(pix)        // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
1102
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001103#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001104
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00001105#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001106} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00001107} // namespace libyuv
1108#endif