blob: b6e9bf9e09f560f9375081fffab48b3935167568 [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "row.h"
12
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
// Storage qualifier applied to the SIMD constant tables in this file.
// Everywhere except Apple targets the tables are file-local constants; on
// Apple the qualifier is empty, so the tables become ordinary globals.
// NOTE(review): presumably an Apple toolchain workaround for referencing the
// tables from inline asm "m" operands - confirm before changing.
#if !defined(__APPLE__)
#define CONST static const
#else
#define CONST
#endif
25
fbarchard@google.comb6149762011-11-07 21:58:52 +000026#ifdef HAS_ARGBTOUVROW_SSSE3
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000027CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000028 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
29};
30
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000031CONST uvec8 kARGBToV = {
fbarchard@google.com2430e042011-11-11 21:57:06 +000032 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
fbarchard@google.comb6149762011-11-07 21:58:52 +000033};
fbarchard@google.com2430e042011-11-11 21:57:06 +000034
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000035CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000036 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
37 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
38};
39#endif
40
fbarchard@google.com228bdc22011-11-15 21:58:26 +000041#ifdef HAS_ARGBTOYROW_SSSE3
42
43// Constant multiplication table for converting ARGB to I400.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000044CONST vec8 kARGBToY = {
fbarchard@google.com228bdc22011-11-15 21:58:26 +000045 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
46};
47
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000048CONST uvec8 kAddY16 = {
fbarchard@google.com228bdc22011-11-15 21:58:26 +000049 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
50};
51
fbarchard@google.com9394ed92011-10-31 21:36:47 +000052// Shuffle table for converting BG24 to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000053CONST uvec8 kShuffleMaskBG24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000054 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
55};
56
57// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000058CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000059 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
60};
61
fbarchard@google.comb6149762011-11-07 21:58:52 +000062// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000063CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000064 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
65};
66
67// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000068CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000069 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
70};
71
72void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +000073 asm volatile (
74 "pcmpeqb %%xmm5,%%xmm5 \n"
75 "pslld $0x18,%%xmm5 \n"
76"1: \n"
77 "movq (%0),%%xmm0 \n"
78 "lea 0x8(%0),%0 \n"
79 "punpcklbw %%xmm0,%%xmm0 \n"
80 "movdqa %%xmm0,%%xmm1 \n"
81 "punpcklwd %%xmm0,%%xmm0 \n"
82 "punpckhwd %%xmm1,%%xmm1 \n"
83 "por %%xmm5,%%xmm0 \n"
84 "por %%xmm5,%%xmm1 \n"
85 "movdqa %%xmm0,(%1) \n"
86 "movdqa %%xmm1,0x10(%1) \n"
87 "lea 0x20(%1),%1 \n"
88 "sub $0x8,%2 \n"
89 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 : "+r"(src_y), // %0
91 "+r"(dst_argb), // %1
92 "+r"(pix) // %2
93 :
94 : "memory", "cc"
95#if defined(__SSE2__)
96 , "xmm0", "xmm1", "xmm5"
97#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +000098);
99}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000100
101void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000102 asm volatile (
103 "movdqa %3,%%xmm5 \n"
104"1: \n"
105 "movdqa (%0),%%xmm0 \n"
106 "lea 0x10(%0),%0 \n"
107 "pshufb %%xmm5,%%xmm0 \n"
108 "movdqa %%xmm0,(%1) \n"
109 "lea 0x10(%1),%1 \n"
110 "sub $0x4,%2 \n"
111 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000112 : "+r"(src_abgr), // %0
113 "+r"(dst_argb), // %1
114 "+r"(pix) // %2
115 : "m"(kShuffleMaskABGRToARGB) // %3
116 : "memory", "cc"
117#if defined(__SSE2__)
118 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000119#endif
120
fbarchard@google.comb6149762011-11-07 21:58:52 +0000121);
122}
123
124void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000125 asm volatile (
126 "movdqa %3,%%xmm5 \n"
127"1: \n"
128 "movdqa (%0),%%xmm0 \n"
129 "lea 0x10(%0),%0 \n"
130 "pshufb %%xmm5,%%xmm0 \n"
131 "movdqa %%xmm0,(%1) \n"
132 "lea 0x10(%1),%1 \n"
133 "sub $0x4,%2 \n"
134 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000135 : "+r"(src_bgra), // %0
136 "+r"(dst_argb), // %1
137 "+r"(pix) // %2
138 : "m"(kShuffleMaskBGRAToARGB) // %3
139 : "memory", "cc"
140#if defined(__SSE2__)
141 , "xmm0", "xmm5"
142#endif
143);
144}
145
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000146void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000147 asm volatile (
148 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
149 "pslld $0x18,%%xmm5 \n"
150 "movdqa %3,%%xmm4 \n"
151"1: \n"
152 "movdqa (%0),%%xmm0 \n"
153 "movdqa 0x10(%0),%%xmm1 \n"
154 "movdqa 0x20(%0),%%xmm3 \n"
155 "lea 0x30(%0),%0 \n"
156 "movdqa %%xmm3,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000157 "palignr $0x8,%%xmm1,%%xmm2 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000158 "pshufb %%xmm4,%%xmm2 \n"
159 "por %%xmm5,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000160 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000161 "pshufb %%xmm4,%%xmm0 \n"
162 "movdqa %%xmm2,0x20(%1) \n"
163 "por %%xmm5,%%xmm0 \n"
164 "pshufb %%xmm4,%%xmm1 \n"
165 "movdqa %%xmm0,(%1) \n"
166 "por %%xmm5,%%xmm1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000167 "palignr $0x4,%%xmm3,%%xmm3 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000168 "pshufb %%xmm4,%%xmm3 \n"
169 "movdqa %%xmm1,0x10(%1) \n"
170 "por %%xmm5,%%xmm3 \n"
171 "movdqa %%xmm3,0x30(%1) \n"
172 "lea 0x40(%1),%1 \n"
173 "sub $0x10,%2 \n"
174 "ja 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000175 : "+r"(src_bg24), // %0
176 "+r"(dst_argb), // %1
177 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000178 : "m"(kShuffleMaskBG24ToARGB) // %3
179 : "memory", "cc"
180#if defined(__SSE2__)
181 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
182#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000183);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000184}
185
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000186void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000187 asm volatile (
188 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
189 "pslld $0x18,%%xmm5 \n"
190 "movdqa %3,%%xmm4 \n"
191"1: \n"
192 "movdqa (%0),%%xmm0 \n"
193 "movdqa 0x10(%0),%%xmm1 \n"
194 "movdqa 0x20(%0),%%xmm3 \n"
195 "lea 0x30(%0),%0 \n"
196 "movdqa %%xmm3,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000197 "palignr $0x8,%%xmm1,%%xmm2 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000198 "pshufb %%xmm4,%%xmm2 \n"
199 "por %%xmm5,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000200 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000201 "pshufb %%xmm4,%%xmm0 \n"
202 "movdqa %%xmm2,0x20(%1) \n"
203 "por %%xmm5,%%xmm0 \n"
204 "pshufb %%xmm4,%%xmm1 \n"
205 "movdqa %%xmm0,(%1) \n"
206 "por %%xmm5,%%xmm1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000207 "palignr $0x4,%%xmm3,%%xmm3 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000208 "pshufb %%xmm4,%%xmm3 \n"
209 "movdqa %%xmm1,0x10(%1) \n"
210 "por %%xmm5,%%xmm3 \n"
211 "movdqa %%xmm3,0x30(%1) \n"
212 "lea 0x40(%1),%1 \n"
213 "sub $0x10,%2 \n"
214 "ja 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000215 : "+r"(src_raw), // %0
216 "+r"(dst_argb), // %1
217 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000218 : "m"(kShuffleMaskRAWToARGB) // %3
219 : "memory", "cc"
220#if defined(__SSE2__)
221 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
222#endif
223);
224}
225
226void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000227 asm volatile (
228 "movdqa %4,%%xmm5 \n"
229 "movdqa %3,%%xmm4 \n"
230"1: \n"
231 "movdqa (%0),%%xmm0 \n"
232 "movdqa 0x10(%0),%%xmm1 \n"
233 "movdqa 0x20(%0),%%xmm2 \n"
234 "movdqa 0x30(%0),%%xmm3 \n"
235 "pmaddubsw %%xmm4,%%xmm0 \n"
236 "pmaddubsw %%xmm4,%%xmm1 \n"
237 "pmaddubsw %%xmm4,%%xmm2 \n"
238 "pmaddubsw %%xmm4,%%xmm3 \n"
239 "lea 0x40(%0),%0 \n"
240 "phaddw %%xmm1,%%xmm0 \n"
241 "phaddw %%xmm3,%%xmm2 \n"
242 "psrlw $0x7,%%xmm0 \n"
243 "psrlw $0x7,%%xmm2 \n"
244 "packuswb %%xmm2,%%xmm0 \n"
245 "paddb %%xmm5,%%xmm0 \n"
246 "movdqa %%xmm0,(%1) \n"
247 "lea 0x10(%1),%1 \n"
248 "sub $0x10,%2 \n"
249 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000250 : "+r"(src_argb), // %0
251 "+r"(dst_y), // %1
252 "+r"(pix) // %2
253 : "m"(kARGBToY), // %3
254 "m"(kAddY16) // %4
255 : "memory", "cc"
256#if defined(__SSE2__)
257 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
258#endif
259
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000260);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000261}
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000262#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +0000263
fbarchard@google.comb6149762011-11-07 21:58:52 +0000264#ifdef HAS_ARGBTOUVROW_SSSE3
265void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
266 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000267 asm volatile (
268 "movdqa %0,%%xmm4 \n"
269 "movdqa %1,%%xmm3 \n"
270 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000271 :
272 : "m"(kARGBToU), // %0
273 "m"(kARGBToV), // %1
274 "m"(kAddUV128) // %2
275 :
276#if defined(__SSE2__)
277 "xmm3", "xmm4", "xmm5"
278#endif
279 );
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000280 asm volatile (
281 "sub %1,%2 \n"
282"1: \n"
283 "movdqa (%0),%%xmm0 \n"
284 "movdqa 0x10(%0),%%xmm1 \n"
285 "movdqa 0x20(%0),%%xmm2 \n"
286 "movdqa 0x30(%0),%%xmm6 \n"
287 "pavgb (%0,%4,1),%%xmm0 \n"
288 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
289 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
290 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
291 "lea 0x40(%0),%0 \n"
292 "movdqa %%xmm0,%%xmm7 \n"
293 "shufps $0x88,%%xmm1,%%xmm0 \n"
294 "shufps $0xdd,%%xmm1,%%xmm7 \n"
295 "pavgb %%xmm7,%%xmm0 \n"
296 "movdqa %%xmm2,%%xmm7 \n"
297 "shufps $0x88,%%xmm6,%%xmm2 \n"
298 "shufps $0xdd,%%xmm6,%%xmm7 \n"
299 "pavgb %%xmm7,%%xmm2 \n"
300 "movdqa %%xmm0,%%xmm1 \n"
301 "movdqa %%xmm2,%%xmm6 \n"
302 "pmaddubsw %%xmm4,%%xmm0 \n"
303 "pmaddubsw %%xmm4,%%xmm2 \n"
304 "pmaddubsw %%xmm3,%%xmm1 \n"
305 "pmaddubsw %%xmm3,%%xmm6 \n"
306 "phaddw %%xmm2,%%xmm0 \n"
307 "phaddw %%xmm6,%%xmm1 \n"
308 "psraw $0x8,%%xmm0 \n"
309 "psraw $0x8,%%xmm1 \n"
310 "packsswb %%xmm1,%%xmm0 \n"
311 "paddb %%xmm5,%%xmm0 \n"
312 "movlps %%xmm0,(%1) \n"
313 "movhps %%xmm0,(%1,%2,1) \n"
314 "lea 0x8(%1),%1 \n"
315 "sub $0x10,%3 \n"
316 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000317 : "+r"(src_argb0), // %0
318 "+r"(dst_u), // %1
319 "+r"(dst_v), // %2
320 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000321 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000322 : "memory", "cc"
323#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000324 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000325#endif
326);
327}
328#endif
329
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000330#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
// YUV -> RGB coefficients in fixed point with 6 fractional bits (scaled by
// 64). Each one has to fit a signed byte. Negative values and compound
// expressions are parenthesised so the macros expand correctly inside any
// larger expression.
#define UB 127 /* min(127, static_cast<int>(2.018 * 64)); 129 clamps to 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the contribution of U = V = 128 to each channel. The conversion code
// subtracts it after multiplying the unsigned U/V bytes by the coefficients.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000345
// Function attribute for the YUV conversion routines below. On targets other
// than Apple and x86-64 it asks the compiler to omit the frame pointer for
// the annotated function; elsewhere it expands to nothing.
// NOTE(review): presumably needed to free one more register for the asm
// operands on 32-bit x86 - confirm.
#if !defined(__APPLE__) && !defined(__x86_64__)
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#else
#define OMITFP
#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000351
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000352struct {
353 vec8 kUVToB;
354 vec8 kUVToG;
355 vec8 kUVToR;
356 vec16 kUVBiasB;
357 vec16 kUVBiasG;
358 vec16 kUVBiasR;
359 vec16 kYSub16;
360 vec16 kYToRgb;
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000361} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000362 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
363 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
364 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
365 { BB, BB, BB, BB, BB, BB, BB, BB },
366 { BG, BG, BG, BG, BG, BG, BG, BG },
367 { BR, BR, BR, BR, BR, BR, BR, BR },
368 { 16, 16, 16, 16, 16, 16, 16, 16 },
369 { YG, YG, YG, YG, YG, YG, YG, YG }
370};
371
// Asm fragment that converts 8 pixels of 4:2:0-style YUV to three registers
// of 8-bit channels: xmm0 = B, xmm1 = G, xmm2 = R (8 valid bytes each).
// Operand contract for the enclosing asm statement:
//   %0 = Y pointer (advanced by 8), %1 = U pointer (advanced by 4),
//   %2 = V pointer expressed as an offset from %1,
//   %5 = address of kYuvConstants.kUVToB, xmm4 = zero.
// Clobbers xmm0-xmm3.
#define YUVTORGB                                                               \
    "movd        (%1),%%xmm0                     \n" /* 4 U bytes */           \
    "movd        (%1,%2,1),%%xmm1                \n" /* 4 V bytes */           \
    "lea         0x4(%1),%1                      \n"                           \
    "punpcklbw   %%xmm1,%%xmm0                   \n" /* interleave U,V */      \
    "punpcklwd   %%xmm0,%%xmm0                   \n" /* each UV pair twice */  \
    "movdqa      %%xmm0,%%xmm1                   \n"                           \
    "movdqa      %%xmm0,%%xmm2                   \n"                           \
    "pmaddubsw   (%5),%%xmm0                     \n" /* * kUVToB */            \
    "pmaddubsw   16(%5),%%xmm1                   \n" /* * kUVToG */            \
    "pmaddubsw   32(%5),%%xmm2                   \n" /* * kUVToR */            \
    "psubw       48(%5),%%xmm0                   \n" /* - kUVBiasB */          \
    "psubw       64(%5),%%xmm1                   \n" /* - kUVBiasG */          \
    "psubw       80(%5),%%xmm2                   \n" /* - kUVBiasR */          \
    "movq        (%0),%%xmm3                     \n" /* 8 Y bytes */           \
    "lea         0x8(%0),%0                      \n"                           \
    "punpcklbw   %%xmm4,%%xmm3                   \n" /* Y bytes -> words */    \
    "psubsw      96(%5),%%xmm3                   \n" /* - kYSub16 */           \
    "pmullw      112(%5),%%xmm3                  \n" /* * kYToRgb */           \
    "paddsw      %%xmm3,%%xmm0                   \n"                           \
    "paddsw      %%xmm3,%%xmm1                   \n"                           \
    "paddsw      %%xmm3,%%xmm2                   \n"                           \
    "psraw       $0x6,%%xmm0                     \n" /* drop fraction bits */  \
    "psraw       $0x6,%%xmm1                     \n"                           \
    "psraw       $0x6,%%xmm2                     \n"                           \
    "packuswb    %%xmm0,%%xmm0                   \n" /* clamp to 0..255 */     \
    "packuswb    %%xmm1,%%xmm1                   \n"                           \
    "packuswb    %%xmm2,%%xmm2                   \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000401
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000402void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, // rdi
fbarchard@google.comb6149762011-11-07 21:58:52 +0000403 const uint8* u_buf, // rsi
404 const uint8* v_buf, // rdx
405 uint8* rgb_buf, // rcx
406 int width) { // r8
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000407 asm volatile (
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000408 "sub %1,%2 \n"
409 "pcmpeqb %%xmm5,%%xmm5 \n"
410 "pxor %%xmm4,%%xmm4 \n"
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000411
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000412 "1: \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000413 YUVTORGB
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000414 "punpcklbw %%xmm1,%%xmm0 \n"
415 "punpcklbw %%xmm5,%%xmm2 \n"
416 "movdqa %%xmm0,%%xmm1 \n"
417 "punpcklwd %%xmm2,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000418 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com3fe36962011-12-13 02:49:22 +0000419 "movdqa %%xmm0,(%3) \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000420 "movdqa %%xmm1,0x10(%3) \n"
421 "lea 0x20(%3),%3 \n"
422 "sub $0x8,%4 \n"
423 "ja 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000424 : "+r"(y_buf), // %0
425 "+r"(u_buf), // %1
426 "+r"(v_buf), // %2
427 "+r"(rgb_buf), // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000428 "+rm"(width) // %4
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000429 : "r"(&kYuvConstants.kUVToB) // %5
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000430 : "memory", "cc"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000431#if defined(__SSE2__)
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000432 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000433#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000434 );
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000435}
436
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000437void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, // rdi
438 const uint8* u_buf, // rsi
439 const uint8* v_buf, // rdx
440 uint8* rgb_buf, // rcx
441 int width) { // r8
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000442 asm volatile (
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000443 "sub %1,%2 \n"
444 "pcmpeqb %%xmm5,%%xmm5 \n"
445 "pxor %%xmm4,%%xmm4 \n"
446
447 "1: \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000448 YUVTORGB
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000449 "pcmpeqb %%xmm5,%%xmm5 \n"
450 "punpcklbw %%xmm0,%%xmm1 \n"
451 "punpcklbw %%xmm2,%%xmm5 \n"
452 "movdqa %%xmm5,%%xmm0 \n"
453 "punpcklwd %%xmm1,%%xmm5 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000454 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000455 "movdqa %%xmm5,(%3) \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000456 "movdqa %%xmm0,0x10(%3) \n"
457 "lea 0x20(%3),%3 \n"
458 "sub $0x8,%4 \n"
459 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000460 : "+r"(y_buf), // %0
461 "+r"(u_buf), // %1
462 "+r"(v_buf), // %2
463 "+r"(rgb_buf), // %3
464 "+rm"(width) // %4
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000465 : "r"(&kYuvConstants.kUVToB) // %5
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000466 : "memory", "cc"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000467#if defined(__SSE2__)
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000468 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000469#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000470 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000471}
472
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000473void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, // rdi
474 const uint8* u_buf, // rsi
475 const uint8* v_buf, // rdx
476 uint8* rgb_buf, // rcx
477 int width) { // r8
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000478 asm volatile (
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000479 "sub %1,%2 \n"
480 "pcmpeqb %%xmm5,%%xmm5 \n"
481 "pxor %%xmm4,%%xmm4 \n"
482
483 "1: \n"
484 YUVTORGB
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000485 "punpcklbw %%xmm1,%%xmm2 \n"
486 "punpcklbw %%xmm5,%%xmm0 \n"
487 "movdqa %%xmm2,%%xmm1 \n"
488 "punpcklwd %%xmm0,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000489 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000490 "movdqa %%xmm2,(%3) \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000491 "movdqa %%xmm1,0x10(%3) \n"
492 "lea 0x20(%3),%3 \n"
493 "sub $0x8,%4 \n"
494 "ja 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000495 : "+r"(y_buf), // %0
496 "+r"(u_buf), // %1
497 "+r"(v_buf), // %2
498 "+r"(rgb_buf), // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000499 "+rm"(width) // %4
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000500 : "r"(&kYuvConstants.kUVToB) // %5
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000501 : "memory", "cc"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000502#if defined(__SSE2__)
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000503 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000504#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000505 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000506}
507
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000508void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
509 const uint8* u_buf, // rsi
510 const uint8* v_buf, // rdx
511 uint8* rgb_buf, // rcx
512 int width) { // r8
513 asm volatile (
514 "sub %1,%2 \n"
515 "pcmpeqb %%xmm5,%%xmm5 \n"
516 "pxor %%xmm4,%%xmm4 \n"
517
518 "1: \n"
519 "movd (%1),%%xmm0 \n"
520 "movd (%1,%2,1),%%xmm1 \n"
521 "lea 0x4(%1),%1 \n"
522 "punpcklbw %%xmm1,%%xmm0 \n"
523 "movdqa %%xmm0,%%xmm1 \n"
524 "movdqa %%xmm0,%%xmm2 \n"
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000525 "pmaddubsw (%5),%%xmm0 \n"
526 "pmaddubsw 16(%5),%%xmm1 \n"
527 "pmaddubsw 32(%5),%%xmm2 \n"
528 "psubw 48(%5),%%xmm0 \n"
529 "psubw 64(%5),%%xmm1 \n"
530 "psubw 80(%5),%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000531 "movd (%0),%%xmm3 \n"
532 "lea 0x4(%0),%0 \n"
533 "punpcklbw %%xmm4,%%xmm3 \n"
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000534 "psubsw 96(%5),%%xmm3 \n"
535 "pmullw 112(%5),%%xmm3 \n"
fbarchard@google.combc8f28e2011-11-18 21:03:54 +0000536 "paddsw %%xmm3,%%xmm0 \n"
537 "paddsw %%xmm3,%%xmm1 \n"
538 "paddsw %%xmm3,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000539 "psraw $0x6,%%xmm0 \n"
540 "psraw $0x6,%%xmm1 \n"
541 "psraw $0x6,%%xmm2 \n"
542 "packuswb %%xmm0,%%xmm0 \n"
543 "packuswb %%xmm1,%%xmm1 \n"
544 "packuswb %%xmm2,%%xmm2 \n"
545 "punpcklbw %%xmm1,%%xmm0 \n"
546 "punpcklbw %%xmm5,%%xmm2 \n"
547 "punpcklwd %%xmm2,%%xmm0 \n"
548 "movdqa %%xmm0,(%3) \n"
549 "lea 0x10(%3),%3 \n"
550 "sub $0x4,%4 \n"
551 "ja 1b \n"
552 : "+r"(y_buf), // %0
553 "+r"(u_buf), // %1
554 "+r"(v_buf), // %2
555 "+r"(rgb_buf), // %3
556 "+rm"(width) // %4
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000557 : "r"(&kYuvConstants.kUVToB) // %5
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000558 : "memory", "cc"
559#if defined(__SSE2__)
560 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
561#endif
562 );
563}
564#endif
565
566#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000567
fbarchard@google.comb6149762011-11-07 21:58:52 +0000568void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
569 uint8* rgb_buf, // rcx
570 int width) { // r8
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000571 asm volatile (
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000572 "pcmpeqb %%xmm4,%%xmm4 \n"
573 "pslld $0x18,%%xmm4 \n"
574 "mov $0x10001000,%%eax \n"
575 "movd %%eax,%%xmm3 \n"
576 "pshufd $0x0,%%xmm3,%%xmm3 \n"
577 "mov $0x012a012a,%%eax \n"
578 "movd %%eax,%%xmm2 \n"
579 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000580
581 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000582 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
583 "movq (%0),%%xmm0 \n"
584 "lea 0x8(%0),%0 \n"
585 "punpcklbw %%xmm0,%%xmm0 \n"
586 "psubusw %%xmm3,%%xmm0 \n"
587 "pmulhuw %%xmm2,%%xmm0 \n"
588 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000589
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000590 // Step 2: Weave into ARGB
591 "punpcklbw %%xmm0,%%xmm0 \n"
592 "movdqa %%xmm0,%%xmm1 \n"
593 "punpcklwd %%xmm0,%%xmm0 \n"
594 "punpckhwd %%xmm1,%%xmm1 \n"
595 "por %%xmm4,%%xmm0 \n"
596 "por %%xmm4,%%xmm1 \n"
597 "movdqa %%xmm0,(%1) \n"
598 "movdqa %%xmm1,16(%1) \n"
599 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000600
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000601 "sub $0x8,%2 \n"
602 "ja 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000603 : "+r"(y_buf), // %0
604 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +0000605 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000606 :
607 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000608#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +0000609 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000610#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000611 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000612}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000613#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000614
frkoenig@google.come5185422011-11-07 23:07:57 +0000615#ifdef HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000616void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
617 SIMD_ALIGNED(uint8 row[kMaxStride]);
618 ABGRToARGBRow_SSSE3(src_argb, row, pix);
619 ARGBToYRow_SSSE3(row, dst_y, pix);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000620}
621
fbarchard@google.comb6149762011-11-07 21:58:52 +0000622void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
623 SIMD_ALIGNED(uint8 row[kMaxStride]);
624 BGRAToARGBRow_SSSE3(src_argb, row, pix);
625 ARGBToYRow_SSSE3(row, dst_y, pix);
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000626}
frkoenig@google.come5185422011-11-07 23:07:57 +0000627#endif
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000628
fbarchard@google.comb6149762011-11-07 21:58:52 +0000629#ifdef HAS_ARGBTOUVROW_SSSE3
630void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
631 uint8* dst_u, uint8* dst_v, int pix) {
632 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
633 ABGRToARGBRow_SSSE3(src_argb, row, pix);
634 ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
635 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000636}
637
fbarchard@google.comb6149762011-11-07 21:58:52 +0000638void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
639 uint8* dst_u, uint8* dst_v, int pix) {
640 SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
641 BGRAToARGBRow_SSSE3(src_argb, row, pix);
642 BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
643 ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000644}
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000645#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000646
fbarchard@google.com12d04832011-11-21 23:54:38 +0000647#ifdef HAS_REVERSE_ROW_SSSE3
648
649// Shuffle table for reversing the bytes.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000650CONST uvec8 kShuffleReverse = {
fbarchard@google.com12d04832011-11-21 23:54:38 +0000651 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
652};
653
654void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
655 intptr_t temp_width = static_cast<intptr_t>(width);
656 asm volatile (
657 "movdqa %3,%%xmm5 \n"
658 "lea -0x10(%0,%2,1),%0 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000659 "1: \n"
660 "movdqa (%0),%%xmm0 \n"
661 "lea -0x10(%0),%0 \n"
662 "pshufb %%xmm5,%%xmm0 \n"
663 "movdqa %%xmm0,(%1) \n"
664 "lea 0x10(%1),%1 \n"
665 "sub $0x10,%2 \n"
666 "ja 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +0000667 : "+r"(src), // %0
668 "+r"(dst), // %1
669 "+r"(temp_width) // %2
670 : "m"(kShuffleReverse) // %3
671 : "memory", "cc"
672#if defined(__SSE2__)
673 , "xmm0", "xmm5"
674#endif
675 );
676}
677#endif
678
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000679#ifdef HAS_REVERSE_ROW_SSE2
680
681void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
682 intptr_t temp_width = static_cast<intptr_t>(width);
683 asm volatile (
684 "lea -0x10(%0,%2,1),%0 \n"
685 "1: \n"
686 "movdqa (%0),%%xmm0 \n"
687 "lea -0x10(%0),%0 \n"
688 "movdqa %%xmm0,%%xmm1 \n"
689 "psllw $0x8,%%xmm0 \n"
690 "psrlw $0x8,%%xmm1 \n"
691 "por %%xmm1,%%xmm0 \n"
692 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
693 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
694 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
695 "movdqa %%xmm0,(%1) \n"
696 "lea 0x10(%1),%1 \n"
697 "sub $0x10,%2 \n"
698 "ja 1b \n"
699 : "+r"(src), // %0
700 "+r"(dst), // %1
701 "+r"(temp_width) // %2
702 :
703 : "memory", "cc"
704#if defined(__SSE2__)
705 , "xmm0", "xmm1"
706#endif
707 );
708}
709#endif
710
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +0000711#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000712} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +0000713} // namespace libyuv
714#endif