/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#ifdef HAS_ARGBTOUVROW_SSSE3
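// ARGB is stored in memory as B, G, R, A.  Each group of four coefficients
// below weights one pixel; pmaddubsw plus phaddw sum the weighted bytes, so
// the UV path computes
//   U = ((112 * B - 74 * G - 38 * R) >> 8) + 128
//   V = ((-18 * B - 94 * G + 112 * R) >> 8) + 128
// which is approximately the BT.601 chroma transform with coefficients
// scaled by 256; kAddUV128 supplies the +128 bias.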
vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

uvec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

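// The Y path computes Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16, i.e.
// roughly the BT.601 studio-swing luma weights scaled by 128, with kAddY16
// adding the offset of 16 after packing.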
// Constant multiplication table for converting ARGB to I400.
vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

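// The following tables are pshufb control masks: output byte i of each
// 16-byte block is taken from source byte mask[i].  For the 24-bit formats
// the alpha positions pick up leftover source bytes that are later forced to
// 0xff by the por with the alpha mask.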
// Shuffle table for converting BG24 to ARGB.
uvec8 kShuffleMaskBG24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

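// Expands 8 Y bytes into 8 gray ARGB pixels: each luma value is replicated
// into B, G and R, and alpha is forced to 0xff.  Processes 8 pixels per
// iteration and writes with movdqa, so the destination must be 16-byte
// aligned.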
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"
  "pslld $0x18,%%xmm5 \n"
"1: \n"
  "movq (%0),%%xmm0 \n"
  "lea 0x8(%0),%0 \n"
  "punpcklbw %%xmm0,%%xmm0 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "punpcklwd %%xmm0,%%xmm0 \n"
  "punpckhwd %%xmm1,%%xmm1 \n"
  "por %%xmm5,%%xmm0 \n"
  "por %%xmm5,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "movdqa %%xmm1,0x10(%1) \n"
  "lea 0x20(%1),%1 \n"
  "sub $0x8,%2 \n"
  "ja 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
);
}

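// Converts 4 ABGR pixels to ARGB per iteration with a single pshufb through
// kShuffleMaskABGRToARGB.  Source and destination use movdqa, so both must
// be 16-byte aligned.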
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
  "movdqa %3,%%xmm5 \n"
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "lea 0x10(%0),%0 \n"
  "pshufb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x4,%2 \n"
  "ja 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
);
}

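// BGRAToARGBRow_SSSE3 is identical except for its shuffle mask, which
// reverses the byte order of each pixel instead of swapping only R and B.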
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
  "movdqa %3,%%xmm5 \n"
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "lea 0x10(%0),%0 \n"
  "pshufb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x4,%2 \n"
  "ja 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
);
}

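// Converts 16 packed 24-bit BG24 (B, G, R) pixels to ARGB per iteration:
// 48 source bytes are loaded into three registers, palignr re-splices them
// into four 12-byte groups, pshufb spreads each group into 4-byte pixels,
// and por fills in the 0xff alpha.  64 bytes are written per loop.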
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm5 \n"
  "movdqa %3,%%xmm4 \n"
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm3 \n"
  "lea 0x30(%0),%0 \n"
  "movdqa %%xmm3,%%xmm2 \n"
  "palignr $0x8,%%xmm1,%%xmm2 \n"
  "pshufb %%xmm4,%%xmm2 \n"
  "por %%xmm5,%%xmm2 \n"
  "palignr $0xc,%%xmm0,%%xmm1 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "movdqa %%xmm2,0x20(%1) \n"
  "por %%xmm5,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "por %%xmm5,%%xmm1 \n"
  "palignr $0x4,%%xmm3,%%xmm3 \n"
  "pshufb %%xmm4,%%xmm3 \n"
  "movdqa %%xmm1,0x10(%1) \n"
  "por %%xmm5,%%xmm3 \n"
  "movdqa %%xmm3,0x30(%1) \n"
  "lea 0x40(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_bg24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBG24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}

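// RAWToARGBRow_SSSE3 uses the same 48-to-64 byte expansion as
// BG24ToARGBRow_SSSE3; only the shuffle mask differs, swapping R and B while
// repacking.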
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm5 \n"
  "movdqa %3,%%xmm4 \n"
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm3 \n"
  "lea 0x30(%0),%0 \n"
  "movdqa %%xmm3,%%xmm2 \n"
  "palignr $0x8,%%xmm1,%%xmm2 \n"
  "pshufb %%xmm4,%%xmm2 \n"
  "por %%xmm5,%%xmm2 \n"
  "palignr $0xc,%%xmm0,%%xmm1 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "movdqa %%xmm2,0x20(%1) \n"
  "por %%xmm5,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "por %%xmm5,%%xmm1 \n"
  "palignr $0x4,%%xmm3,%%xmm3 \n"
  "pshufb %%xmm4,%%xmm3 \n"
  "movdqa %%xmm1,0x10(%1) \n"
  "por %%xmm5,%%xmm3 \n"
  "movdqa %%xmm3,0x30(%1) \n"
  "lea 0x40(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}

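// Converts 16 ARGB pixels to Y per iteration: pmaddubsw weights each byte by
// kARGBToY and sums pairs, phaddw completes the per-pixel sum, psrlw by 7
// divides by 128, and kAddY16 adds the offset of 16 after packing.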
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
  "movdqa %4,%%xmm5 \n"
  "movdqa %3,%%xmm4 \n"
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm2 \n"
  "movdqa 0x30(%0),%%xmm3 \n"
  "pmaddubsw %%xmm4,%%xmm0 \n"
  "pmaddubsw %%xmm4,%%xmm1 \n"
  "pmaddubsw %%xmm4,%%xmm2 \n"
  "pmaddubsw %%xmm4,%%xmm3 \n"
  "lea 0x40(%0),%0 \n"
  "phaddw %%xmm1,%%xmm0 \n"
  "phaddw %%xmm3,%%xmm2 \n"
  "psrlw $0x7,%%xmm0 \n"
  "psrlw $0x7,%%xmm2 \n"
  "packuswb %%xmm2,%%xmm0 \n"
  "paddb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
#endif

#ifdef HAS_ARGBTOUVROW_SSSE3
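// Converts 16 ARGB pixels from two adjacent rows into 8 U and 8 V values:
// pavgb averages the two rows and then each horizontal pixel pair, the
// kARGBToU/kARGBToV weights are applied with pmaddubsw and phaddw, psraw by
// 8 scales the result, and kAddUV128 biases it to unsigned.  dst_v is
// addressed relative to dst_u so both planes are written in one loop.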
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
  "movdqa %0,%%xmm4 \n"
  "movdqa %1,%%xmm3 \n"
  "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
  "sub %1,%2 \n"
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm2 \n"
  "movdqa 0x30(%0),%%xmm6 \n"
  "pavgb (%0,%4,1),%%xmm0 \n"
  "pavgb 0x10(%0,%4,1),%%xmm1 \n"
  "pavgb 0x20(%0,%4,1),%%xmm2 \n"
  "pavgb 0x30(%0,%4,1),%%xmm6 \n"
  "lea 0x40(%0),%0 \n"
  "movdqa %%xmm0,%%xmm7 \n"
  "shufps $0x88,%%xmm1,%%xmm0 \n"
  "shufps $0xdd,%%xmm1,%%xmm7 \n"
  "pavgb %%xmm7,%%xmm0 \n"
  "movdqa %%xmm2,%%xmm7 \n"
  "shufps $0x88,%%xmm6,%%xmm2 \n"
  "shufps $0xdd,%%xmm6,%%xmm7 \n"
  "pavgb %%xmm7,%%xmm2 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "movdqa %%xmm2,%%xmm6 \n"
  "pmaddubsw %%xmm4,%%xmm0 \n"
  "pmaddubsw %%xmm4,%%xmm2 \n"
  "pmaddubsw %%xmm3,%%xmm1 \n"
  "pmaddubsw %%xmm3,%%xmm6 \n"
  "phaddw %%xmm2,%%xmm0 \n"
  "phaddw %%xmm6,%%xmm1 \n"
  "psraw $0x8,%%xmm0 \n"
  "psraw $0x8,%%xmm1 \n"
  "packsswb %%xmm1,%%xmm0 \n"
  "paddb %%xmm5,%%xmm0 \n"
  "movlps %%xmm0,(%1) \n"
  "movhps %%xmm0,(%1,%2,1) \n"
  "lea 0x8(%1),%1 \n"
  "sub $0x10,%3 \n"
  "ja 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
);
}
#endif

#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define UB 127 /* 2.018 * 64 = 129, saturated to the int8 maximum of 127 */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

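// Subtracting the biases folds the -128 offset of U and V into the multiply:
// UB * U + VB * V - BB == UB * (U - 128) + VB * (V - 128), and likewise for
// the G and R channels, so the row code can work on the raw U/V bytes.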
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif

struct {
  vec8 kUVToB;
  vec8 kUVToG;
  vec8 kUVToR;
  vec16 kUVBiasB;
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;
  vec16 kYToRgb;
} SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};

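// YUVTORGB loads 4 U and 4 V bytes, interleaves them and duplicates each
// pair so two pixels share their chroma, then applies kUVToB/G/R and the
// bias rows above (addressed as 16-byte offsets from %5) with pmaddubsw and
// psubw.  8 Y values scaled by (Y - 16) * YG are added, the sums are shifted
// right by 6 and packed, leaving B in xmm0, G in xmm1 and R in xmm2.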
// Convert 8 pixels
#define YUVTORGB \
  "movd (%1),%%xmm0 \n" \
  "movd (%1,%2,1),%%xmm1 \n" \
  "lea 0x4(%1),%1 \n" \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklwd %%xmm0,%%xmm0 \n" \
  "movdqa %%xmm0,%%xmm1 \n" \
  "movdqa %%xmm0,%%xmm2 \n" \
  "pmaddubsw (%5),%%xmm0 \n" \
  "pmaddubsw 16(%5),%%xmm1 \n" \
  "pmaddubsw 32(%5),%%xmm2 \n" \
  "psubw 48(%5),%%xmm0 \n" \
  "psubw 64(%5),%%xmm1 \n" \
  "psubw 80(%5),%%xmm2 \n" \
  "movq (%0),%%xmm3 \n" \
  "lea 0x8(%0),%0 \n" \
  "punpcklbw %%xmm4,%%xmm3 \n" \
  "psubsw 96(%5),%%xmm3 \n" \
  "pmullw 112(%5),%%xmm3 \n" \
  "paddsw %%xmm3,%%xmm0 \n" \
  "paddsw %%xmm3,%%xmm1 \n" \
  "paddsw %%xmm3,%%xmm2 \n" \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"

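// After YUVTORGB, the ARGB variant interleaves B with G and R with the 0xff
// alpha in xmm5, then weaves the words into 8 ARGB pixels and stores 32
// bytes per iteration.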
void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,  // rdi
                                          const uint8* u_buf,  // rsi
                                          const uint8* v_buf,  // rdx
                                          uint8* rgb_buf,      // rcx
                                          int width) {         // r8
  asm volatile (
  "sub %1,%2 \n"
  "pcmpeqb %%xmm5,%%xmm5 \n"
  "pxor %%xmm4,%%xmm4 \n"

  "1: \n"
  YUVTORGB
  "punpcklbw %%xmm1,%%xmm0 \n"
  "punpcklbw %%xmm5,%%xmm2 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "punpcklwd %%xmm2,%%xmm0 \n"
  "punpckhwd %%xmm2,%%xmm1 \n"
  "movdqa %%xmm0,(%3) \n"
  "movdqa %%xmm1,0x10(%3) \n"
  "lea 0x20(%3),%3 \n"
  "sub $0x8,%4 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

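// The BGRA and ABGR variants below differ only in the order the B, G, R and
// alpha registers are woven together before the store; the YUV math is the
// shared YUVTORGB macro.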
void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,  // rdi
                                          const uint8* u_buf,  // rsi
                                          const uint8* v_buf,  // rdx
                                          uint8* rgb_buf,      // rcx
                                          int width) {         // r8
  asm volatile (
  "sub %1,%2 \n"
  "pcmpeqb %%xmm5,%%xmm5 \n"
  "pxor %%xmm4,%%xmm4 \n"

  "1: \n"
  YUVTORGB
  "pcmpeqb %%xmm5,%%xmm5 \n"
  "punpcklbw %%xmm0,%%xmm1 \n"
  "punpcklbw %%xmm2,%%xmm5 \n"
  "movdqa %%xmm5,%%xmm0 \n"
  "punpcklwd %%xmm1,%%xmm5 \n"
  "movdqa %%xmm5,(%3) \n"
  "punpckhwd %%xmm1,%%xmm0 \n"
  "movdqa %%xmm0,0x10(%3) \n"
  "lea 0x20(%3),%3 \n"
  "sub $0x8,%4 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,  // rdi
                                          const uint8* u_buf,  // rsi
                                          const uint8* v_buf,  // rdx
                                          uint8* rgb_buf,      // rcx
                                          int width) {         // r8
  asm volatile (
  "sub %1,%2 \n"
  "pcmpeqb %%xmm5,%%xmm5 \n"
  "pxor %%xmm4,%%xmm4 \n"

  "1: \n"
  YUVTORGB
  "punpcklbw %%xmm1,%%xmm2 \n"
  "punpcklbw %%xmm5,%%xmm0 \n"
  "movdqa %%xmm2,%%xmm1 \n"
  "punpcklwd %%xmm0,%%xmm2 \n"
  "movdqa %%xmm2,(%3) \n"
  "punpckhwd %%xmm0,%%xmm1 \n"
  "movdqa %%xmm1,0x10(%3) \n"
  "lea 0x20(%3),%3 \n"
  "sub $0x8,%4 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

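// The 444 variant reads one U and one V byte per pixel (no chroma
// duplication), so it inlines the conversion instead of using YUVTORGB and
// emits 4 ARGB pixels per iteration.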
void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
                                             const uint8* u_buf,  // rsi
                                             const uint8* v_buf,  // rdx
                                             uint8* rgb_buf,      // rcx
                                             int width) {         // r8
  asm volatile (
  "sub %1,%2 \n"
  "pcmpeqb %%xmm5,%%xmm5 \n"
  "pxor %%xmm4,%%xmm4 \n"

  "1: \n"
  "movd (%1),%%xmm0 \n"
  "movd (%1,%2,1),%%xmm1 \n"
  "lea 0x4(%1),%1 \n"
  "punpcklbw %%xmm1,%%xmm0 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "movdqa %%xmm0,%%xmm2 \n"
  "pmaddubsw (%5),%%xmm0 \n"
  "pmaddubsw 16(%5),%%xmm1 \n"
  "pmaddubsw 32(%5),%%xmm2 \n"
  "psubw 48(%5),%%xmm0 \n"
  "psubw 64(%5),%%xmm1 \n"
  "psubw 80(%5),%%xmm2 \n"
  "movd (%0),%%xmm3 \n"
  "lea 0x4(%0),%0 \n"
  "punpcklbw %%xmm4,%%xmm3 \n"
  "psubsw 96(%5),%%xmm3 \n"
  "pmullw 112(%5),%%xmm3 \n"
  "paddsw %%xmm3,%%xmm0 \n"
  "paddsw %%xmm3,%%xmm1 \n"
  "paddsw %%xmm3,%%xmm2 \n"
  "psraw $0x6,%%xmm0 \n"
  "psraw $0x6,%%xmm1 \n"
  "psraw $0x6,%%xmm2 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "packuswb %%xmm1,%%xmm1 \n"
  "packuswb %%xmm2,%%xmm2 \n"
  "punpcklbw %%xmm1,%%xmm0 \n"
  "punpcklbw %%xmm5,%%xmm2 \n"
  "punpcklwd %%xmm2,%%xmm0 \n"
  "movdqa %%xmm0,(%3) \n"
  "lea 0x10(%3),%3 \n"
  "sub $0x4,%4 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB) // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif

#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2

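// Y-only fast path: each Y byte is widened to a 16-bit value, 16 * 256 is
// subtracted with unsigned saturation, and pmulhuw by 0x012a (1.164 in 8.8
// fixed point) scales it to full range; the gray result is then woven into
// ARGB with an alpha of 0xff.  8 pixels per iteration.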
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                uint8* rgb_buf,      // rcx
                                int width) {         // r8
  asm volatile (
  "pcmpeqb %%xmm4,%%xmm4 \n"
  "pslld $0x18,%%xmm4 \n"
  "mov $0x10001000,%%eax \n"
  "movd %%eax,%%xmm3 \n"
  "pshufd $0x0,%%xmm3,%%xmm3 \n"
  "mov $0x012a012a,%%eax \n"
  "movd %%eax,%%xmm2 \n"
  "pshufd $0x0,%%xmm2,%%xmm2 \n"

  "1: \n"
  // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  "movq (%0),%%xmm0 \n"
  "lea 0x8(%0),%0 \n"
  "punpcklbw %%xmm0,%%xmm0 \n"
  "psubusw %%xmm3,%%xmm0 \n"
  "pmulhuw %%xmm2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"

  // Step 2: Weave into ARGB
  "punpcklbw %%xmm0,%%xmm0 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "punpcklwd %%xmm0,%%xmm0 \n"
  "punpckhwd %%xmm1,%%xmm1 \n"
  "por %%xmm4,%%xmm0 \n"
  "por %%xmm4,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "movdqa %%xmm1,16(%1) \n"
  "lea 32(%1),%1 \n"

  "sub $0x8,%2 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif

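// The ABGR and BGRA row functions below reuse the ARGB kernels: each source
// row is first shuffled into ARGB in a stack-aligned scratch row, then the
// ARGB Y/UV kernels run on that copy, trading an extra pass over the row for
// reuse of one set of conversion constants.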
#ifdef HAS_ARGBTOYROW_SSSE3
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  ABGRToARGBRow_SSSE3(src_argb, row, pix);
  ARGBToYRow_SSSE3(row, dst_y, pix);
}

void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  BGRAToARGBRow_SSSE3(src_argb, row, pix);
  ARGBToYRow_SSSE3(row, dst_y, pix);
}
#endif

#ifdef HAS_ARGBTOUVROW_SSSE3
void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
  ABGRToARGBRow_SSSE3(src_argb, row, pix);
  ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}

void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
  BGRAToARGBRow_SSSE3(src_argb, row, pix);
  BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
#endif

#ifdef HAS_REVERSE_ROW_SSSE3

// TODO(fbarchard): define CONST macro that is static const for linux, but
// does nothing for gcc on OSX (which has an internal compiler fault)

// Shuffle table for reversing the bytes.
uvec8 kShuffleReverse = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

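// Mirrors a row of bytes: the source pointer starts at the last 16 bytes and
// walks backwards while each 16-byte block is reversed with pshufb and
// stored forwards.  Uses movdqa, so 16-byte alignment and a multiple of 16
// bytes are assumed.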
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
  "movdqa %3,%%xmm5 \n"
  "lea -0x10(%0,%2,1),%0 \n"
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "lea -0x10(%0),%0 \n"
  "pshufb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src),          // %0
    "+r"(dst),          // %1
    "+r"(temp_width)    // %2
  : "m"(kShuffleReverse)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif