blob: 005efbb461389e6b220642376b78e725786b66e6 [file] [log] [blame]
/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include "row.h"
12
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000015extern "C" {
16
#ifdef HAS_ARGBTOUVROW_SSSE3
// Per-byte weights that ARGBToUVRow_SSSE3 loads into xmm4 and feeds to
// pmaddubsw as the signed operand to form U. One group of four weights per
// 32-bit pixel; the fourth weight of each pixel is 0.
vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Per-byte weights that ARGBToUVRow_SSSE3 loads into xmm3 to form V.
// Declared as signed vec8 (previously uvec8): the table holds negative
// entries and pmaddubsw reads this operand as signed bytes, so an unsigned
// element type was a narrowing initialisation of negative values.
vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// 128 in every byte; added with paddb to the packed signed U/V results.
uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
#endif
31
fbarchard@google.com228bdc22011-11-15 21:58:26 +000032#ifdef HAS_ARGBTOYROW_SSSE3
33
34// Constant multiplication table for converting ARGB to I400.
35vec8 kARGBToY = {
36 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
37};
38
39uvec8 kAddY16 = {
40 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
41};
42
fbarchard@google.com9394ed92011-10-31 21:36:47 +000043// Shuffle table for converting BG24 to ARGB.
fbarchard@google.com228bdc22011-11-15 21:58:26 +000044uvec8 kShuffleMaskBG24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000045 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
46};
47
48// Shuffle table for converting RAW to ARGB.
fbarchard@google.com228bdc22011-11-15 21:58:26 +000049uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000050 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
51};
52
fbarchard@google.comb6149762011-11-07 21:58:52 +000053// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com228bdc22011-11-15 21:58:26 +000054uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000055 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
56};
57
58// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com228bdc22011-11-15 21:58:26 +000059uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000060 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
61};
62
63void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +000064 asm volatile (
65 "pcmpeqb %%xmm5,%%xmm5 \n"
66 "pslld $0x18,%%xmm5 \n"
67"1: \n"
68 "movq (%0),%%xmm0 \n"
69 "lea 0x8(%0),%0 \n"
70 "punpcklbw %%xmm0,%%xmm0 \n"
71 "movdqa %%xmm0,%%xmm1 \n"
72 "punpcklwd %%xmm0,%%xmm0 \n"
73 "punpckhwd %%xmm1,%%xmm1 \n"
74 "por %%xmm5,%%xmm0 \n"
75 "por %%xmm5,%%xmm1 \n"
76 "movdqa %%xmm0,(%1) \n"
77 "movdqa %%xmm1,0x10(%1) \n"
78 "lea 0x20(%1),%1 \n"
79 "sub $0x8,%2 \n"
80 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +000081 : "+r"(src_y), // %0
82 "+r"(dst_argb), // %1
83 "+r"(pix) // %2
84 :
85 : "memory", "cc"
86#if defined(__SSE2__)
87 , "xmm0", "xmm1", "xmm5"
88#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +000089);
90}
fbarchard@google.comb6149762011-11-07 21:58:52 +000091
92void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +000093 asm volatile (
94 "movdqa %3,%%xmm5 \n"
95"1: \n"
96 "movdqa (%0),%%xmm0 \n"
97 "lea 0x10(%0),%0 \n"
98 "pshufb %%xmm5,%%xmm0 \n"
99 "movdqa %%xmm0,(%1) \n"
100 "lea 0x10(%1),%1 \n"
101 "sub $0x4,%2 \n"
102 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000103 : "+r"(src_abgr), // %0
104 "+r"(dst_argb), // %1
105 "+r"(pix) // %2
106 : "m"(kShuffleMaskABGRToARGB) // %3
107 : "memory", "cc"
108#if defined(__SSE2__)
109 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000110#endif
111
fbarchard@google.comb6149762011-11-07 21:58:52 +0000112);
113}
114
115void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000116 asm volatile (
117 "movdqa %3,%%xmm5 \n"
118"1: \n"
119 "movdqa (%0),%%xmm0 \n"
120 "lea 0x10(%0),%0 \n"
121 "pshufb %%xmm5,%%xmm0 \n"
122 "movdqa %%xmm0,(%1) \n"
123 "lea 0x10(%1),%1 \n"
124 "sub $0x4,%2 \n"
125 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000126 : "+r"(src_bgra), // %0
127 "+r"(dst_argb), // %1
128 "+r"(pix) // %2
129 : "m"(kShuffleMaskBGRAToARGB) // %3
130 : "memory", "cc"
131#if defined(__SSE2__)
132 , "xmm0", "xmm5"
133#endif
134);
135}
136
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000137void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000138 asm volatile (
139 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
140 "pslld $0x18,%%xmm5 \n"
141 "movdqa %3,%%xmm4 \n"
142"1: \n"
143 "movdqa (%0),%%xmm0 \n"
144 "movdqa 0x10(%0),%%xmm1 \n"
145 "movdqa 0x20(%0),%%xmm3 \n"
146 "lea 0x30(%0),%0 \n"
147 "movdqa %%xmm3,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000148 "palignr $0x8,%%xmm1,%%xmm2 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000149 "pshufb %%xmm4,%%xmm2 \n"
150 "por %%xmm5,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000151 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000152 "pshufb %%xmm4,%%xmm0 \n"
153 "movdqa %%xmm2,0x20(%1) \n"
154 "por %%xmm5,%%xmm0 \n"
155 "pshufb %%xmm4,%%xmm1 \n"
156 "movdqa %%xmm0,(%1) \n"
157 "por %%xmm5,%%xmm1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000158 "palignr $0x4,%%xmm3,%%xmm3 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000159 "pshufb %%xmm4,%%xmm3 \n"
160 "movdqa %%xmm1,0x10(%1) \n"
161 "por %%xmm5,%%xmm3 \n"
162 "movdqa %%xmm3,0x30(%1) \n"
163 "lea 0x40(%1),%1 \n"
164 "sub $0x10,%2 \n"
165 "ja 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000166 : "+r"(src_bg24), // %0
167 "+r"(dst_argb), // %1
168 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000169 : "m"(kShuffleMaskBG24ToARGB) // %3
170 : "memory", "cc"
171#if defined(__SSE2__)
172 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
173#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000174);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000175}
176
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000177void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000178 asm volatile (
179 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
180 "pslld $0x18,%%xmm5 \n"
181 "movdqa %3,%%xmm4 \n"
182"1: \n"
183 "movdqa (%0),%%xmm0 \n"
184 "movdqa 0x10(%0),%%xmm1 \n"
185 "movdqa 0x20(%0),%%xmm3 \n"
186 "lea 0x30(%0),%0 \n"
187 "movdqa %%xmm3,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000188 "palignr $0x8,%%xmm1,%%xmm2 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000189 "pshufb %%xmm4,%%xmm2 \n"
190 "por %%xmm5,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000191 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000192 "pshufb %%xmm4,%%xmm0 \n"
193 "movdqa %%xmm2,0x20(%1) \n"
194 "por %%xmm5,%%xmm0 \n"
195 "pshufb %%xmm4,%%xmm1 \n"
196 "movdqa %%xmm0,(%1) \n"
197 "por %%xmm5,%%xmm1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000198 "palignr $0x4,%%xmm3,%%xmm3 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000199 "pshufb %%xmm4,%%xmm3 \n"
200 "movdqa %%xmm1,0x10(%1) \n"
201 "por %%xmm5,%%xmm3 \n"
202 "movdqa %%xmm3,0x30(%1) \n"
203 "lea 0x40(%1),%1 \n"
204 "sub $0x10,%2 \n"
205 "ja 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000206 : "+r"(src_raw), // %0
207 "+r"(dst_argb), // %1
208 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000209 : "m"(kShuffleMaskRAWToARGB) // %3
210 : "memory", "cc"
211#if defined(__SSE2__)
212 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
213#endif
214);
215}
216
217void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000218 asm volatile (
219 "movdqa %4,%%xmm5 \n"
220 "movdqa %3,%%xmm4 \n"
221"1: \n"
222 "movdqa (%0),%%xmm0 \n"
223 "movdqa 0x10(%0),%%xmm1 \n"
224 "movdqa 0x20(%0),%%xmm2 \n"
225 "movdqa 0x30(%0),%%xmm3 \n"
226 "pmaddubsw %%xmm4,%%xmm0 \n"
227 "pmaddubsw %%xmm4,%%xmm1 \n"
228 "pmaddubsw %%xmm4,%%xmm2 \n"
229 "pmaddubsw %%xmm4,%%xmm3 \n"
230 "lea 0x40(%0),%0 \n"
231 "phaddw %%xmm1,%%xmm0 \n"
232 "phaddw %%xmm3,%%xmm2 \n"
233 "psrlw $0x7,%%xmm0 \n"
234 "psrlw $0x7,%%xmm2 \n"
235 "packuswb %%xmm2,%%xmm0 \n"
236 "paddb %%xmm5,%%xmm0 \n"
237 "movdqa %%xmm0,(%1) \n"
238 "lea 0x10(%1),%1 \n"
239 "sub $0x10,%2 \n"
240 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000241 : "+r"(src_argb), // %0
242 "+r"(dst_y), // %1
243 "+r"(pix) // %2
244 : "m"(kARGBToY), // %3
245 "m"(kAddY16) // %4
246 : "memory", "cc"
247#if defined(__SSE2__)
248 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
249#endif
250
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000251);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000252}
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000253#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +0000254
#ifdef HAS_ARGBTOUVROW_SSSE3
// Produces subsampled U and V bytes from two adjacent rows of 32-bit pixels.
//
// Per iteration, 16 pixels from the row at src_argb0 are averaged (pavgb)
// with the 16 pixels at src_argb0 + src_stride_argb, horizontally adjacent
// pixels are then averaged pairwise, and the resulting 8 pixels are weighted
// by kARGBToU / kARGBToV, shifted right by 8, packed with signed saturation
// and offset by kAddUV128. Eight U bytes and eight V bytes are written.
//
//   src_argb0        first source row; loaded with movdqa, so it and the
//                    second row must be 16-byte aligned.
//   src_stride_argb  byte offset from the first row to the second row.
//   dst_u, dst_v     destinations; written 8 bytes at a time with
//                    movlps / movhps (no alignment requirement).
//   width            source pixel count; in effect rounded up to a multiple
//                    of 16 because the count is tested after the body. Must
//                    be positive.
//
// NOTE(review): the constants are loaded into xmm3..xmm5 by a first asm
// statement and consumed by a second one that does not name them as inputs.
// This relies on the compiler emitting nothing that touches xmm3..xmm5
// between the two statements - confirm this holds for the toolchains in use.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
  "movdqa %0,%%xmm4 \n"       // xmm4 = kARGBToU
  "movdqa %1,%%xmm3 \n"       // xmm3 = kARGBToV
  "movdqa %2,%%xmm5 \n"       // xmm5 = kAddUV128
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
  "sub %1,%2 \n"              // %2 = dst_v - dst_u, so V is stored at (%1,%2)
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm2 \n"
  "movdqa 0x30(%0),%%xmm6 \n"
  "pavgb (%0,%4,1),%%xmm0 \n"      // average with the second row
  "pavgb 0x10(%0,%4,1),%%xmm1 \n"
  "pavgb 0x20(%0,%4,1),%%xmm2 \n"
  "pavgb 0x30(%0,%4,1),%%xmm6 \n"
  "lea 0x40(%0),%0 \n"
  "movdqa %%xmm0,%%xmm7 \n"
  "shufps $0x88,%%xmm1,%%xmm0 \n"  // even pixels
  "shufps $0xdd,%%xmm1,%%xmm7 \n"  // odd pixels
  "pavgb %%xmm7,%%xmm0 \n"         // horizontal average, pixels 0..7 -> 4
  "movdqa %%xmm2,%%xmm7 \n"
  "shufps $0x88,%%xmm6,%%xmm2 \n"
  "shufps $0xdd,%%xmm6,%%xmm7 \n"
  "pavgb %%xmm7,%%xmm2 \n"         // horizontal average, pixels 8..15 -> 4
  "movdqa %%xmm0,%%xmm1 \n"
  "movdqa %%xmm2,%%xmm6 \n"
  "pmaddubsw %%xmm4,%%xmm0 \n"     // U weights
  "pmaddubsw %%xmm4,%%xmm2 \n"
  "pmaddubsw %%xmm3,%%xmm1 \n"     // V weights
  "pmaddubsw %%xmm3,%%xmm6 \n"
  "phaddw %%xmm2,%%xmm0 \n"        // 8 U words
  "phaddw %%xmm6,%%xmm1 \n"        // 8 V words
  "psraw $0x8,%%xmm0 \n"
  "psraw $0x8,%%xmm1 \n"
  "packsswb %%xmm1,%%xmm0 \n"      // low 8 bytes = U, high 8 bytes = V
  "paddb %%xmm5,%%xmm0 \n"         // + 128
  "movlps %%xmm0,(%1) \n"          // store U
  "movhps %%xmm0,(%1,%2,1) \n"     // store V
  "lea 0x8(%1),%1 \n"
  "sub $0x10,%3 \n"
  "ja 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
);
}
#endif
320
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000321
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000322#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
// YUV -> RGB coefficients in fixed point with 6 fractional bits, sized to
// fit in a signed byte. Negative values and compound expressions are
// parenthesized so the macros expand safely inside larger expressions.
#define UB 127 /* 2.018 * 64 = 129, clamped to the int8 maximum of 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
// 128 * (U coefficient + V coefficient) per channel. YUVTORGB subtracts these
// after pmaddubsw has multiplied the raw (unsigned) U and V bytes.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

// Function attribute applied to the FastConvertYUV* row functions. Empty on
// Apple and x86-64 targets; elsewhere it asks GCC to omit the frame pointer
// for the annotated function.
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000343
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000344struct {
345 vec8 kUVToB;
346 vec8 kUVToG;
347 vec8 kUVToR;
348 vec16 kUVBiasB;
349 vec16 kUVBiasG;
350 vec16 kUVBiasR;
351 vec16 kYSub16;
352 vec16 kYToRgb;
353} SIMD_ALIGNED(kYuvConstants) = {
354 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
355 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
356 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
357 { BB, BB, BB, BB, BB, BB, BB, BB },
358 { BG, BG, BG, BG, BG, BG, BG, BG },
359 { BR, BR, BR, BR, BR, BR, BR, BR },
360 { 16, 16, 16, 16, 16, 16, 16, 16 },
361 { YG, YG, YG, YG, YG, YG, YG, YG }
362};
363
// Convert 8 pixels
//
// Asm fragment shared by the FastConvertYUVTo*Row_SSSE3 functions.
// Operands expected from the enclosing asm statement:
//   %0  Y pointer (8 bytes read, advanced by 8)
//   %1  U pointer (4 bytes read, advanced by 4)
//   %2  V pointer minus U pointer (V is read at (%1,%2))
//   %5  address of kYuvConstants.kUVToB
// xmm4 must be zero on entry.
//
// Each U/V pair is duplicated so it covers two adjacent pixels. On exit the
// low 8 bytes of xmm0, xmm1 and xmm2 hold the saturated results for the
// kUVToB, kUVToG and kUVToR channels respectively; xmm3 is used as scratch.
#define YUVTORGB                                                               \
  "movd (%1),%%xmm0 \n"         /* 4 U bytes */                                \
  "movd (%1,%2,1),%%xmm1 \n"    /* 4 V bytes */                                \
  "lea 0x4(%1),%1 \n"                                                          \
  "punpcklbw %%xmm1,%%xmm0 \n"  /* U V U V ... */                              \
  "punpcklwd %%xmm0,%%xmm0 \n"  /* each UV pair twice */                       \
  "movdqa %%xmm0,%%xmm1 \n"                                                    \
  "movdqa %%xmm0,%%xmm2 \n"                                                    \
  "pmaddubsw (%5),%%xmm0 \n"    /* kUVToB */                                   \
  "pmaddubsw 16(%5),%%xmm1 \n"  /* kUVToG */                                   \
  "pmaddubsw 32(%5),%%xmm2 \n"  /* kUVToR */                                   \
  "psubw 48(%5),%%xmm0 \n"      /* kUVBiasB */                                 \
  "psubw 64(%5),%%xmm1 \n"      /* kUVBiasG */                                 \
  "psubw 80(%5),%%xmm2 \n"      /* kUVBiasR */                                 \
  "movq (%0),%%xmm3 \n"         /* 8 Y bytes */                                \
  "lea 0x8(%0),%0 \n"                                                          \
  "punpcklbw %%xmm4,%%xmm3 \n"  /* widen Y to words */                         \
  "psubsw 96(%5),%%xmm3 \n"     /* kYSub16 */                                  \
  "pmullw 112(%5),%%xmm3 \n"    /* kYToRgb */                                  \
  "paddsw %%xmm3,%%xmm0 \n"                                                    \
  "paddsw %%xmm3,%%xmm1 \n"                                                    \
  "paddsw %%xmm3,%%xmm2 \n"                                                    \
  "psraw $0x6,%%xmm0 \n"        /* drop the 6 fractional bits */               \
  "psraw $0x6,%%xmm1 \n"                                                       \
  "psraw $0x6,%%xmm2 \n"                                                       \
  "packuswb %%xmm0,%%xmm0 \n"                                                  \
  "packuswb %%xmm1,%%xmm1 \n"                                                  \
  "packuswb %%xmm2,%%xmm2 \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000393
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000394void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, // rdi
fbarchard@google.comb6149762011-11-07 21:58:52 +0000395 const uint8* u_buf, // rsi
396 const uint8* v_buf, // rdx
397 uint8* rgb_buf, // rcx
398 int width) { // r8
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000399 asm volatile (
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000400 "sub %1,%2 \n"
401 "pcmpeqb %%xmm5,%%xmm5 \n"
402 "pxor %%xmm4,%%xmm4 \n"
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000403
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000404 "1: \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000405 YUVTORGB
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000406 "punpcklbw %%xmm1,%%xmm0 \n"
407 "punpcklbw %%xmm5,%%xmm2 \n"
408 "movdqa %%xmm0,%%xmm1 \n"
409 "punpcklwd %%xmm2,%%xmm0 \n"
410 "movdqa %%xmm0,(%3) \n"
411 "punpckhwd %%xmm2,%%xmm1 \n"
412 "movdqa %%xmm1,0x10(%3) \n"
413 "lea 0x20(%3),%3 \n"
414 "sub $0x8,%4 \n"
415 "ja 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000416 : "+r"(y_buf), // %0
417 "+r"(u_buf), // %1
418 "+r"(v_buf), // %2
419 "+r"(rgb_buf), // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000420 "+rm"(width) // %4
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000421 : "r"(&kYuvConstants.kUVToB) // %5
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000422 : "memory", "cc"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000423#if defined(__SSE2__)
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000424 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000425#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000426 );
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000427}
428
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000429void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, // rdi
430 const uint8* u_buf, // rsi
431 const uint8* v_buf, // rdx
432 uint8* rgb_buf, // rcx
433 int width) { // r8
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000434 asm volatile (
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000435 "sub %1,%2 \n"
436 "pcmpeqb %%xmm5,%%xmm5 \n"
437 "pxor %%xmm4,%%xmm4 \n"
438
439 "1: \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000440 YUVTORGB
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000441 "pcmpeqb %%xmm5,%%xmm5 \n"
442 "punpcklbw %%xmm0,%%xmm1 \n"
443 "punpcklbw %%xmm2,%%xmm5 \n"
444 "movdqa %%xmm5,%%xmm0 \n"
445 "punpcklwd %%xmm1,%%xmm5 \n"
446 "movdqa %%xmm5,(%3) \n"
447 "punpckhwd %%xmm1,%%xmm0 \n"
448 "movdqa %%xmm0,0x10(%3) \n"
449 "lea 0x20(%3),%3 \n"
450 "sub $0x8,%4 \n"
451 "ja 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000452 : "+r"(y_buf), // %0
453 "+r"(u_buf), // %1
454 "+r"(v_buf), // %2
455 "+r"(rgb_buf), // %3
456 "+rm"(width) // %4
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000457 : "r"(&kYuvConstants.kUVToB) // %5
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000458 : "memory", "cc"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000459#if defined(__SSE2__)
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000460 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000461#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000462 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000463}
464
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000465void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, // rdi
466 const uint8* u_buf, // rsi
467 const uint8* v_buf, // rdx
468 uint8* rgb_buf, // rcx
469 int width) { // r8
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000470 asm volatile (
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000471 "sub %1,%2 \n"
472 "pcmpeqb %%xmm5,%%xmm5 \n"
473 "pxor %%xmm4,%%xmm4 \n"
474
475 "1: \n"
476 YUVTORGB
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000477 "punpcklbw %%xmm1,%%xmm2 \n"
478 "punpcklbw %%xmm5,%%xmm0 \n"
479 "movdqa %%xmm2,%%xmm1 \n"
480 "punpcklwd %%xmm0,%%xmm2 \n"
481 "movdqa %%xmm2,(%3) \n"
482 "punpckhwd %%xmm0,%%xmm1 \n"
483 "movdqa %%xmm1,0x10(%3) \n"
484 "lea 0x20(%3),%3 \n"
485 "sub $0x8,%4 \n"
486 "ja 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000487 : "+r"(y_buf), // %0
488 "+r"(u_buf), // %1
489 "+r"(v_buf), // %2
490 "+r"(rgb_buf), // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000491 "+rm"(width) // %4
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000492 : "r"(&kYuvConstants.kUVToB) // %5
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000493 : "memory", "cc"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000494#if defined(__SSE2__)
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000495 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000496#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000497 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000498}
499
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000500void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
501 const uint8* u_buf, // rsi
502 const uint8* v_buf, // rdx
503 uint8* rgb_buf, // rcx
504 int width) { // r8
505 asm volatile (
506 "sub %1,%2 \n"
507 "pcmpeqb %%xmm5,%%xmm5 \n"
508 "pxor %%xmm4,%%xmm4 \n"
509
510 "1: \n"
511 "movd (%1),%%xmm0 \n"
512 "movd (%1,%2,1),%%xmm1 \n"
513 "lea 0x4(%1),%1 \n"
514 "punpcklbw %%xmm1,%%xmm0 \n"
515 "movdqa %%xmm0,%%xmm1 \n"
516 "movdqa %%xmm0,%%xmm2 \n"
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000517 "pmaddubsw (%5),%%xmm0 \n"
518 "pmaddubsw 16(%5),%%xmm1 \n"
519 "pmaddubsw 32(%5),%%xmm2 \n"
520 "psubw 48(%5),%%xmm0 \n"
521 "psubw 64(%5),%%xmm1 \n"
522 "psubw 80(%5),%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000523 "movd (%0),%%xmm3 \n"
524 "lea 0x4(%0),%0 \n"
525 "punpcklbw %%xmm4,%%xmm3 \n"
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000526 "psubsw 96(%5),%%xmm3 \n"
527 "pmullw 112(%5),%%xmm3 \n"
fbarchard@google.combc8f28e2011-11-18 21:03:54 +0000528 "paddsw %%xmm3,%%xmm0 \n"
529 "paddsw %%xmm3,%%xmm1 \n"
530 "paddsw %%xmm3,%%xmm2 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000531 "psraw $0x6,%%xmm0 \n"
532 "psraw $0x6,%%xmm1 \n"
533 "psraw $0x6,%%xmm2 \n"
534 "packuswb %%xmm0,%%xmm0 \n"
535 "packuswb %%xmm1,%%xmm1 \n"
536 "packuswb %%xmm2,%%xmm2 \n"
537 "punpcklbw %%xmm1,%%xmm0 \n"
538 "punpcklbw %%xmm5,%%xmm2 \n"
539 "punpcklwd %%xmm2,%%xmm0 \n"
540 "movdqa %%xmm0,(%3) \n"
541 "lea 0x10(%3),%3 \n"
542 "sub $0x4,%4 \n"
543 "ja 1b \n"
544 : "+r"(y_buf), // %0
545 "+r"(u_buf), // %1
546 "+r"(v_buf), // %2
547 "+r"(rgb_buf), // %3
548 "+rm"(width) // %4
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +0000549 : "r"(&kYuvConstants.kUVToB) // %5
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000550 : "memory", "cc"
551#if defined(__SSE2__)
552 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
553#endif
554 );
555}
556#endif
557
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
// Converts a row of Y samples to gray 32-bit pixels: each Y is scaled as
// ((y - 16) * kYToRgb) >> 6, saturated to a byte, replicated into all bytes
// of the pixel, and the top byte is then forced to 0xff.
//
//   y_buf    Y samples; 8 read per iteration with movq (no alignment needed).
//   rgb_buf  destination; stored with movdqa, must be 16-byte aligned.
//   width    pixel count. Eight pixels are written per iteration and the
//            count is tested after the body, so it is in effect rounded up
//            to a multiple of 8. Must be positive.
//
// NOTE(review): this function reads kYuvConstants, which in this file is
// defined inside the HAS_FASTCONVERTYUVTOARGBROW_SSSE3 section - confirm the
// two feature macros are always defined together.
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"     // xmm5 = 0xff000000 in every dword
    "pxor %%xmm4,%%xmm4 \n"     // xmm4 = 0, used to widen Y
    "movdqa %3,%%xmm3 \n"       // xmm3 = kYSub16
    "movdqa %4,%%xmm2 \n"       // xmm2 = kYToRgb

  "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm4,%%xmm0 \n"
    "psubsw %%xmm3,%%xmm0 \n"
    "pmullw %%xmm2,%%xmm0 \n"
    "psraw $0x6,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "por %%xmm5,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm1,16(%1) \n"
    "lea 32(%1),%1 \n"

    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  : "m"(kYuvConstants.kYSub16),  // %3
    "m"(kYuvConstants.kYToRgb)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000604
#ifdef HAS_ARGBTOYROW_SSSE3
// Computes Y for a row of ABGR pixels in two passes: the row is first
// reordered to ARGB into an aligned stack buffer, then ARGBToYRow_SSSE3 is
// run on that buffer.
// NOTE(review): the buffer is kMaxStride bytes and the pixel count is not
// checked against it - presumably callers guarantee the converted row fits.
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  ABGRToARGBRow_SSSE3(src_argb, row, pix);
  ARGBToYRow_SSSE3(row, dst_y, pix);
}

// Computes Y for a row of BGRA pixels in two passes: the row is first
// reordered to ARGB into an aligned stack buffer, then ARGBToYRow_SSSE3 is
// run on that buffer.
// NOTE(review): the buffer is kMaxStride bytes and the pixel count is not
// checked against it - presumably callers guarantee the converted row fits.
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  BGRAToARGBRow_SSSE3(src_argb, row, pix);
  ARGBToYRow_SSSE3(row, dst_y, pix);
}
#endif
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000618
#ifdef HAS_ARGBTOUVROW_SSSE3
// Computes subsampled U and V for two adjacent rows of ABGR pixels.
// Both rows (src_argb and src_argb + src_stride_argb) are reordered to ARGB
// into one aligned stack buffer, the second row kMaxStride bytes after the
// first, and ARGBToUVRow_SSSE3 is then run with kMaxStride as the stride.
// NOTE(review): each half of the buffer is kMaxStride bytes and the pixel
// count is not checked against it - presumably callers guarantee the fit.
void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
  ABGRToARGBRow_SSSE3(src_argb, row, pix);
  ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}

// Computes subsampled U and V for two adjacent rows of BGRA pixels.
// Both rows (src_argb and src_argb + src_stride_argb) are reordered to ARGB
// into one aligned stack buffer, the second row kMaxStride bytes after the
// first, and ARGBToUVRow_SSSE3 is then run with kMaxStride as the stride.
// NOTE(review): each half of the buffer is kMaxStride bytes and the pixel
// count is not checked against it - presumably callers guarantee the fit.
void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
  BGRAToARGBRow_SSSE3(src_argb, row, pix);
  BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000636
#ifdef HAS_REVERSE_ROW_SSSE3

// Shuffle table for reversing the bytes.
static const uvec8 kShuffleReverse = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Writes the bytes of src to dst in reverse order.
//
// Reading starts at the last 16 bytes of the source (src + width - 16) and
// walks backwards, while writing walks forwards from dst; each 16-byte group
// is byte-reversed with pshufb.
//
//   src    source bytes; every 16-byte group is loaded with movdqa, so
//          src + width must be 16-byte aligned.
//   dst    destination; stored with movdqa, must be 16-byte aligned.
//   width  byte count. Sixteen bytes are handled per iteration and the count
//          is tested after the body. Must be positive.
//          NOTE(review): presumably expected to be a multiple of 16, since
//          the final group would otherwise be read from before src - confirm
//          against callers.
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
  // Widened so the count can be used as an index register in the lea below.
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
  "movdqa %3,%%xmm5 \n"         // xmm5 = byte-reverse shuffle control
  "lea -0x10(%0,%2,1),%0 \n"    // %0 = src + width - 16
"1: \n"
  "movdqa (%0),%%xmm0 \n"
  "lea -0x10(%0),%0 \n"
  "pshufb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleReverse)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif
668
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000669} // extern "C"