/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include "row.h"
12
13extern "C" {
14
fbarchard@google.com585a1262011-10-28 23:51:08 +000015#ifdef HAS_ARGBTOYROW_SSSE3
16
17// Constant multiplication table for converting ARGB to I400.
18extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
19 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
20};
21
22extern "C" TALIGN16(const uint8, kAdd16[16]) = {
23 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
24};
25
fbarchard@google.com9394ed92011-10-31 21:36:47 +000026// Shuffle table for converting BG24 to ARGB.
27extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
28 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
29};
30
31// Shuffle table for converting RAW to ARGB.
32extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
33 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
34};
35
fbarchard@google.com585a1262011-10-28 23:51:08 +000036void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
37 asm volatile(
38 "movdqa (%3),%%xmm7\n"
39 "movdqa (%4),%%xmm6\n"
40 "movdqa %%xmm6,%%xmm5\n"
41 "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte.
42"1:"
43 "movdqa (%0),%%xmm0\n"
44 "pmaddubsw %%xmm7,%%xmm0\n"
45 "movdqa 0x10(%0),%%xmm1\n"
46 "psrlw $0x7,%%xmm0\n"
47 "pmaddubsw %%xmm7,%%xmm1\n"
48 "lea 0x20(%0),%0\n"
49 "psrlw $0x7,%%xmm1\n"
50 "packuswb %%xmm1,%%xmm0\n"
51 "pmaddubsw %%xmm6,%%xmm0\n"
52 "packuswb %%xmm0,%%xmm0\n"
53 "paddb %%xmm5,%%xmm0\n"
54 "movq %%xmm0,(%1)\n"
55 "lea 0x8(%1),%1\n"
56 "sub $0x8,%2\n"
57 "ja 1b\n"
58 : "+r"(src_argb), // %0
59 "+r"(dst_y), // %1
60 "+r"(pix) // %2
61 : "r"(kMultiplyMaskARGBToI400), // %3
62 "r"(kAdd16) // %4
63 : "memory"
64);
65}
66#endif
67
fbarchard@google.com9394ed92011-10-31 21:36:47 +000068#ifdef HAS_BG24TOARGBROW_SSSE3
69void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
70 asm volatile(
71 "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
72 "pslld $0x18,%%xmm7\n"
73 "movdqa (%3),%%xmm6\n"
74"1:"
75 "movdqa (%0),%%xmm0\n"
76 "movdqa 0x10(%0),%%xmm1\n"
77 "movdqa 0x20(%0),%%xmm3\n"
78 "lea 0x30(%0),%0\n"
79 "movdqa %%xmm3,%%xmm2\n"
80 "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
81 "pshufb %%xmm6,%%xmm2\n"
82 "por %%xmm7,%%xmm2\n"
83 "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
84 "pshufb %%xmm6,%%xmm0\n"
85 "movdqa %%xmm2,0x20(%1)\n"
86 "por %%xmm7,%%xmm0\n"
87 "pshufb %%xmm6,%%xmm1\n"
88 "movdqa %%xmm0,(%1)\n"
89 "por %%xmm7,%%xmm1\n"
90 "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
91 "pshufb %%xmm6,%%xmm3\n"
92 "movdqa %%xmm1,0x10(%1)\n"
93 "por %%xmm7,%%xmm3\n"
94 "movdqa %%xmm3,0x30(%1)\n"
95 "lea 0x40(%1),%1\n"
96 "sub $0x10,%2\n"
97 "ja 1b\n"
98 : "+r"(src_bg24), // %0
99 "+r"(dst_argb), // %1
100 "+r"(pix) // %2
101 : "r"(kShuffleMaskBG24ToARGB) // %3
102 : "memory"
103);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000104}
105
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000106void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
107 asm volatile(
108 "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
109 "pslld $0x18,%%xmm7\n"
110 "movdqa (%3),%%xmm6\n"
111"1:"
112 "movdqa (%0),%%xmm0\n"
113 "movdqa 0x10(%0),%%xmm1\n"
114 "movdqa 0x20(%0),%%xmm3\n"
115 "lea 0x30(%0),%0\n"
116 "movdqa %%xmm3,%%xmm2\n"
117 "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
118 "pshufb %%xmm6,%%xmm2\n"
119 "por %%xmm7,%%xmm2\n"
120 "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
121 "pshufb %%xmm6,%%xmm0\n"
122 "movdqa %%xmm2,0x20(%1)\n"
123 "por %%xmm7,%%xmm0\n"
124 "pshufb %%xmm6,%%xmm1\n"
125 "movdqa %%xmm0,(%1)\n"
126 "por %%xmm7,%%xmm1\n"
127 "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
128 "pshufb %%xmm6,%%xmm3\n"
129 "movdqa %%xmm1,0x10(%1)\n"
130 "por %%xmm7,%%xmm3\n"
131 "movdqa %%xmm3,0x30(%1)\n"
132 "lea 0x40(%1),%1\n"
133 "sub $0x10,%2\n"
134 "ja 1b\n"
135 : "+r"(src_raw), // %0
136 "+r"(dst_argb), // %1
137 "+r"(pix) // %2
138 : "r"(kShuffleMaskRAWToARGB) // %3
139 : "memory"
140);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000141}
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000142#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +0000143
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000144#if defined(__x86_64__)
145
146// 64 bit linux gcc version
147
148void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
149 const uint8* u_buf, // rsi
150 const uint8* v_buf, // rdx
151 uint8* rgb_buf, // rcx
152 int width) { // r8
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000153 asm volatile(
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000154"1:"
155 "movzb (%1),%%r10\n"
156 "lea 1(%1),%1\n"
157 "movzb (%2),%%r11\n"
158 "lea 1(%2),%2\n"
159 "movq 2048(%5,%%r10,8),%%xmm0\n"
160 "movzb (%0),%%r10\n"
161 "movq 4096(%5,%%r11,8),%%xmm1\n"
162 "movzb 0x1(%0),%%r11\n"
163 "paddsw %%xmm1,%%xmm0\n"
164 "movq (%5,%%r10,8),%%xmm2\n"
165 "lea 2(%0),%0\n"
166 "movq (%5,%%r11,8),%%xmm3\n"
167 "paddsw %%xmm0,%%xmm2\n"
168 "paddsw %%xmm0,%%xmm3\n"
169 "shufps $0x44,%%xmm3,%%xmm2\n"
170 "psraw $0x6,%%xmm2\n"
171 "packuswb %%xmm2,%%xmm2\n"
172 "movq %%xmm2,0x0(%3)\n"
173 "lea 8(%3),%3\n"
174 "sub $0x2,%4\n"
175 "ja 1b\n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000176 : "+r"(y_buf), // %0
177 "+r"(u_buf), // %1
178 "+r"(v_buf), // %2
179 "+r"(rgb_buf), // %3
180 "+r"(width) // %4
181 : "r" (_kCoefficientsRgbY) // %5
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000182 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
183);
184}
185
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000186void FastConvertYUVToBGRARow(const uint8* y_buf, // rdi
187 const uint8* u_buf, // rsi
188 const uint8* v_buf, // rdx
189 uint8* rgb_buf, // rcx
190 int width) { // r8
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000191 asm volatile(
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000192"1:"
193 "movzb (%1),%%r10\n"
194 "lea 1(%1),%1\n"
195 "movzb (%2),%%r11\n"
196 "lea 1(%2),%2\n"
197 "movq 2048(%5,%%r10,8),%%xmm0\n"
198 "movzb (%0),%%r10\n"
199 "movq 4096(%5,%%r11,8),%%xmm1\n"
200 "movzb 0x1(%0),%%r11\n"
201 "paddsw %%xmm1,%%xmm0\n"
202 "movq (%5,%%r10,8),%%xmm2\n"
203 "lea 2(%0),%0\n"
204 "movq (%5,%%r11,8),%%xmm3\n"
205 "paddsw %%xmm0,%%xmm2\n"
206 "paddsw %%xmm0,%%xmm3\n"
207 "shufps $0x44,%%xmm3,%%xmm2\n"
208 "psraw $0x6,%%xmm2\n"
209 "packuswb %%xmm2,%%xmm2\n"
210 "movq %%xmm2,0x0(%3)\n"
211 "lea 8(%3),%3\n"
212 "sub $0x2,%4\n"
213 "ja 1b\n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000214 : "+r"(y_buf), // %0
215 "+r"(u_buf), // %1
216 "+r"(v_buf), // %2
217 "+r"(rgb_buf), // %3
218 "+r"(width) // %4
219 : "r" (_kCoefficientsBgraY) // %5
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000220 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
221);
222}
223
224void FastConvertYUVToABGRRow(const uint8* y_buf, // rdi
225 const uint8* u_buf, // rsi
226 const uint8* v_buf, // rdx
227 uint8* rgb_buf, // rcx
228 int width) { // r8
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000229 asm volatile(
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000230"1:"
231 "movzb (%1),%%r10\n"
232 "lea 1(%1),%1\n"
233 "movzb (%2),%%r11\n"
234 "lea 1(%2),%2\n"
235 "movq 2048(%5,%%r10,8),%%xmm0\n"
236 "movzb (%0),%%r10\n"
237 "movq 4096(%5,%%r11,8),%%xmm1\n"
238 "movzb 0x1(%0),%%r11\n"
239 "paddsw %%xmm1,%%xmm0\n"
240 "movq (%5,%%r10,8),%%xmm2\n"
241 "lea 2(%0),%0\n"
242 "movq (%5,%%r11,8),%%xmm3\n"
243 "paddsw %%xmm0,%%xmm2\n"
244 "paddsw %%xmm0,%%xmm3\n"
245 "shufps $0x44,%%xmm3,%%xmm2\n"
246 "psraw $0x6,%%xmm2\n"
247 "packuswb %%xmm2,%%xmm2\n"
248 "movq %%xmm2,0x0(%3)\n"
249 "lea 8(%3),%3\n"
250 "sub $0x2,%4\n"
251 "ja 1b\n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000252 : "+r"(y_buf), // %0
253 "+r"(u_buf), // %1
254 "+r"(v_buf), // %2
255 "+r"(rgb_buf), // %3
256 "+r"(width) // %4
257 : "r" (_kCoefficientsAbgrY) // %5
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000258 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
259);
260}
261
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000262void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi
263 const uint8* u_buf, // rsi
264 const uint8* v_buf, // rdx
265 uint8* rgb_buf, // rcx
266 int width) { // r8
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000267 asm volatile(
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000268"1:"
269 "movzb (%1),%%r10\n"
270 "lea 1(%1),%1\n"
271 "movzb (%2),%%r11\n"
272 "lea 1(%2),%2\n"
273 "movq 2048(%5,%%r10,8),%%xmm0\n"
274 "movzb (%0),%%r10\n"
275 "movq 4096(%5,%%r11,8),%%xmm1\n"
276 "paddsw %%xmm1,%%xmm0\n"
277 "movq (%5,%%r10,8),%%xmm2\n"
278 "lea 1(%0),%0\n"
279 "paddsw %%xmm0,%%xmm2\n"
280 "shufps $0x44,%%xmm2,%%xmm2\n"
281 "psraw $0x6,%%xmm2\n"
282 "packuswb %%xmm2,%%xmm2\n"
283 "movd %%xmm2,0x0(%3)\n"
284 "lea 4(%3),%3\n"
285 "sub $0x1,%4\n"
286 "ja 1b\n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000287 : "+r"(y_buf), // %0
288 "+r"(u_buf), // %1
289 "+r"(v_buf), // %2
290 "+r"(rgb_buf), // %3
291 "+r"(width) // %4
292 : "r" (_kCoefficientsRgbY) // %5
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000293 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
294);
295}
296
297void FastConvertYToRGB32Row(const uint8* y_buf, // rdi
298 uint8* rgb_buf, // rcx
299 int width) { // r8
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000300 asm volatile(
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000301"1:"
302 "movzb (%0),%%r10\n"
303 "movzb 0x1(%0),%%r11\n"
304 "movq (%3,%%r10,8),%%xmm2\n"
305 "lea 2(%0),%0\n"
306 "movq (%3,%%r11,8),%%xmm3\n"
307 "shufps $0x44,%%xmm3,%%xmm2\n"
308 "psraw $0x6,%%xmm2\n"
309 "packuswb %%xmm2,%%xmm2\n"
310 "movq %%xmm2,0x0(%1)\n"
311 "lea 8(%1),%1\n"
312 "sub $0x2,%2\n"
313 "ja 1b\n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000314 : "+r"(y_buf), // %0
315 "+r"(rgb_buf), // %1
316 "+r"(width) // %2
317 : "r" (_kCoefficientsRgbY) // %3
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000318 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
319);
320}
321
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000322#elif defined(__i386__)
323// 32 bit gcc version
324
325void FastConvertYUVToRGB32Row(const uint8* y_buf,
326 const uint8* u_buf,
327 const uint8* v_buf,
328 uint8* rgb_buf,
329 int width);
330 asm(
331 ".text\n"
332#if defined(OSX) || defined(IOS)
333 ".globl _FastConvertYUVToRGB32Row\n"
334"_FastConvertYUVToRGB32Row:\n"
335#else
336 ".global FastConvertYUVToRGB32Row\n"
337"FastConvertYUVToRGB32Row:\n"
338#endif
339 "pusha\n"
340 "mov 0x24(%esp),%edx\n"
341 "mov 0x28(%esp),%edi\n"
342 "mov 0x2c(%esp),%esi\n"
343 "mov 0x30(%esp),%ebp\n"
344 "mov 0x34(%esp),%ecx\n"
345
346"1:"
347 "movzbl (%edi),%eax\n"
348 "lea 1(%edi),%edi\n"
349 "movzbl (%esi),%ebx\n"
350 "lea 1(%esi),%esi\n"
351 "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
352 "movzbl (%edx),%eax\n"
353 "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
354 "movzbl 0x1(%edx),%ebx\n"
355 "movq _kCoefficientsRgbY(,%eax,8),%mm1\n"
356 "lea 2(%edx),%edx\n"
357 "movq _kCoefficientsRgbY(,%ebx,8),%mm2\n"
358 "paddsw %mm0,%mm1\n"
359 "paddsw %mm0,%mm2\n"
360 "psraw $0x6,%mm1\n"
361 "psraw $0x6,%mm2\n"
362 "packuswb %mm2,%mm1\n"
363 "movntq %mm1,0x0(%ebp)\n"
364 "lea 8(%ebp),%ebp\n"
365 "sub $0x2,%ecx\n"
366 "ja 1b\n"
367 "popa\n"
368 "ret\n"
369);
370
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000371void FastConvertYUVToBGRARow(const uint8* y_buf,
372 const uint8* u_buf,
373 const uint8* v_buf,
374 uint8* rgb_buf,
375 int width);
376 asm(
377 ".text\n"
378#if defined(OSX) || defined(IOS)
379 ".globl _FastConvertYUVToBGRARow\n"
380"_FastConvertYUVToBGRARow:\n"
381#else
382 ".global FastConvertYUVToBGRARow\n"
383"FastConvertYUVToBGRARow:\n"
384#endif
385 "pusha\n"
386 "mov 0x24(%esp),%edx\n"
387 "mov 0x28(%esp),%edi\n"
388 "mov 0x2c(%esp),%esi\n"
389 "mov 0x30(%esp),%ebp\n"
390 "mov 0x34(%esp),%ecx\n"
391
392"1:"
393 "movzbl (%edi),%eax\n"
394 "lea 1(%edi),%edi\n"
395 "movzbl (%esi),%ebx\n"
396 "lea 1(%esi),%esi\n"
397 "movq _kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
398 "movzbl (%edx),%eax\n"
399 "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
400 "movzbl 0x1(%edx),%ebx\n"
401 "movq _kCoefficientsBgraY(,%eax,8),%mm1\n"
402 "lea 2(%edx),%edx\n"
403 "movq _kCoefficientsBgraY(,%ebx,8),%mm2\n"
404 "paddsw %mm0,%mm1\n"
405 "paddsw %mm0,%mm2\n"
406 "psraw $0x6,%mm1\n"
407 "psraw $0x6,%mm2\n"
408 "packuswb %mm2,%mm1\n"
409 "movntq %mm1,0x0(%ebp)\n"
410 "lea 8(%ebp),%ebp\n"
411 "sub $0x2,%ecx\n"
412 "ja 1b\n"
413 "popa\n"
414 "ret\n"
415);
416
417void FastConvertYUVToABGRRow(const uint8* y_buf,
418 const uint8* u_buf,
419 const uint8* v_buf,
420 uint8* rgb_buf,
421 int width);
422 asm(
423 ".text\n"
424#if defined(OSX) || defined(IOS)
425 ".globl _FastConvertYUVToABGRRow\n"
426"_FastConvertYUVToABGRRow:\n"
427#else
428 ".global FastConvertYUVToABGRRow\n"
429"FastConvertYUVToABGRRow:\n"
430#endif
431 "pusha\n"
432 "mov 0x24(%esp),%edx\n"
433 "mov 0x28(%esp),%edi\n"
434 "mov 0x2c(%esp),%esi\n"
435 "mov 0x30(%esp),%ebp\n"
436 "mov 0x34(%esp),%ecx\n"
437
438"1:"
439 "movzbl (%edi),%eax\n"
440 "lea 1(%edi),%edi\n"
441 "movzbl (%esi),%ebx\n"
442 "lea 1(%esi),%esi\n"
443 "movq _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
444 "movzbl (%edx),%eax\n"
445 "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
446 "movzbl 0x1(%edx),%ebx\n"
447 "movq _kCoefficientsAbgrY(,%eax,8),%mm1\n"
448 "lea 2(%edx),%edx\n"
449 "movq _kCoefficientsAbgrY(,%ebx,8),%mm2\n"
450 "paddsw %mm0,%mm1\n"
451 "paddsw %mm0,%mm2\n"
452 "psraw $0x6,%mm1\n"
453 "psraw $0x6,%mm2\n"
454 "packuswb %mm2,%mm1\n"
455 "movntq %mm1,0x0(%ebp)\n"
456 "lea 8(%ebp),%ebp\n"
457 "sub $0x2,%ecx\n"
458 "ja 1b\n"
459 "popa\n"
460 "ret\n"
461);
462
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000463void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
464 const uint8* u_buf,
465 const uint8* v_buf,
466 uint8* rgb_buf,
467 int width);
468 asm(
469 ".text\n"
470#if defined(OSX) || defined(IOS)
471 ".globl _FastConvertYUV444ToRGB32Row\n"
472"_FastConvertYUV444ToRGB32Row:\n"
473#else
474 ".global FastConvertYUV444ToRGB32Row\n"
475"FastConvertYUV444ToRGB32Row:\n"
476#endif
477 "pusha\n"
478 "mov 0x24(%esp),%edx\n"
479 "mov 0x28(%esp),%edi\n"
480 "mov 0x2c(%esp),%esi\n"
481 "mov 0x30(%esp),%ebp\n"
482 "mov 0x34(%esp),%ecx\n"
483
484"1:"
485 "movzbl (%edi),%eax\n"
486 "lea 1(%edi),%edi\n"
487 "movzbl (%esi),%ebx\n"
488 "lea 1(%esi),%esi\n"
489 "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
490 "movzbl (%edx),%eax\n"
491 "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
492 "lea 1(%edx),%edx\n"
493 "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"
494 "psraw $0x6,%mm0\n"
495 "packuswb %mm0,%mm0\n"
496 "movd %mm0,0x0(%ebp)\n"
497 "lea 4(%ebp),%ebp\n"
498 "sub $0x1,%ecx\n"
499 "ja 1b\n"
500 "popa\n"
501 "ret\n"
502);
503
504void FastConvertYToRGB32Row(const uint8* y_buf,
505 uint8* rgb_buf,
506 int width);
507 asm(
508 ".text\n"
509#if defined(OSX) || defined(IOS)
510 ".globl _FastConvertYToRGB32Row\n"
511"_FastConvertYToRGB32Row:\n"
512#else
513 ".global FastConvertYToRGB32Row\n"
514"FastConvertYToRGB32Row:\n"
515#endif
516 "push %ebx\n"
517 "mov 0x8(%esp),%eax\n"
518 "mov 0xc(%esp),%edx\n"
519 "mov 0x10(%esp),%ecx\n"
520
521"1:"
522 "movzbl (%eax),%ebx\n"
523 "movq _kCoefficientsRgbY(,%ebx,8),%mm0\n"
524 "psraw $0x6,%mm0\n"
525 "movzbl 0x1(%eax),%ebx\n"
526 "movq _kCoefficientsRgbY(,%ebx,8),%mm1\n"
527 "psraw $0x6,%mm1\n"
528 "packuswb %mm1,%mm0\n"
529 "lea 0x2(%eax),%eax\n"
530 "movq %mm0,(%edx)\n"
531 "lea 0x8(%edx),%edx\n"
532 "sub $0x2,%ecx\n"
533 "ja 1b\n"
534 "pop %ebx\n"
535 "ret\n"
536);
537
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000538#else
// C reference code that mimics the YUV assembly.
// Scalar stand-ins for the SIMD instructions of the same names.  They are
// functions rather than macros so that each argument is evaluated exactly
// once and is type-checked.

// Clamps x to the unsigned byte range [0, 255].
static inline int packuswb(int x) {
  if (x < 0) {
    return 0;
  }
  return x > 255 ? 255 : x;
}

// Adds x and y and clamps the sum to the signed 16-bit range
// [-32768, 32767].
static inline int paddsw(int x, int y) {
  const int sum = x + y;
  if (sum < -32768) {
    return -32768;
  }
  return sum > 32767 ? 32767 : sum;
}
543
544static inline void YuvPixel(uint8 y,
545 uint8 u,
546 uint8 v,
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000547 uint8* rgb_buf,
548 int ashift,
549 int rshift,
550 int gshift,
551 int bshift) {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000552
553 int b = _kCoefficientsRgbY[256+u][0];
554 int g = _kCoefficientsRgbY[256+u][1];
555 int r = _kCoefficientsRgbY[256+u][2];
556 int a = _kCoefficientsRgbY[256+u][3];
557
558 b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
559 g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
560 r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
561 a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
562
563 b = paddsw(b, _kCoefficientsRgbY[y][0]);
564 g = paddsw(g, _kCoefficientsRgbY[y][1]);
565 r = paddsw(r, _kCoefficientsRgbY[y][2]);
566 a = paddsw(a, _kCoefficientsRgbY[y][3]);
567
568 b >>= 6;
569 g >>= 6;
570 r >>= 6;
571 a >>= 6;
572
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000573 *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
574 (packuswb(g) << gshift) |
575 (packuswb(r) << rshift) |
576 (packuswb(a) << ashift);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000577}
578
579void FastConvertYUVToRGB32Row(const uint8* y_buf,
580 const uint8* u_buf,
581 const uint8* v_buf,
582 uint8* rgb_buf,
583 int width) {
584 for (int x = 0; x < width; x += 2) {
585 uint8 u = u_buf[x >> 1];
586 uint8 v = v_buf[x >> 1];
587 uint8 y0 = y_buf[x];
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000588 YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000589 if ((x + 1) < width) {
590 uint8 y1 = y_buf[x + 1];
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000591 YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
592 }
593 rgb_buf += 8; // Advance 2 pixels.
594 }
595}
596
597void FastConvertYUVToBGRARow(const uint8* y_buf,
598 const uint8* u_buf,
599 const uint8* v_buf,
600 uint8* rgb_buf,
601 int width) {
602 for (int x = 0; x < width; x += 2) {
603 uint8 u = u_buf[x >> 1];
604 uint8 v = v_buf[x >> 1];
605 uint8 y0 = y_buf[x];
606 YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
607 if ((x + 1) < width) {
608 uint8 y1 = y_buf[x + 1];
609 YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
610 }
611 rgb_buf += 8; // Advance 2 pixels.
612 }
613}
614
615void FastConvertYUVToABGRRow(const uint8* y_buf,
616 const uint8* u_buf,
617 const uint8* v_buf,
618 uint8* rgb_buf,
619 int width) {
620 for (int x = 0; x < width; x += 2) {
621 uint8 u = u_buf[x >> 1];
622 uint8 v = v_buf[x >> 1];
623 uint8 y0 = y_buf[x];
624 YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
625 if ((x + 1) < width) {
626 uint8 y1 = y_buf[x + 1];
627 YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000628 }
629 rgb_buf += 8; // Advance 2 pixels.
630 }
631}
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000632
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000633void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
634 const uint8* u_buf,
635 const uint8* v_buf,
636 uint8* rgb_buf,
637 int width) {
638 for (int x = 0; x < width; ++x) {
639 uint8 u = u_buf[x];
640 uint8 v = v_buf[x];
641 uint8 y = y_buf[x];
frkoenig@google.com3dcaf732011-10-20 23:42:36 +0000642 YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000643 rgb_buf += 4; // Advance 1 pixel.
644 }
645}
646
647void FastConvertYToRGB32Row(const uint8* y_buf,
648 uint8* rgb_buf,
649 int width) {
650 for (int x = 0; x < width; ++x) {
651 uint8 y = y_buf[x];
frkoenig@google.com3dcaf732011-10-20 23:42:36 +0000652 YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000653 rgb_buf += 4; // Advance 1 pixel.
654 }
655}
656
657#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000658
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000659} // extern "C"