/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "row.h"

extern "C" {

#ifdef HAS_ARGBTOYROW_SSSE3

17// Constant multiplication table for converting ARGB to I400.
fbarchard@google.comb6149762011-11-07 21:58:52 +000018static const vec8 kARGBToY = {
19 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
fbarchard@google.com585a1262011-10-28 23:51:08 +000020};
21
fbarchard@google.comb6149762011-11-07 21:58:52 +000022static const uvec8 kAddY16 = {
23 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
24 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
fbarchard@google.com585a1262011-10-28 23:51:08 +000025};
26
#ifdef HAS_ARGBTOUVROW_SSSE3
// Signed per-byte multipliers used by ARGBToUVRow_SSSE3 to form U, repeated
// for four pixels.
static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Signed per-byte multipliers used by ARGBToUVRow_SSSE3 to form V.  The table
// holds negative values, so it is a signed vector like kARGBToU; the bytes
// stored in memory are the same as with the previous unsigned declaration.
static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Bias of 128 that ARGBToUVRow_SSSE3 adds to every output chroma byte.
static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
#endif

fbarchard@google.com9394ed92011-10-31 21:36:47 +000041// Shuffle table for converting BG24 to ARGB.
fbarchard@google.comb6149762011-11-07 21:58:52 +000042static const uvec8 kShuffleMaskBG24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000043 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
44};
45
46// Shuffle table for converting RAW to ARGB.
fbarchard@google.comb6149762011-11-07 21:58:52 +000047static const uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000048 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
49};
50
fbarchard@google.comb6149762011-11-07 21:58:52 +000051// Shuffle table for converting ABGR to ARGB.
52static const uvec8 kShuffleMaskABGRToARGB = {
53 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
54};
55
56// Shuffle table for converting BGRA to ARGB.
57static const uvec8 kShuffleMaskBGRAToARGB = {
58 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
59};
60
61void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.com585a1262011-10-28 23:51:08 +000062 asm volatile(
fbarchard@google.comb6149762011-11-07 21:58:52 +000063 "pcmpeqb %%xmm5,%%xmm5\n"
64 "pslld $0x18,%%xmm5\n"
fbarchard@google.com585a1262011-10-28 23:51:08 +000065"1:"
fbarchard@google.comb6149762011-11-07 21:58:52 +000066 "movq (%0),%%xmm0\n"
67 "lea 0x8(%0),%0\n"
68 "punpcklbw %%xmm0,%%xmm0\n"
69 "movdqa %%xmm0,%%xmm1\n"
70 "punpcklwd %%xmm0,%%xmm0\n"
71 "punpckhwd %%xmm1,%%xmm1\n"
72 "por %%xmm5,%%xmm0\n"
73 "por %%xmm5,%%xmm1\n"
74 "movdqa %%xmm0,(%1)\n"
75 "movdqa %%xmm1,0x10(%1)\n"
76 "lea 0x20(%1),%1\n"
fbarchard@google.com585a1262011-10-28 23:51:08 +000077 "sub $0x8,%2\n"
78 "ja 1b\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +000079 : "+r"(src_y), // %0
80 "+r"(dst_argb), // %1
81 "+r"(pix) // %2
82 :
83 : "memory", "cc"
84#if defined(__SSE2__)
85 , "xmm0", "xmm1", "xmm5"
86#endif
fbarchard@google.com585a1262011-10-28 23:51:08 +000087);
88}
fbarchard@google.comb6149762011-11-07 21:58:52 +000089
90void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
91 asm volatile(
92 "movdqa %3,%%xmm5\n"
93"1:"
94 "movdqa (%0),%%xmm0\n"
95 "lea 0x10(%0),%0\n"
96 "pshufb %%xmm5,%%xmm0\n"
97 "movdqa %%xmm0,(%1)\n"
98 "lea 0x10(%1),%1\n"
99 "sub $0x4,%2\n"
100 "ja 1b\n"
101 : "+r"(src_abgr), // %0
102 "+r"(dst_argb), // %1
103 "+r"(pix) // %2
104 : "m"(kShuffleMaskABGRToARGB) // %3
105 : "memory", "cc"
106#if defined(__SSE2__)
107 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000108#endif
109
fbarchard@google.comb6149762011-11-07 21:58:52 +0000110);
111}
112
113void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
114 asm volatile(
115 "movdqa %3,%%xmm5\n"
116"1:"
117 "movdqa (%0),%%xmm0\n"
118 "lea 0x10(%0),%0\n"
119 "pshufb %%xmm5,%%xmm0\n"
120 "movdqa %%xmm0,(%1)\n"
121 "lea 0x10(%1),%1\n"
122 "sub $0x4,%2\n"
123 "ja 1b\n"
124 : "+r"(src_bgra), // %0
125 "+r"(dst_argb), // %1
126 "+r"(pix) // %2
127 : "m"(kShuffleMaskBGRAToARGB) // %3
128 : "memory", "cc"
129#if defined(__SSE2__)
130 , "xmm0", "xmm5"
131#endif
132);
133}
134
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000135void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
136 asm volatile(
fbarchard@google.comb6149762011-11-07 21:58:52 +0000137 "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
138 "pslld $0x18,%%xmm5\n"
139 "movdqa %3,%%xmm4\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000140"1:"
141 "movdqa (%0),%%xmm0\n"
142 "movdqa 0x10(%0),%%xmm1\n"
143 "movdqa 0x20(%0),%%xmm3\n"
144 "lea 0x30(%0),%0\n"
145 "movdqa %%xmm3,%%xmm2\n"
146 "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
fbarchard@google.comb6149762011-11-07 21:58:52 +0000147 "pshufb %%xmm4,%%xmm2\n"
148 "por %%xmm5,%%xmm2\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000149 "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
fbarchard@google.comb6149762011-11-07 21:58:52 +0000150 "pshufb %%xmm4,%%xmm0\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000151 "movdqa %%xmm2,0x20(%1)\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000152 "por %%xmm5,%%xmm0\n"
153 "pshufb %%xmm4,%%xmm1\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000154 "movdqa %%xmm0,(%1)\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000155 "por %%xmm5,%%xmm1\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000156 "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
fbarchard@google.comb6149762011-11-07 21:58:52 +0000157 "pshufb %%xmm4,%%xmm3\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000158 "movdqa %%xmm1,0x10(%1)\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000159 "por %%xmm5,%%xmm3\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000160 "movdqa %%xmm3,0x30(%1)\n"
161 "lea 0x40(%1),%1\n"
162 "sub $0x10,%2\n"
163 "ja 1b\n"
164 : "+r"(src_bg24), // %0
165 "+r"(dst_argb), // %1
166 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000167 : "m"(kShuffleMaskBG24ToARGB) // %3
168 : "memory", "cc"
169#if defined(__SSE2__)
170 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
171#endif
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000172);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000173}
174
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000175void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
176 asm volatile(
fbarchard@google.comb6149762011-11-07 21:58:52 +0000177 "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
178 "pslld $0x18,%%xmm5\n"
179 "movdqa %3,%%xmm4\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000180"1:"
181 "movdqa (%0),%%xmm0\n"
182 "movdqa 0x10(%0),%%xmm1\n"
183 "movdqa 0x20(%0),%%xmm3\n"
184 "lea 0x30(%0),%0\n"
185 "movdqa %%xmm3,%%xmm2\n"
186 "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
fbarchard@google.comb6149762011-11-07 21:58:52 +0000187 "pshufb %%xmm4,%%xmm2\n"
188 "por %%xmm5,%%xmm2\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000189 "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
fbarchard@google.comb6149762011-11-07 21:58:52 +0000190 "pshufb %%xmm4,%%xmm0\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000191 "movdqa %%xmm2,0x20(%1)\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000192 "por %%xmm5,%%xmm0\n"
193 "pshufb %%xmm4,%%xmm1\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000194 "movdqa %%xmm0,(%1)\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000195 "por %%xmm5,%%xmm1\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000196 "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
fbarchard@google.comb6149762011-11-07 21:58:52 +0000197 "pshufb %%xmm4,%%xmm3\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000198 "movdqa %%xmm1,0x10(%1)\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000199 "por %%xmm5,%%xmm3\n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000200 "movdqa %%xmm3,0x30(%1)\n"
201 "lea 0x40(%1),%1\n"
202 "sub $0x10,%2\n"
203 "ja 1b\n"
204 : "+r"(src_raw), // %0
205 "+r"(dst_argb), // %1
206 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000207 : "m"(kShuffleMaskRAWToARGB) // %3
208 : "memory", "cc"
209#if defined(__SSE2__)
210 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
211#endif
212);
213}
214
215void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
216 asm volatile(
217 "movdqa %4,%%xmm5\n"
218 "movdqa %3,%%xmm4\n"
219"1:"
220 "movdqa (%0),%%xmm0\n"
221 "movdqa 0x10(%0),%%xmm1\n"
222 "movdqa 0x20(%0),%%xmm2\n"
223 "movdqa 0x30(%0),%%xmm3\n"
224 "pmaddubsw %%xmm4,%%xmm0\n"
225 "pmaddubsw %%xmm4,%%xmm1\n"
226 "pmaddubsw %%xmm4,%%xmm2\n"
227 "pmaddubsw %%xmm4,%%xmm3\n"
228 "lea 0x40(%0),%0\n"
229 "phaddw %%xmm1,%%xmm0\n"
230 "phaddw %%xmm3,%%xmm2\n"
231 "psrlw $0x7,%%xmm0\n"
232 "psrlw $0x7,%%xmm2\n"
233 "packuswb %%xmm2,%%xmm0\n"
234 "paddb %%xmm5,%%xmm0\n"
235 "movdqa %%xmm0,(%1)\n"
236 "lea 0x10(%1),%1\n"
237 "sub $0x10,%2\n"
238 "ja 1b\n"
239 : "+r"(src_argb), // %0
240 "+r"(dst_y), // %1
241 "+r"(pix) // %2
242 : "m"(kARGBToY), // %3
243 "m"(kAddY16) // %4
244 : "memory", "cc"
245#if defined(__SSE2__)
246 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
247#endif
248
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000249);
fbarchard@google.com585a1262011-10-28 23:51:08 +0000250}
#endif

#ifdef HAS_ARGBTOUVROW_SSSE3
// Converts two rows of 32-bit ARGB pixels to one row of subsampled U and V
// bytes.  Pixels are averaged vertically between the two rows and then
// horizontally in pairs; the averaged pixels are multiplied by kARGBToU /
// kARGBToV, summed, shifted right arithmetically by 8 and biased by
// kAddUV128.
//   src_argb0        first source row; loaded with movdqa (16-byte aligned).
//   src_stride_argb  byte offset from the first row to the second row; the
//                    second row is a pavgb memory operand.
//   dst_u, dst_v     destinations; 8 bytes are written to each per pass.
//   width            number of source pixels; 16 are handled per pass and the
//                    loop body runs before the count is tested.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile(
    "movdqa %5,%%xmm7\n"           // xmm7 = kARGBToU
    "movdqa %6,%%xmm6\n"           // xmm6 = kARGBToV
    "movdqa %7,%%xmm5\n"           // xmm5 = kAddUV128
    "sub %1,%2\n"                  // %2 = dst_v - dst_u
  "1:"
    "movdqa (%0),%%xmm0\n"         // load 16 pixels of the first row
    "movdqa 0x10(%0),%%xmm1\n"
    "movdqa 0x20(%0),%%xmm2\n"
    "movdqa 0x30(%0),%%xmm3\n"
    "pavgb (%0,%4,1),%%xmm0\n"     // average with the second row
    "pavgb 0x10(%0,%4,1),%%xmm1\n"
    "pavgb 0x20(%0,%4,1),%%xmm2\n"
    "pavgb 0x30(%0,%4,1),%%xmm3\n"
    "lea 0x40(%0),%0\n"
    "movdqa %%xmm0,%%xmm4\n"
    "shufps $0x88,%%xmm1,%%xmm0\n" // even pixels of xmm0/xmm1
    "shufps $0xdd,%%xmm1,%%xmm4\n" // odd pixels of xmm0/xmm1
    "pavgb %%xmm4,%%xmm0\n"        // horizontal average: 4 pixels
    "movdqa %%xmm2,%%xmm4\n"
    "shufps $0x88,%%xmm3,%%xmm2\n" // even pixels of xmm2/xmm3
    "shufps $0xdd,%%xmm3,%%xmm4\n" // odd pixels of xmm2/xmm3
    "pavgb %%xmm4,%%xmm2\n"        // horizontal average: 4 pixels
    "movdqa %%xmm0,%%xmm1\n"
    "movdqa %%xmm2,%%xmm3\n"
    "pmaddubsw %%xmm7,%%xmm0\n"    // U products
    "pmaddubsw %%xmm7,%%xmm2\n"
    "pmaddubsw %%xmm6,%%xmm1\n"    // V products
    "pmaddubsw %%xmm6,%%xmm3\n"
    "phaddw %%xmm2,%%xmm0\n"       // 8 U sums
    "phaddw %%xmm3,%%xmm1\n"       // 8 V sums
    "psraw $0x8,%%xmm0\n"
    "psraw $0x8,%%xmm1\n"
    "packsswb %%xmm1,%%xmm0\n"     // low 8 bytes = U, high 8 bytes = V
    "paddb %%xmm5,%%xmm0\n"        // add the bias
    "movlps %%xmm0,(%1)\n"         // store 8 U bytes
    "movhps %%xmm0,(%1,%2,1)\n"    // store 8 V bytes at dst_u + (dst_v - dst_u)
    "lea 0x8(%1),%1\n"
    "sub $0x10,%3\n"
    "ja 1b\n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb)),  // %4
    "m"(kARGBToU),    // %5
    "m"(kARGBToV),    // %6
    "m"(kAddUV128)    // %7
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif

// The following code requires 6 registers and prefers 7 registers.
// Using 7 registers requires -fpic to be off and -fomit-frame-pointer.
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
// Names of the two scratch general-purpose registers used by the YUV to RGB
// row code below.  They are used as index registers in address expressions,
// so the 64-bit names are selected on x86-64 and the 32-bit names otherwise.
#if defined(__x86_64__)
#define REG_a "rax"
#define REG_d "rdx"
#else
#define REG_a "eax"
#define REG_d "edx"
#endif

// OMITFP requests frame-pointer omission for a single function on builds that
// are neither Apple nor x86-64; elsewhere it expands to nothing.
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif

#if defined(__APPLE__)
// REG6 version uses 1 less register but is slower
#define REG6
#endif

// Loop bodies shared by the FastConvertYUVTo*Row_SSE2 functions.  Operands:
//   %0 = y_buf, %1 = u_buf, %2 = v_buf, %3 = rgb_buf, %4 = width,
//   %5 = base of a coefficient table with 8-byte entries, indexed by the Y
//        byte at offset 0, by the U byte at offset 2048 and by the V byte at
//        offset 4096.
// The three table entries are added with signed saturation, shifted right by
// 6 and packed to unsigned bytes.  One U and one V sample are shared by two
// horizontally adjacent Y samples.
//
// The register-name macros are separated from the neighbouring string
// literals by spaces so that C++11 compilers do not read them as literal
// suffixes; the concatenated assembly text is unchanged.
#ifdef REG6
// 6 register version only has REG_a for temporary
#define CLOBBER "%" REG_a
// This version produces 2 pixels
#define YUVTORGB \
 "1:" \
  "movzb (%1),%%" REG_a "\n" \
  "lea 1(%1),%1\n" \
  "movq 2048(%5,%%" REG_a ",8),%%xmm0\n" \
  "movzb (%2),%%" REG_a "\n" \
  "lea 1(%2),%2\n" \
  "movq 4096(%5,%%" REG_a ",8),%%xmm1\n" \
  "paddsw %%xmm1,%%xmm0\n" \
  "movzb (%0),%%" REG_a "\n" \
  "movq 0(%5,%%" REG_a ",8),%%xmm2\n" \
  "movzb 0x1(%0),%%" REG_a "\n" \
  "movq 0(%5,%%" REG_a ",8),%%xmm3\n" \
  "lea 2(%0),%0\n" \
  "paddsw %%xmm0,%%xmm2\n" \
  "paddsw %%xmm0,%%xmm3\n" \
  "shufps $0x44,%%xmm3,%%xmm2\n" \
  "psraw $0x6,%%xmm2\n" \
  "packuswb %%xmm2,%%xmm2\n" \
  "movq %%xmm2,0x0(%3)\n" \
  "lea 8(%3),%3\n" \
  "sub $0x2,%4\n" \
  "ja 1b\n"
#else
#define CLOBBER "%" REG_a, "%" REG_d
// This version produces 2 pixels
#define YUVTORGB \
"1:" \
  "movzb (%1),%%" REG_a "\n" \
  "lea 1(%1),%1\n" \
  "movzb (%2),%%" REG_d "\n" \
  "lea 1(%2),%2\n" \
  "movq 2048(%5,%%" REG_a ",8),%%xmm0\n" \
  "movzb 0(%0),%%" REG_a "\n" \
  "movq 4096(%5,%%" REG_d ",8),%%xmm1\n" \
  "paddsw %%xmm1,%%xmm0\n" \
  "movzb 1(%0),%%" REG_d "\n" \
  "punpcklqdq %%xmm0,%%xmm0\n" \
  "lea 2(%0),%0\n" \
  "movq 0(%5,%%" REG_a ",8),%%xmm1\n" \
  "movhps 0(%5,%%" REG_d ",8),%%xmm1\n" \
  "paddsw %%xmm0,%%xmm1\n" \
  "psraw $6,%%xmm1\n" \
  "packuswb %%xmm1,%%xmm1\n" \
  "movq %%xmm1,0(%3)\n" \
  "lea 8(%3),%3\n" \
  "sub $0x2,%4\n" \
  "ja 1b\n"
// This version produces 4 pixels.  The result is stored with movdqa, so the
// destination must be 16-byte aligned.
#define YUVTORGB4 \
"1:" \
  "movzb 0(%1),%%" REG_a "\n" \
  "movzb 0(%2),%%" REG_d "\n" \
  "movq 2048(%5,%%" REG_a ",8),%%xmm0\n" \
  "movzb 0(%0),%%" REG_a "\n" \
  "movq 4096(%5,%%" REG_d ",8),%%xmm1\n" \
  "paddsw %%xmm1,%%xmm0\n" \
  "movzb 1(%0),%%" REG_d "\n" \
  "punpcklqdq %%xmm0,%%xmm0\n" \
  "movq 0(%5,%%" REG_a ",8),%%xmm2\n" \
  "movhps 0(%5,%%" REG_d ",8),%%xmm2\n" \
  "paddsw %%xmm0,%%xmm2\n" \
  "psraw $6,%%xmm2\n" \
  "movzb 1(%1),%%" REG_a "\n" \
  "movzb 1(%2),%%" REG_d "\n" \
  "movq 2048(%5,%%" REG_a ",8),%%xmm0\n" \
  "movzb 2(%0),%%" REG_a "\n" \
  "movq 4096(%5,%%" REG_d ",8),%%xmm1\n" \
  "paddsw %%xmm1,%%xmm0\n" \
  "movzb 3(%0),%%" REG_d "\n" \
  "punpcklqdq %%xmm0,%%xmm0\n" \
  "movq 0(%5,%%" REG_a ",8),%%xmm3\n" \
  "movhps 0(%5,%%" REG_d ",8),%%xmm3\n" \
  "paddsw %%xmm0,%%xmm3\n" \
  "psraw $6,%%xmm3\n" \
  "lea 2(%1),%1\n" \
  "lea 2(%2),%2\n" \
  "lea 4(%0),%0\n" \
  "packuswb %%xmm3,%%xmm2\n" \
  "movdqa %%xmm2,0(%3)\n" \
  "lea 16(%3),%3\n" \
  "sub $0x4,%4\n" \
  "ja 1b\n"
#endif

421// 6 or 7 registers
422void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi
423 const uint8* u_buf, // rsi
424 const uint8* v_buf, // rdx
425 uint8* rgb_buf, // rcx
426 int width) { // r8
427 asm volatile(
428 YUVTORGB
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000429 : "+r"(y_buf), // %0
430 "+r"(u_buf), // %1
431 "+r"(v_buf), // %2
432 "+r"(rgb_buf), // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000433 "+rm"(width) // %4
434 : "r" (kCoefficientsRgbY) // %5
435 : "memory", "cc", CLOBBER
436#if defined(__SSE2__)
437 , "xmm0", "xmm1", "xmm2", "xmm3"
438#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +0000439);
440}
441
// 6 or 7 registers
// Same conversion as FastConvertYUVToARGBRow_SSE2, but uses the YUVTORGB4
// loop: four pixels (16 bytes) are written per pass with an aligned store, so
// rgb_buf must be 16-byte aligned.
// YUVTORGB4 is only defined for the two-scratch-register configuration, so
// the function is compiled only when that macro exists; without the guard the
// REG6 configuration does not compile.
#ifdef YUVTORGB4
void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,  // rdi
                                          const uint8* u_buf,  // rsi
                                          const uint8* v_buf,  // rdx
                                          uint8* rgb_buf,      // rcx
                                          int width) {         // r8
  asm volatile(
    YUVTORGB4
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r" (kCoefficientsRgbY)  // %5
  : "memory", "cc", CLOBBER
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
#endif  // YUVTORGB4

fbarchard@google.comb6149762011-11-07 21:58:52 +0000463void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, // rdi
464 const uint8* u_buf, // rsi
465 const uint8* v_buf, // rdx
466 uint8* rgb_buf, // rcx
467 int width) { // r8
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000468 asm volatile(
fbarchard@google.comb6149762011-11-07 21:58:52 +0000469 YUVTORGB
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000470 : "+r"(y_buf), // %0
471 "+r"(u_buf), // %1
472 "+r"(v_buf), // %2
473 "+r"(rgb_buf), // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000474 "+rm"(width) // %4
475 : "r" (kCoefficientsBgraY) // %5
476 : "memory", "cc", CLOBBER
477#if defined(__SSE2__)
478 , "xmm0", "xmm1", "xmm2", "xmm3"
479#endif
mikhal@webrtc.org43575c82011-10-12 18:49:21 +0000480);
481}
482
fbarchard@google.comb6149762011-11-07 21:58:52 +0000483void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, // rdi
484 const uint8* u_buf, // rsi
485 const uint8* v_buf, // rdx
486 uint8* rgb_buf, // rcx
487 int width) { // r8
488 asm volatile(
489 YUVTORGB
490 : "+r"(y_buf), // %0
491 "+r"(u_buf), // %1
492 "+r"(v_buf), // %2
493 "+r"(rgb_buf), // %3
494 "+rm"(width) // %4
495 : "r" (kCoefficientsAbgrY) // %5
496 : "memory", "cc", CLOBBER
497#if defined(__SSE2__)
498 , "xmm0", "xmm1", "xmm2", "xmm3"
499#endif
500);
501}
502
503// 6 registers
504void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi
505 const uint8* u_buf, // rsi
506 const uint8* v_buf, // rdx
507 uint8* rgb_buf, // rcx
508 int width) { // r8
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000509 asm volatile(
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000510"1:"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000511 "movzb (%1),%%"REG_a"\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000512 "lea 1(%1),%1\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000513 "movq 2048(%5,%%"REG_a",8),%%xmm0\n"
514 "movzb (%2),%%"REG_a"\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000515 "lea 1(%2),%2\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000516 "movq 4096(%5,%%"REG_a",8),%%xmm1\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000517 "paddsw %%xmm1,%%xmm0\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000518 "movzb (%0),%%"REG_a"\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000519 "lea 1(%0),%0\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000520 "movq 0(%5,%%"REG_a",8),%%xmm2\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000521 "paddsw %%xmm0,%%xmm2\n"
522 "shufps $0x44,%%xmm2,%%xmm2\n"
523 "psraw $0x6,%%xmm2\n"
524 "packuswb %%xmm2,%%xmm2\n"
525 "movd %%xmm2,0x0(%3)\n"
526 "lea 4(%3),%3\n"
527 "sub $0x1,%4\n"
528 "ja 1b\n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000529 : "+r"(y_buf), // %0
530 "+r"(u_buf), // %1
531 "+r"(v_buf), // %2
532 "+r"(rgb_buf), // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000533 "+rm"(width) // %4
534 : "r" (kCoefficientsRgbY) // %5
535 : "memory", "cc", "%"REG_a
536#if defined(__SSE2__)
537 , "xmm0", "xmm1", "xmm2"
538#endif
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000539);
540}
541
fbarchard@google.comb6149762011-11-07 21:58:52 +0000542// 5 registers
543void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
544 uint8* rgb_buf, // rcx
545 int width) { // r8
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000546 asm volatile(
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000547"1:"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000548 "movzb (%0),%%"REG_a"\n"
549 "movzb 0x1(%0),%%"REG_d"\n"
550 "movq (%3,%%"REG_a",8),%%xmm2\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000551 "lea 2(%0),%0\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000552 "movhps (%3,%%"REG_d",8),%%xmm2\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000553 "psraw $0x6,%%xmm2\n"
554 "packuswb %%xmm2,%%xmm2\n"
555 "movq %%xmm2,0x0(%1)\n"
556 "lea 8(%1),%1\n"
557 "sub $0x2,%2\n"
558 "ja 1b\n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +0000559 : "+r"(y_buf), // %0
560 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +0000561 "+rm"(width) // %2
562 : "r" (kCoefficientsRgbY) // %3
563 : "memory", "cc", "%"REG_a, "%"REG_d
564#if defined(__SSE2__)
565 , "xmm0", "xmm1", "xmm2"
566#endif
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000567);
568}
569
#endif

#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
// 32 bit mmx gcc version

// Prefix for C data symbols referenced from the file-scope assembly below.
// The function labels in that assembly are underscore-prefixed when OSX or
// IOS is defined, so the data symbols use the same condition.
#if defined(OSX) || defined(IOS)
#define UNDERSCORE "_"
#else
#define UNDERSCORE ""
#endif

fbarchard@google.comb6149762011-11-07 21:58:52 +0000581void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000582 const uint8* u_buf,
583 const uint8* v_buf,
584 uint8* rgb_buf,
585 int width);
586 asm(
587 ".text\n"
588#if defined(OSX) || defined(IOS)
fbarchard@google.comb6149762011-11-07 21:58:52 +0000589 ".globl _FastConvertYUVToARGBRow_MMX\n"
590"_FastConvertYUVToARGBRow_MMX:\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000591#else
fbarchard@google.comb6149762011-11-07 21:58:52 +0000592 ".global FastConvertYUVToARGBRow_MMX\n"
593"FastConvertYUVToARGBRow_MMX:\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000594#endif
595 "pusha\n"
596 "mov 0x24(%esp),%edx\n"
597 "mov 0x28(%esp),%edi\n"
598 "mov 0x2c(%esp),%esi\n"
599 "mov 0x30(%esp),%ebp\n"
600 "mov 0x34(%esp),%ecx\n"
601
602"1:"
603 "movzbl (%edi),%eax\n"
604 "lea 1(%edi),%edi\n"
605 "movzbl (%esi),%ebx\n"
606 "lea 1(%esi),%esi\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000607 "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000608 "movzbl (%edx),%eax\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000609 "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
610 "movzbl 0x1(%edx),%ebx\n"
611 "movq " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
612 "lea 2(%edx),%edx\n"
613 "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
614 "paddsw %mm0,%mm1\n"
615 "paddsw %mm0,%mm2\n"
616 "psraw $0x6,%mm1\n"
617 "psraw $0x6,%mm2\n"
618 "packuswb %mm2,%mm1\n"
619 "movq %mm1,0x0(%ebp)\n"
620 "lea 8(%ebp),%ebp\n"
621 "sub $0x2,%ecx\n"
622 "ja 1b\n"
623 "popa\n"
624 "ret\n"
625);
626
627void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
628 const uint8* u_buf,
629 const uint8* v_buf,
630 uint8* rgb_buf,
631 int width);
632 asm(
633 ".text\n"
634#if defined(OSX) || defined(IOS)
635 ".globl _FastConvertYUVToBGRARow_MMX\n"
636"_FastConvertYUVToBGRARow_MMX:\n"
637#else
638 ".global FastConvertYUVToBGRARow_MMX\n"
639"FastConvertYUVToBGRARow_MMX:\n"
640#endif
641 "pusha\n"
642 "mov 0x24(%esp),%edx\n"
643 "mov 0x28(%esp),%edi\n"
644 "mov 0x2c(%esp),%esi\n"
645 "mov 0x30(%esp),%ebp\n"
646 "mov 0x34(%esp),%ecx\n"
647
648"1:"
649 "movzbl (%edi),%eax\n"
650 "lea 1(%edi),%edi\n"
651 "movzbl (%esi),%ebx\n"
652 "lea 1(%esi),%esi\n"
653 "movq " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
654 "movzbl (%edx),%eax\n"
655 "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
656 "movzbl 0x1(%edx),%ebx\n"
657 "movq " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
658 "lea 2(%edx),%edx\n"
659 "movq " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
660 "paddsw %mm0,%mm1\n"
661 "paddsw %mm0,%mm2\n"
662 "psraw $0x6,%mm1\n"
663 "psraw $0x6,%mm2\n"
664 "packuswb %mm2,%mm1\n"
665 "movq %mm1,0x0(%ebp)\n"
666 "lea 8(%ebp),%ebp\n"
667 "sub $0x2,%ecx\n"
668 "ja 1b\n"
669 "popa\n"
670 "ret\n"
671);
672
673void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
674 const uint8* u_buf,
675 const uint8* v_buf,
676 uint8* rgb_buf,
677 int width);
678 asm(
679 ".text\n"
680#if defined(OSX) || defined(IOS)
681 ".globl _FastConvertYUVToABGRRow_MMX\n"
682"_FastConvertYUVToABGRRow_MMX:\n"
683#else
684 ".global FastConvertYUVToABGRRow_MMX\n"
685"FastConvertYUVToABGRRow_MMX:\n"
686#endif
687 "pusha\n"
688 "mov 0x24(%esp),%edx\n"
689 "mov 0x28(%esp),%edi\n"
690 "mov 0x2c(%esp),%esi\n"
691 "mov 0x30(%esp),%ebp\n"
692 "mov 0x34(%esp),%ecx\n"
693
694"1:"
695 "movzbl (%edi),%eax\n"
696 "lea 1(%edi),%edi\n"
697 "movzbl (%esi),%ebx\n"
698 "lea 1(%esi),%esi\n"
699 "movq " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
700 "movzbl (%edx),%eax\n"
701 "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
702 "movzbl 0x1(%edx),%ebx\n"
703 "movq " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
704 "lea 2(%edx),%edx\n"
705 "movq " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
706 "paddsw %mm0,%mm1\n"
707 "paddsw %mm0,%mm2\n"
708 "psraw $0x6,%mm1\n"
709 "psraw $0x6,%mm2\n"
710 "packuswb %mm2,%mm1\n"
711 "movq %mm1,0x0(%ebp)\n"
712 "lea 8(%ebp),%ebp\n"
713 "sub $0x2,%ecx\n"
714 "ja 1b\n"
715 "popa\n"
716 "ret\n"
717);
718
719void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
720 const uint8* u_buf,
721 const uint8* v_buf,
722 uint8* rgb_buf,
723 int width);
724 asm(
725 ".text\n"
726#if defined(OSX) || defined(IOS)
727 ".globl _FastConvertYUV444ToARGBRow_MMX\n"
728"_FastConvertYUV444ToARGBRow_MMX:\n"
729#else
730 ".global FastConvertYUV444ToARGBRow_MMX\n"
731"FastConvertYUV444ToARGBRow_MMX:\n"
732#endif
733 "pusha\n"
734 "mov 0x24(%esp),%edx\n"
735 "mov 0x28(%esp),%edi\n"
736 "mov 0x2c(%esp),%esi\n"
737 "mov 0x30(%esp),%ebp\n"
738 "mov 0x34(%esp),%ecx\n"
739
740"1:"
741 "movzbl (%edi),%eax\n"
742 "lea 1(%edi),%edi\n"
743 "movzbl (%esi),%ebx\n"
744 "lea 1(%esi),%esi\n"
745 "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
746 "movzbl (%edx),%eax\n"
747 "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000748 "lea 1(%edx),%edx\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000749 "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000750 "psraw $0x6,%mm0\n"
751 "packuswb %mm0,%mm0\n"
752 "movd %mm0,0x0(%ebp)\n"
753 "lea 4(%ebp),%ebp\n"
754 "sub $0x1,%ecx\n"
755 "ja 1b\n"
756 "popa\n"
757 "ret\n"
758);
759
fbarchard@google.comb6149762011-11-07 21:58:52 +0000760void FastConvertYToARGBRow_MMX(const uint8* y_buf,
761 uint8* rgb_buf,
762 int width);
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000763 asm(
764 ".text\n"
765#if defined(OSX) || defined(IOS)
fbarchard@google.comb6149762011-11-07 21:58:52 +0000766 ".globl _FastConvertYToARGBRow_MMX\n"
767"_FastConvertYToARGBRow_MMX:\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000768#else
fbarchard@google.comb6149762011-11-07 21:58:52 +0000769 ".global FastConvertYToARGBRow_MMX\n"
770"FastConvertYToARGBRow_MMX:\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000771#endif
772 "push %ebx\n"
773 "mov 0x8(%esp),%eax\n"
774 "mov 0xc(%esp),%edx\n"
775 "mov 0x10(%esp),%ecx\n"
776
777"1:"
778 "movzbl (%eax),%ebx\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000779 "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000780 "psraw $0x6,%mm0\n"
781 "movzbl 0x1(%eax),%ebx\n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000782 "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +0000783 "psraw $0x6,%mm1\n"
784 "packuswb %mm1,%mm0\n"
785 "lea 0x2(%eax),%eax\n"
786 "movq %mm0,(%edx)\n"
787 "lea 0x8(%edx),%edx\n"
788 "sub $0x2,%ecx\n"
789 "ja 1b\n"
790 "pop %ebx\n"
791 "ret\n"
792);
793
#endif

// These wrappers call ABGRToARGBRow_SSSE3 / BGRAToARGBRow_SSSE3 and
// ARGBToYRow_SSSE3, which are only compiled when HAS_ARGBTOYROW_SSSE3 is
// defined, so they are placed under the same guard.
#ifdef HAS_ARGBTOYROW_SSSE3
// Converts a row of ABGR pixels to luma: the pixels are first reordered to
// ARGB in an aligned stack buffer, then passed to ARGBToYRow_SSSE3.
// NOTE(review): the buffer holds kMaxStride bytes and the reorder step writes
// 4 bytes per pixel; presumably callers guarantee pix * 4 <= kMaxStride -
// confirm.
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  ABGRToARGBRow_SSSE3(src_argb, row, pix);
  ARGBToYRow_SSSE3(row, dst_y, pix);
}

// Converts a row of BGRA pixels to luma: the pixels are first reordered to
// ARGB in an aligned stack buffer, then passed to ARGBToYRow_SSSE3.
// NOTE(review): the buffer holds kMaxStride bytes and the reorder step writes
// 4 bytes per pixel; presumably callers guarantee pix * 4 <= kMaxStride -
// confirm.
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  BGRAToARGBRow_SSSE3(src_argb, row, pix);
  ARGBToYRow_SSSE3(row, dst_y, pix);
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOUVROW_SSSE3
// Converts two rows of ABGR pixels to one row of U and V bytes.  Both source
// rows are reordered to ARGB into an aligned stack buffer, kMaxStride bytes
// apart, and the buffer is passed to ARGBToUVRow_SSSE3 with kMaxStride as its
// stride.
//   src_argb         first source row (ABGR byte order despite the name).
//   src_stride_argb  byte offset from the first source row to the second.
// NOTE(review): each half of the buffer holds kMaxStride bytes and the
// reorder step writes 4 bytes per pixel; presumably callers guarantee
// pix * 4 <= kMaxStride - confirm.
void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
  ABGRToARGBRow_SSSE3(src_argb, row, pix);
  ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}

// Converts two rows of BGRA pixels to one row of U and V bytes.  Both source
// rows are reordered to ARGB into an aligned stack buffer, kMaxStride bytes
// apart, and the buffer is passed to ARGBToUVRow_SSSE3 with kMaxStride as its
// stride.
//   src_argb         first source row (BGRA byte order despite the name).
//   src_stride_argb  byte offset from the first source row to the second.
// NOTE(review): each half of the buffer holds kMaxStride bytes and the
// reorder step writes 4 bytes per pixel; presumably callers guarantee
// pix * 4 <= kMaxStride - confirm.
void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
  BGRAToARGBRow_SSSE3(src_argb, row, pix);
  BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
#endif

}  // extern "C"