/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "source/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64
#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC 4.2 on OSX has a link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Constants for BGRA
CONST vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

CONST vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

CONST vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
CONST vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

CONST vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

CONST vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

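// Illustration (not part of the library): with the vectors above, each group
// of four coefficients implements an approximate BT.601 studio-range
// conversion. For an ARGB pixel stored in memory as B,G,R,A the row functions
// below compute, per pixel (the UV rows also average 2x2 blocks first):
//   Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16;
//   U = ((112 * B - 74 * G - 38 * R) >> 8) + 128;
//   V = ((-18 * B - 94 * G + 112 * R) >> 8) + 128;
// The BGRA and ABGR tables are the same weights permuted to match each
// format's byte order.
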
// Shuffle table for converting RGB24 to ARGB.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ARGB to RGB24.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

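// Illustration (sketch, not used by the code): a 16-byte pshufb with one of
// the masks above behaves like this scalar loop, where an index of 128u
// (high bit set) produces a zero byte:
//   for (int i = 0; i < 16; ++i) {
//     dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
//   }
// For example, kShuffleMaskABGRToARGB swaps bytes 0 and 2 of every 4-byte
// pixel, and kShuffleMaskARGBToRGB24 packs four 4-byte pixels into twelve
// bytes, leaving the last four output bytes zero.
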
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    ".p2align 4 \n"
  "1: \n"
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "lea 0x20(%1),%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"

  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20082008,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%1,%0,2) \n"
    "movdqa %%xmm1,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if the stack frame is disabled. Using 2 assembly blocks is a
// workaround and considered unsafe.
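// Note: the two-block pattern below loads the U/V coefficients and the bias
// into xmm3, xmm4 and xmm5 in a first asm statement and assumes those
// registers are still intact when the second (loop) statement runs; the
// compiler is not told about that dependency, which is why it is considered
// unsafe.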
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),   // %0
    "m"(kBGRAToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),   // %0
    "m"(kBGRAToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),   // %0
    "m"(kABGRToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),   // %0
    "m"(kABGRToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I420TOARGBROW_SSSE3
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

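// Illustration (approximate scalar equivalent of the YUVTORGB macro below,
// using the constants above in 6-bit fixed point):
//   int y1 = (y - 16) * YG;                // YG = 74 ~ 1.164 * 64
//   b = (y1 + UB * u + VB * v - BB) >> 6;  // clamped to 0..255 by packuswb
//   g = (y1 + UG * u + VG * v - BG) >> 6;
//   r = (y1 + UR * u + VR * v - BR) >> 6;
// where u and v are raw 0..255 chroma samples and BB/BG/BR subtract the
// 128 bias folded into the UV terms.
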
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
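// Assumed rationale: OMITFP asks GCC to omit the frame pointer for these
// functions on 32-bit non-Apple builds, freeing one more general-purpose
// register for the inline assembly; on x86_64 and Apple targets the
// attribute is unnecessary or unsupported, so it expands to nothing.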

struct {
  vec8 kUVToB;
  vec8 kUVToG;
  vec8 kUVToR;
  vec16 kUVBiasB;
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;
  vec16 kYToRgb;
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};

// Convert 8 pixels: reads 4 U and 4 V bytes (each reused for two pixels) and
// 8 Y bytes, leaving 8 B, G and R bytes packed in xmm0, xmm1 and xmm2.
#define YUVTORGB \
    "movd (%1),%%xmm0 \n" \
    "movd (%1,%2,1),%%xmm1 \n" \
    "lea 0x4(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5,(%3) \n"
    "movdqa %%xmm0,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%3) \n"
    "movdqu %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqu %%xmm5,(%3) \n"
    "movdqu %%xmm0,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqu %%xmm2,(%3) \n"
    "movdqu %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

fbarchard@google.comf1b60632012-02-17 19:27:20 +00001502void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1503 const uint8* u_buf,
1504 const uint8* v_buf,
1505 uint8* rgb_buf,
1506 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001507 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001508 "sub %1,%2 \n"
1509 "pcmpeqb %%xmm5,%%xmm5 \n"
1510 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001511 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001512 "1: \n"
1513 "movd (%1),%%xmm0 \n"
1514 "movd (%1,%2,1),%%xmm1 \n"
1515 "lea 0x4(%1),%1 \n"
1516 "punpcklbw %%xmm1,%%xmm0 \n"
1517 "movdqa %%xmm0,%%xmm1 \n"
1518 "movdqa %%xmm0,%%xmm2 \n"
1519 "pmaddubsw (%5),%%xmm0 \n"
1520 "pmaddubsw 16(%5),%%xmm1 \n"
1521 "pmaddubsw 32(%5),%%xmm2 \n"
1522 "psubw 48(%5),%%xmm0 \n"
1523 "psubw 64(%5),%%xmm1 \n"
1524 "psubw 80(%5),%%xmm2 \n"
1525 "movd (%0),%%xmm3 \n"
1526 "lea 0x4(%0),%0 \n"
1527 "punpcklbw %%xmm4,%%xmm3 \n"
1528 "psubsw 96(%5),%%xmm3 \n"
1529 "pmullw 112(%5),%%xmm3 \n"
1530 "paddsw %%xmm3,%%xmm0 \n"
1531 "paddsw %%xmm3,%%xmm1 \n"
1532 "paddsw %%xmm3,%%xmm2 \n"
1533 "psraw $0x6,%%xmm0 \n"
1534 "psraw $0x6,%%xmm1 \n"
1535 "psraw $0x6,%%xmm2 \n"
1536 "packuswb %%xmm0,%%xmm0 \n"
1537 "packuswb %%xmm1,%%xmm1 \n"
1538 "packuswb %%xmm2,%%xmm2 \n"
1539 "punpcklbw %%xmm1,%%xmm0 \n"
1540 "punpcklbw %%xmm5,%%xmm2 \n"
1541 "punpcklwd %%xmm2,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001542 "sub $0x4,%4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001543 "movdqa %%xmm0,(%3) \n"
1544 "lea 0x10(%3),%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001545 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001546 : "+r"(y_buf), // %0
1547 "+r"(u_buf), // %1
1548 "+r"(v_buf), // %2
1549 "+r"(rgb_buf), // %3
1550 "+rm"(width) // %4
1551 : "r"(&kYuvConstants.kUVToB) // %5
1552 : "memory", "cc"
1553#if defined(__SSE2__)
1554 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1555#endif
1556 );
1557}
1558#endif
1559
1560#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001561void YToARGBRow_SSE2(const uint8* y_buf,
1562 uint8* rgb_buf,
1563 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001564 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001565 "pcmpeqb %%xmm4,%%xmm4 \n"
1566 "pslld $0x18,%%xmm4 \n"
1567 "mov $0x10001000,%%eax \n"
1568 "movd %%eax,%%xmm3 \n"
1569 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1570 "mov $0x012a012a,%%eax \n"
1571 "movd %%eax,%%xmm2 \n"
1572 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001573 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001574 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001575 // Step 1: Scale 8 Y values to 8 gray values: G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001576 "movq (%0),%%xmm0 \n"
1577 "lea 0x8(%0),%0 \n"
1578 "punpcklbw %%xmm0,%%xmm0 \n"
1579 "psubusw %%xmm3,%%xmm0 \n"
1580 "pmulhuw %%xmm2,%%xmm0 \n"
1581 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001582
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001583 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001584 "punpcklbw %%xmm0,%%xmm0 \n"
1585 "movdqa %%xmm0,%%xmm1 \n"
1586 "punpcklwd %%xmm0,%%xmm0 \n"
1587 "punpckhwd %%xmm1,%%xmm1 \n"
1588 "por %%xmm4,%%xmm0 \n"
1589 "por %%xmm4,%%xmm1 \n"
1590 "movdqa %%xmm0,(%1) \n"
1591 "movdqa %%xmm1,16(%1) \n"
1592 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001593
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001594 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001595 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00001596 : "+r"(y_buf), // %0
1597 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00001598 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001599 :
1600 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001601#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001602 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001603#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001604 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001605}
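
// Scalar sketch of YToARGBRow_SSE2 above for a single pixel: the Y value is
// scaled by ~1.164 after subtracting 16 and replicated into B, G and R with
// alpha forced to 255. Illustrative only; the *_Sketch name is hypothetical.
static inline void YToArgbPixel_Sketch(uint8 y, uint8* dst_argb) {
  int g = ((static_cast<int>(y) - 16) * 298) >> 8;  // 298 / 256 ~= 1.164
  uint8 gray = static_cast<uint8>(g < 0 ? 0 : (g > 255 ? 255 : g));
  dst_argb[0] = gray;  // B
  dst_argb[1] = gray;  // G
  dst_argb[2] = gray;  // R
  dst_argb[3] = 255;   // A
}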
fbarchard@google.comb6149762011-11-07 21:58:52 +00001606#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001607
fbarchard@google.com42831e02012-01-21 02:54:17 +00001608#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001609// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001610CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001611 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1612};
1613
fbarchard@google.com42831e02012-01-21 02:54:17 +00001614void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001615 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001616 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001617 "movdqa %3,%%xmm5 \n"
1618 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001619 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001620 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001621 "movdqa (%0,%2),%%xmm0 \n"
1622 "pshufb %%xmm5,%%xmm0 \n"
1623 "sub $0x10,%2 \n"
1624 "movdqa %%xmm0,(%1) \n"
1625 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001626 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00001627 : "+r"(src), // %0
1628 "+r"(dst), // %1
1629 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001630 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001631 : "memory", "cc"
1632#if defined(__SSE2__)
1633 , "xmm0", "xmm5"
1634#endif
1635 );
1636}
1637#endif
1638
fbarchard@google.com42831e02012-01-21 02:54:17 +00001639#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001640void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001641 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001642 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001643 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001644 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001645 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001646 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001647 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001648 "psllw $0x8,%%xmm0 \n"
1649 "psrlw $0x8,%%xmm1 \n"
1650 "por %%xmm1,%%xmm0 \n"
1651 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
1652 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
1653 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
1654 "sub $0x10,%2 \n"
1655 "movdqu %%xmm0,(%1) \n"
1656 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001657 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001658 : "+r"(src), // %0
1659 "+r"(dst), // %1
1660 "+r"(temp_width) // %2
1661 :
1662 : "memory", "cc"
1663#if defined(__SSE2__)
1664 , "xmm0", "xmm1"
1665#endif
1666 );
1667}
1668#endif
1669
fbarchard@google.com16a96642012-03-02 22:38:09 +00001670#ifdef HAS_MIRRORROW_UV_SSSE3
1671// Shuffle table for reversing the bytes of UV channels.
1672CONST uvec8 kShuffleMirrorUV = {
1673 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
1674};
1675void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
1676 int width) {
1677 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001678 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00001679 "movdqa %4,%%xmm1 \n"
1680 "lea -16(%0,%3,2),%0 \n"
1681 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001682 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001683 "1: \n"
1684 "movdqa (%0),%%xmm0 \n"
1685 "lea -16(%0),%0 \n"
1686 "pshufb %%xmm1,%%xmm0 \n"
1687 "sub $8,%3 \n"
1688 "movlpd %%xmm0,(%1) \n"
1689 "movhpd %%xmm0,(%1,%2) \n"
1690 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001691 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001692 : "+r"(src), // %0
1693 "+r"(dst_u), // %1
1694 "+r"(dst_v), // %2
1695 "+r"(temp_width) // %3
1696 : "m"(kShuffleMirrorUV) // %4
1697 : "memory", "cc"
1698#if defined(__SSE2__)
1699 , "xmm0", "xmm1"
1700#endif
1701 );
1702}
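
// Scalar sketch of MirrorRowUV_SSSE3 above: walk the interleaved UV row
// backwards and split it into reversed U and V planes, where width counts UV
// pairs. Illustrative only; the *_Sketch name is hypothetical.
static inline void MirrorRowUV_C_Sketch(const uint8* src, uint8* dst_u,
                                        uint8* dst_v, int width) {
  src += (width - 1) * 2;  // Start at the last UV pair.
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src[0];
    dst_v[i] = src[1];
    src -= 2;
  }
}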
1703#endif
1704
fbarchard@google.com55663022012-04-26 00:01:41 +00001705#ifdef HAS_ADDROW_SSE2
1706// dst must be aligned to 16 bytes; width must be a multiple of 16.
1707void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
1708 asm volatile (
1709 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001710 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001711 "1: \n"
1712 "movdqu (%0),%%xmm2 \n"
1713 "lea 0x10(%0),%0 \n"
1714 "movdqa (%1),%%xmm0 \n"
1715 "movdqa 0x10(%1),%%xmm1 \n"
1716 "movdqa %%xmm2,%%xmm3 \n"
1717 "punpcklbw %%xmm4,%%xmm2 \n"
1718 "punpckhbw %%xmm4,%%xmm3 \n"
1719 "paddusw %%xmm2,%%xmm0 \n"
1720 "paddusw %%xmm3,%%xmm1 \n"
1721 "sub $0x10,%2 \n"
1722 "movdqa %%xmm0,(%1) \n"
1723 "movdqa %%xmm1,0x10(%1) \n"
1724 "lea 0x20(%1),%1 \n"
1725 "jg 1b \n"
1726 : "+r"(src), // %0
1727 "+r"(dst), // %1
1728 "+r"(width) // %2
1729 :
1730 : "memory", "cc"
1731#if defined(__SSE2__)
1732 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1733#endif
1734 );
1735}
1736
1737// dst must be aligned to 16 bytes; width must be a multiple of 16.
1738void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
1739 asm volatile (
1740 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001741 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001742 "1: \n"
1743 "movdqu (%0),%%xmm2 \n"
1744 "lea 0x10(%0),%0 \n"
1745 "movdqa (%1),%%xmm0 \n"
1746 "movdqa 0x10(%1),%%xmm1 \n"
1747 "movdqa %%xmm2,%%xmm3 \n"
1748 "punpcklbw %%xmm4,%%xmm2 \n"
1749 "punpckhbw %%xmm4,%%xmm3 \n"
1750 "psubusw %%xmm2,%%xmm0 \n"
1751 "psubusw %%xmm3,%%xmm1 \n"
1752 "sub $0x10,%2 \n"
1753 "movdqa %%xmm0,(%1) \n"
1754 "movdqa %%xmm1,0x10(%1) \n"
1755 "lea 0x20(%1),%1 \n"
1756 "jg 1b \n"
1757 : "+r"(src), // %0
1758 "+r"(dst), // %1
1759 "+r"(width) // %2
1760 :
1761 : "memory", "cc"
1762#if defined(__SSE2__)
1763 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1764#endif
1765 );
1766}
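
// Scalar sketch of AddRow_SSE2 (and, with a clamped subtract, SubRow_SSE2):
// each 8-bit sample is accumulated into a 16-bit total with unsigned
// saturation, which is what paddusw/psubusw provide. Illustrative only; the
// *_Sketch name is hypothetical.
static inline void AddRow_C_Sketch(const uint8* src, uint16* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32 sum = static_cast<uint32>(dst[i]) + src[i];
    dst[i] = static_cast<uint16>(sum > 65535u ? 65535u : sum);
  }
}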
1767#endif // HAS_ADDROW_SSE2
1768
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001769#ifdef HAS_SPLITUV_SSE2
1770void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001771 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001772 "pcmpeqb %%xmm5,%%xmm5 \n"
1773 "psrlw $0x8,%%xmm5 \n"
1774 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001775 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001776 "1: \n"
1777 "movdqa (%0),%%xmm0 \n"
1778 "movdqa 0x10(%0),%%xmm1 \n"
1779 "lea 0x20(%0),%0 \n"
1780 "movdqa %%xmm0,%%xmm2 \n"
1781 "movdqa %%xmm1,%%xmm3 \n"
1782 "pand %%xmm5,%%xmm0 \n"
1783 "pand %%xmm5,%%xmm1 \n"
1784 "packuswb %%xmm1,%%xmm0 \n"
1785 "psrlw $0x8,%%xmm2 \n"
1786 "psrlw $0x8,%%xmm3 \n"
1787 "packuswb %%xmm3,%%xmm2 \n"
1788 "movdqa %%xmm0,(%1) \n"
1789 "movdqa %%xmm2,(%1,%2) \n"
1790 "lea 0x10(%1),%1 \n"
1791 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001792 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001793 : "+r"(src_uv), // %0
1794 "+r"(dst_u), // %1
1795 "+r"(dst_v), // %2
1796 "+r"(pix) // %3
1797 :
1798 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001799#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001800 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001801#endif
1802 );
1803}
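
// Scalar sketch of SplitUV_SSE2 above: de-interleave a packed UV row (U at
// even bytes, V at odd bytes) into separate U and V planes, where pix counts
// UV pairs. Illustrative only; the *_Sketch name is hypothetical.
static inline void SplitUV_C_Sketch(const uint8* src_uv, uint8* dst_u,
                                    uint8* dst_v, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[0];
    dst_v[i] = src_uv[1];
    src_uv += 2;
  }
}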
1804#endif
1805
fbarchard@google.com19932f82012-02-16 22:19:14 +00001806#ifdef HAS_COPYROW_SSE2
1807void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001808 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001809 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00001810 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001811 "1: \n"
1812 "movdqa (%0),%%xmm0 \n"
1813 "movdqa 0x10(%0),%%xmm1 \n"
1814 "movdqa %%xmm0,(%0,%1) \n"
1815 "movdqa %%xmm1,0x10(%0,%1) \n"
1816 "lea 0x20(%0),%0 \n"
1817 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001818 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001819 : "+r"(src), // %0
1820 "+r"(dst), // %1
1821 "+r"(count) // %2
1822 :
1823 : "memory", "cc"
1824#if defined(__SSE2__)
1825 , "xmm0", "xmm1"
1826#endif
1827 );
1828}
1829#endif // HAS_COPYROW_SSE2
1830
1831#ifdef HAS_COPYROW_X86
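// Copies width bytes 4 at a time with rep movsl; the shr below drops any
// remainder, so width is expected to be a multiple of 4.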
1832void CopyRow_X86(const uint8* src, uint8* dst, int width) {
1833 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001834 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00001835 "shr $0x2,%2 \n"
1836 "rep movsl \n"
1837 : "+S"(src), // %0
1838 "+D"(dst), // %1
1839 "+c"(width_tmp) // %2
1840 :
1841 : "memory", "cc"
1842 );
1843}
1844#endif
1845
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001846#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001847void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001848 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001849 "pcmpeqb %%xmm5,%%xmm5 \n"
1850 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001851 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001852 "1: \n"
1853 "movdqa (%0),%%xmm0 \n"
1854 "movdqa 0x10(%0),%%xmm1 \n"
1855 "lea 0x20(%0),%0 \n"
1856 "pand %%xmm5,%%xmm0 \n"
1857 "pand %%xmm5,%%xmm1 \n"
1858 "packuswb %%xmm1,%%xmm0 \n"
1859 "movdqa %%xmm0,(%1) \n"
1860 "lea 0x10(%1),%1 \n"
1861 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001862 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001863 : "+r"(src_yuy2), // %0
1864 "+r"(dst_y), // %1
1865 "+r"(pix) // %2
1866 :
1867 : "memory", "cc"
1868#if defined(__SSE2__)
1869 , "xmm0", "xmm1", "xmm5"
1870#endif
1871 );
1872}
1873
1874void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
1875 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001876 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001877 "pcmpeqb %%xmm5,%%xmm5 \n"
1878 "psrlw $0x8,%%xmm5 \n"
1879 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001880 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001881 "1: \n"
1882 "movdqa (%0),%%xmm0 \n"
1883 "movdqa 0x10(%0),%%xmm1 \n"
1884 "movdqa (%0,%4,1),%%xmm2 \n"
1885 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1886 "lea 0x20(%0),%0 \n"
1887 "pavgb %%xmm2,%%xmm0 \n"
1888 "pavgb %%xmm3,%%xmm1 \n"
1889 "psrlw $0x8,%%xmm0 \n"
1890 "psrlw $0x8,%%xmm1 \n"
1891 "packuswb %%xmm1,%%xmm0 \n"
1892 "movdqa %%xmm0,%%xmm1 \n"
1893 "pand %%xmm5,%%xmm0 \n"
1894 "packuswb %%xmm0,%%xmm0 \n"
1895 "psrlw $0x8,%%xmm1 \n"
1896 "packuswb %%xmm1,%%xmm1 \n"
1897 "movq %%xmm0,(%1) \n"
1898 "movq %%xmm1,(%1,%2) \n"
1899 "lea 0x8(%1),%1 \n"
1900 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001901 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001902 : "+r"(src_yuy2), // %0
1903 "+r"(dst_u), // %1
1904 "+r"(dst_v), // %2
1905 "+r"(pix) // %3
1906 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1907 : "memory", "cc"
1908#if defined(__SSE2__)
1909 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1910#endif
1911 );
1912}
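
// Scalar sketch of the YUY2 row functions above (illustrative only; the
// *_Sketch name is hypothetical). YUY2 is packed as Y0 U0 Y1 V0 Y2 U1 Y3 V1;
// UYVY (further below) swaps the positions, i.e. U0 Y0 V0 Y1. The UV row
// function averages the chroma of two adjacent rows, as pavgb does, with
// pix counting pixels.
static inline void YUY2ToUVRow_C_Sketch(const uint8* src_yuy2, int stride_yuy2,
                                        uint8* dst_u, uint8* dst_v, int pix) {
  for (int i = 0; i < pix; i += 2) {
    dst_u[i / 2] =
        static_cast<uint8>((src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1);
    dst_v[i / 2] =
        static_cast<uint8>((src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1);
    src_yuy2 += 4;
  }
}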
1913
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001915void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
1916 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001917 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001918 "pcmpeqb %%xmm5,%%xmm5 \n"
1919 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001920 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001921 "1: \n"
1922 "movdqu (%0),%%xmm0 \n"
1923 "movdqu 0x10(%0),%%xmm1 \n"
1924 "lea 0x20(%0),%0 \n"
1925 "pand %%xmm5,%%xmm0 \n"
1926 "pand %%xmm5,%%xmm1 \n"
1927 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001928 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001929 "movdqu %%xmm0,(%1) \n"
1930 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001931 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001932 : "+r"(src_yuy2), // %0
1933 "+r"(dst_y), // %1
1934 "+r"(pix) // %2
1935 :
1936 : "memory", "cc"
1937#if defined(__SSE2__)
1938 , "xmm0", "xmm1", "xmm5"
1939#endif
1940 );
1941}
1942
1943void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
1944 int stride_yuy2,
1945 uint8* dst_u, uint8* dst_v,
1946 int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001947 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001948 "pcmpeqb %%xmm5,%%xmm5 \n"
1949 "psrlw $0x8,%%xmm5 \n"
1950 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001951 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001952 "1: \n"
1953 "movdqu (%0),%%xmm0 \n"
1954 "movdqu 0x10(%0),%%xmm1 \n"
1955 "movdqu (%0,%4,1),%%xmm2 \n"
1956 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1957 "lea 0x20(%0),%0 \n"
1958 "pavgb %%xmm2,%%xmm0 \n"
1959 "pavgb %%xmm3,%%xmm1 \n"
1960 "psrlw $0x8,%%xmm0 \n"
1961 "psrlw $0x8,%%xmm1 \n"
1962 "packuswb %%xmm1,%%xmm0 \n"
1963 "movdqa %%xmm0,%%xmm1 \n"
1964 "pand %%xmm5,%%xmm0 \n"
1965 "packuswb %%xmm0,%%xmm0 \n"
1966 "psrlw $0x8,%%xmm1 \n"
1967 "packuswb %%xmm1,%%xmm1 \n"
1968 "movq %%xmm0,(%1) \n"
1969 "movq %%xmm1,(%1,%2) \n"
1970 "lea 0x8(%1),%1 \n"
1971 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001972 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001973 : "+r"(src_yuy2), // %0
1974 "+r"(dst_u), // %1
1975 "+r"(dst_v), // %2
1976 "+r"(pix) // %3
1977 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1978 : "memory", "cc"
1979#if defined(__SSE2__)
1980 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1981#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001982 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001983}
1984
1985void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001986 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001987 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001988 "1: \n"
1989 "movdqa (%0),%%xmm0 \n"
1990 "movdqa 0x10(%0),%%xmm1 \n"
1991 "lea 0x20(%0),%0 \n"
1992 "psrlw $0x8,%%xmm0 \n"
1993 "psrlw $0x8,%%xmm1 \n"
1994 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001995 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001996 "movdqa %%xmm0,(%1) \n"
1997 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001998 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001999 : "+r"(src_uyvy), // %0
2000 "+r"(dst_y), // %1
2001 "+r"(pix) // %2
2002 :
2003 : "memory", "cc"
2004#if defined(__SSE2__)
2005 , "xmm0", "xmm1"
2006#endif
2007 );
2008}
2009
2010void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2011 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002012 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002013 "pcmpeqb %%xmm5,%%xmm5 \n"
2014 "psrlw $0x8,%%xmm5 \n"
2015 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002016 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002017 "1: \n"
2018 "movdqa (%0),%%xmm0 \n"
2019 "movdqa 0x10(%0),%%xmm1 \n"
2020 "movdqa (%0,%4,1),%%xmm2 \n"
2021 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2022 "lea 0x20(%0),%0 \n"
2023 "pavgb %%xmm2,%%xmm0 \n"
2024 "pavgb %%xmm3,%%xmm1 \n"
2025 "pand %%xmm5,%%xmm0 \n"
2026 "pand %%xmm5,%%xmm1 \n"
2027 "packuswb %%xmm1,%%xmm0 \n"
2028 "movdqa %%xmm0,%%xmm1 \n"
2029 "pand %%xmm5,%%xmm0 \n"
2030 "packuswb %%xmm0,%%xmm0 \n"
2031 "psrlw $0x8,%%xmm1 \n"
2032 "packuswb %%xmm1,%%xmm1 \n"
2033 "movq %%xmm0,(%1) \n"
2034 "movq %%xmm1,(%1,%2) \n"
2035 "lea 0x8(%1),%1 \n"
2036 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002037 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002038 : "+r"(src_uyvy), // %0
2039 "+r"(dst_u), // %1
2040 "+r"(dst_v), // %2
2041 "+r"(pix) // %3
2042 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2043 : "memory", "cc"
2044#if defined(__SSE2__)
2045 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2046#endif
2047 );
2048}
2049
2050void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2051 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002052 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002053 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002054 "1: \n"
2055 "movdqu (%0),%%xmm0 \n"
2056 "movdqu 0x10(%0),%%xmm1 \n"
2057 "lea 0x20(%0),%0 \n"
2058 "psrlw $0x8,%%xmm0 \n"
2059 "psrlw $0x8,%%xmm1 \n"
2060 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002061 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002062 "movdqu %%xmm0,(%1) \n"
2063 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002064 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002065 : "+r"(src_uyvy), // %0
2066 "+r"(dst_y), // %1
2067 "+r"(pix) // %2
2068 :
2069 : "memory", "cc"
2070#if defined(__SSE2__)
2071 , "xmm0", "xmm1"
2072#endif
2073 );
2074}
2075
2076void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2077 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002078 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002079 "pcmpeqb %%xmm5,%%xmm5 \n"
2080 "psrlw $0x8,%%xmm5 \n"
2081 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002082 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002083 "1: \n"
2084 "movdqu (%0),%%xmm0 \n"
2085 "movdqu 0x10(%0),%%xmm1 \n"
2086 "movdqu (%0,%4,1),%%xmm2 \n"
2087 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2088 "lea 0x20(%0),%0 \n"
2089 "pavgb %%xmm2,%%xmm0 \n"
2090 "pavgb %%xmm3,%%xmm1 \n"
2091 "pand %%xmm5,%%xmm0 \n"
2092 "pand %%xmm5,%%xmm1 \n"
2093 "packuswb %%xmm1,%%xmm0 \n"
2094 "movdqa %%xmm0,%%xmm1 \n"
2095 "pand %%xmm5,%%xmm0 \n"
2096 "packuswb %%xmm0,%%xmm0 \n"
2097 "psrlw $0x8,%%xmm1 \n"
2098 "packuswb %%xmm1,%%xmm1 \n"
2099 "movq %%xmm0,(%1) \n"
2100 "movq %%xmm1,(%1,%2) \n"
2101 "lea 0x8(%1),%1 \n"
2102 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002103 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002104 : "+r"(src_uyvy), // %0
2105 "+r"(dst_u), // %1
2106 "+r"(dst_v), // %2
2107 "+r"(pix) // %3
2108 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2109 : "memory", "cc"
2110#if defined(__SSE2__)
2111 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2112#endif
2113 );
2114}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002115#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002116
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002117#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002118// Blend 8 pixels at a time.
2119// src_argb0 and src_argb1 are read with movdqu and may be unaligned.
2120// dst_argb is written with movdqa and must be aligned to 16 bytes.
2121// width must be a multiple of 4 pixels.
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002122void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.comc757f302012-04-03 00:49:16 +00002123 uint8* dst_argb, int width) {
2124 asm volatile (
2125 "pcmpeqb %%xmm7,%%xmm7 \n"
2126 "psrlw $0xf,%%xmm7 \n"
2127 "pcmpeqb %%xmm6,%%xmm6 \n"
2128 "psrlw $0x8,%%xmm6 \n"
2129 "pcmpeqb %%xmm5,%%xmm5 \n"
2130 "psllw $0x8,%%xmm5 \n"
2131 "pcmpeqb %%xmm4,%%xmm4 \n"
2132 "pslld $0x18,%%xmm4 \n"
2133
2134 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002135 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002136 "1: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002137 "movdqu (%0),%%xmm3 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002138 "movdqa %%xmm3,%%xmm0 \n"
2139 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002140 "movdqu (%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002141 "psrlw $0x8,%%xmm3 \n"
2142 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2143 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2144 "pand %%xmm6,%%xmm2 \n"
2145 "paddw %%xmm7,%%xmm3 \n"
2146 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002147 "movdqu (%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002148 "psrlw $0x8,%%xmm1 \n"
2149 "por %%xmm4,%%xmm0 \n"
2150 "pmullw %%xmm3,%%xmm1 \n"
2151 "movdqu 0x10(%0),%%xmm3 \n"
2152 "lea 0x20(%0),%0 \n"
2153 "psrlw $0x8,%%xmm2 \n"
2154 "paddusb %%xmm2,%%xmm0 \n"
2155 "pand %%xmm5,%%xmm1 \n"
2156 "paddusb %%xmm1,%%xmm0 \n"
2157 "sub $0x4,%3 \n"
2158 "movdqa %%xmm0,(%2) \n"
2159 "jle 9f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002160 "movdqa %%xmm3,%%xmm0 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002161 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002162 "movdqu 0x10(%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002163 "psrlw $0x8,%%xmm3 \n"
2164 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2165 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2166 "pand %%xmm6,%%xmm2 \n"
2167 "paddw %%xmm7,%%xmm3 \n"
2168 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002169 "movdqu 0x10(%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002170 "lea 0x20(%1),%1 \n"
2171 "psrlw $0x8,%%xmm1 \n"
2172 "por %%xmm4,%%xmm0 \n"
2173 "pmullw %%xmm3,%%xmm1 \n"
2174 "psrlw $0x8,%%xmm2 \n"
2175 "paddusb %%xmm2,%%xmm0 \n"
2176 "pand %%xmm5,%%xmm1 \n"
2177 "paddusb %%xmm1,%%xmm0 \n"
2178 "sub $0x4,%3 \n"
2179 "movdqa %%xmm0,0x10(%2) \n"
2180 "lea 0x20(%2),%2 \n"
2181 "jg 1b \n"
2182 "9: \n"
2183 : "+r"(src_argb0), // %0
2184 "+r"(src_argb1), // %1
2185 "+r"(dst_argb), // %2
2186 "+r"(width) // %3
2187 :
2188 : "memory", "cc"
2189#if defined(__SSE2__)
2190 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2191#endif
2192 );
2193}
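
// Scalar sketch of the blend the row functions above appear to implement:
// an "over" composite using the foreground (src_argb0) alpha, roughly
//   dst = src0 + src1 * (256 - a0) / 256
// per channel. The foreground RGB is added as-is (i.e. treated as already
// premultiplied by its alpha) and the destination alpha saturates to 255.
// Illustrative only, not a bit-exact model; the *_Sketch name is hypothetical.
static inline void ARGBBlendPixel_Sketch(const uint8* src_argb0,
                                         const uint8* src_argb1,
                                         uint8* dst_argb) {
  unsigned int na = 256u - src_argb0[3];  // 256 - foreground alpha.
  for (int c = 0; c < 3; ++c) {
    unsigned int v = src_argb0[c] + ((src_argb1[c] * na) >> 8);
    dst_argb[c] = static_cast<uint8>(v > 255 ? 255 : v);
  }
  dst_argb[3] = 255;
}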
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002194#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002195
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002196#ifdef HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002197// Blend 1 pixel at a time, unaligned
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002198void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com96af8702012-04-06 18:22:27 +00002199 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002200 asm volatile (
2201 "pcmpeqb %%xmm7,%%xmm7 \n"
2202 "psrlw $0xf,%%xmm7 \n"
2203 "pcmpeqb %%xmm6,%%xmm6 \n"
2204 "psrlw $0x8,%%xmm6 \n"
2205 "pcmpeqb %%xmm5,%%xmm5 \n"
2206 "psllw $0x8,%%xmm5 \n"
2207 "pcmpeqb %%xmm4,%%xmm4 \n"
2208 "pslld $0x18,%%xmm4 \n"
2209
2210 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002211 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002212 "1: \n"
2213 "movd (%0),%%xmm3 \n"
2214 "lea 0x4(%0),%0 \n"
2215 "movdqa %%xmm3,%%xmm0 \n"
2216 "pxor %%xmm4,%%xmm3 \n"
2217 "movd (%1),%%xmm2 \n"
2218 "psrlw $0x8,%%xmm3 \n"
2219 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2220 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2221 "pand %%xmm6,%%xmm2 \n"
2222 "paddw %%xmm7,%%xmm3 \n"
2223 "pmullw %%xmm3,%%xmm2 \n"
2224 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002225 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002226 "psrlw $0x8,%%xmm1 \n"
2227 "por %%xmm4,%%xmm0 \n"
2228 "pmullw %%xmm3,%%xmm1 \n"
2229 "psrlw $0x8,%%xmm2 \n"
2230 "paddusb %%xmm2,%%xmm0 \n"
2231 "pand %%xmm5,%%xmm1 \n"
2232 "paddusb %%xmm1,%%xmm0 \n"
2233 "sub $0x1,%3 \n"
2234 "movd %%xmm0,(%2) \n"
2235 "lea 0x4(%2),%2 \n"
2236 "jg 1b \n"
2237 : "+r"(src_argb0), // %0
2238 "+r"(src_argb1), // %1
2239 "+r"(dst_argb), // %2
2240 "+r"(width) // %3
2241 :
2242 : "memory", "cc"
2243#if defined(__SSE2__)
2244 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2245#endif
2246 );
2247}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002248#endif // HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002249
fbarchard@google.com96af8702012-04-06 18:22:27 +00002250#ifdef HAS_ARGBBLENDROW_SSSE3
2251// Shuffle table for isolating the alpha byte of each pixel and widening it to a 16-bit word.
2252CONST uvec8 kShuffleAlpha = {
2253 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2254 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2255};
2256void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002257 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002258 asm volatile (
2259 "pcmpeqb %%xmm7,%%xmm7 \n"
2260 "psrlw $0xf,%%xmm7 \n"
2261 "pcmpeqb %%xmm6,%%xmm6 \n"
2262 "psrlw $0x8,%%xmm6 \n"
2263 "pcmpeqb %%xmm5,%%xmm5 \n"
2264 "psllw $0x8,%%xmm5 \n"
2265 "pcmpeqb %%xmm4,%%xmm4 \n"
2266 "pslld $0x18,%%xmm4 \n"
2267
2268 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002269 ".p2align 4 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002270 "1: \n"
2271 "movdqu (%0),%%xmm3 \n"
2272 "movdqa %%xmm3,%%xmm0 \n"
2273 "pxor %%xmm4,%%xmm3 \n"
2274 "pshufb %4,%%xmm3 \n"
2275 "movdqu (%1),%%xmm2 \n"
2276 "pand %%xmm6,%%xmm2 \n"
2277 "paddw %%xmm7,%%xmm3 \n"
2278 "pmullw %%xmm3,%%xmm2 \n"
2279 "movdqu (%1),%%xmm1 \n"
2280 "psrlw $0x8,%%xmm1 \n"
2281 "por %%xmm4,%%xmm0 \n"
2282 "pmullw %%xmm3,%%xmm1 \n"
2283 "movdqu 0x10(%0),%%xmm3 \n"
2284 "lea 0x20(%0),%0 \n"
2285 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002286 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002287 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002288 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002289 "sub $0x4,%3 \n"
2290 "movdqa %%xmm0,(%2) \n"
2291 "jle 9f \n"
2292 "movdqa %%xmm3,%%xmm0 \n"
2293 "pxor %%xmm4,%%xmm3 \n"
2294 "movdqu 0x10(%1),%%xmm2 \n"
2295 "pshufb %4,%%xmm3 \n"
2296 "pand %%xmm6,%%xmm2 \n"
2297 "paddw %%xmm7,%%xmm3 \n"
2298 "pmullw %%xmm3,%%xmm2 \n"
2299 "movdqu 0x10(%1),%%xmm1 \n"
2300 "lea 0x20(%1),%1 \n"
2301 "psrlw $0x8,%%xmm1 \n"
2302 "por %%xmm4,%%xmm0 \n"
2303 "pmullw %%xmm3,%%xmm1 \n"
2304 "psrlw $0x8,%%xmm2 \n"
2305 "paddusb %%xmm2,%%xmm0 \n"
2306 "pand %%xmm5,%%xmm1 \n"
2307 "paddusb %%xmm1,%%xmm0 \n"
2308 "sub $0x4,%3 \n"
2309 "movdqa %%xmm0,0x10(%2) \n"
2310 "lea 0x20(%2),%2 \n"
2311 "jg 1b \n"
2312 "9: \n"
2313 : "+r"(src_argb0), // %0
2314 "+r"(src_argb1), // %1
2315 "+r"(dst_argb), // %2
2316 "+r"(width) // %3
2317 : "m"(kShuffleAlpha) // %4
2318 : "memory", "cc"
2319#if defined(__SSE2__)
2320 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2321#endif
2322 );
2323}
2324#endif // HAS_ARGBBLENDROW_SSSE3
2325
2327#ifdef HAS_ARGBBLENDROW1_SSSE3
2328// Blend 1 pixel at a time, unaligned
2329void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2330 uint8* dst_argb, int width) {
2331 asm volatile (
2332 "pcmpeqb %%xmm7,%%xmm7 \n"
2333 "psrlw $0xf,%%xmm7 \n"
2334 "pcmpeqb %%xmm6,%%xmm6 \n"
2335 "psrlw $0x8,%%xmm6 \n"
2336 "pcmpeqb %%xmm5,%%xmm5 \n"
2337 "psllw $0x8,%%xmm5 \n"
2338 "pcmpeqb %%xmm4,%%xmm4 \n"
2339 "pslld $0x18,%%xmm4 \n"
2340
2341 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002342 ".p2align 4 \n"
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002343 "1: \n"
2344 "movd (%0),%%xmm3 \n"
2345 "lea 0x4(%0),%0 \n"
2346 "movdqa %%xmm3,%%xmm0 \n"
2347 "pxor %%xmm4,%%xmm3 \n"
2348 "movd (%1),%%xmm2 \n"
2349 "pshufb %4,%%xmm3 \n"
2350 "pand %%xmm6,%%xmm2 \n"
2351 "paddw %%xmm7,%%xmm3 \n"
2352 "pmullw %%xmm3,%%xmm2 \n"
2353 "movd (%1),%%xmm1 \n"
2354 "lea 0x4(%1),%1 \n"
2355 "psrlw $0x8,%%xmm1 \n"
2356 "por %%xmm4,%%xmm0 \n"
2357 "pmullw %%xmm3,%%xmm1 \n"
2358 "psrlw $0x8,%%xmm2 \n"
2359 "paddusb %%xmm2,%%xmm0 \n"
2360 "pand %%xmm5,%%xmm1 \n"
2361 "paddusb %%xmm1,%%xmm0 \n"
2362 "sub $0x1,%3 \n"
2363 "movd %%xmm0,(%2) \n"
2364 "lea 0x4(%2),%2 \n"
2365 "jg 1b \n"
2366 : "+r"(src_argb0), // %0
2367 "+r"(src_argb1), // %1
2368 "+r"(dst_argb), // %2
2369 "+r"(width) // %3
2370 : "m"(kShuffleAlpha) // %4
2371 : "memory", "cc"
2372#if defined(__SSE2__)
2373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2374#endif
2375 );
2376}
2377#endif // HAS_ARGBBLENDROW1_SSSE3
2378
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002379#ifdef HAS_ARGBATTENUATE_SSE2
2380// Attenuate 4 pixels at a time.
2381// src_argb and dst_argb must be aligned to 16 bytes.
2382void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2383 asm volatile (
2384 "sub %0,%1 \n"
2385 "pcmpeqb %%xmm4,%%xmm4 \n"
2386 "pslld $0x18,%%xmm4 \n"
2387 "pcmpeqb %%xmm5,%%xmm5 \n"
2388 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002389
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002390 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002391 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002392 "1: \n"
2393 "movdqa (%0),%%xmm0 \n"
2394 "punpcklbw %%xmm0,%%xmm0 \n"
2395 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2396 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2397 "pmulhuw %%xmm2,%%xmm0 \n"
2398 "movdqa (%0),%%xmm1 \n"
2399 "punpckhbw %%xmm1,%%xmm1 \n"
2400 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2401 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2402 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002403 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002404 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002405 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002406 "psrlw $0x8,%%xmm1 \n"
2407 "packuswb %%xmm1,%%xmm0 \n"
2408 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002409 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002410 "sub $0x4,%2 \n"
2411 "movdqa %%xmm0,(%0,%1,1) \n"
2412 "lea 0x10(%0),%0 \n"
2413 "jg 1b \n"
2414 : "+r"(src_argb), // %0
2415 "+r"(dst_argb), // %1
2416 "+r"(width) // %2
2417 :
2418 : "memory", "cc"
2419#if defined(__SSE2__)
2420 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2421#endif
2422 );
2423}
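
// Scalar sketch of ARGBAttenuateRow: multiply each color channel by the
// pixel's alpha, keeping alpha itself unchanged. The SIMD code above uses a
// 16-bit fixed-point approximation of c * a / 255; the plain integer form is
// shown below. Illustrative only; the *_Sketch name is hypothetical.
static inline void ARGBAttenuatePixel_Sketch(const uint8* src_argb,
                                             uint8* dst_argb) {
  unsigned int a = src_argb[3];
  dst_argb[0] = static_cast<uint8>(src_argb[0] * a / 255);  // B
  dst_argb[1] = static_cast<uint8>(src_argb[1] * a / 255);  // G
  dst_argb[2] = static_cast<uint8>(src_argb[2] * a / 255);  // R
  dst_argb[3] = static_cast<uint8>(a);                      // A unchanged.
}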
2424#endif // HAS_ARGBATTENUATE_SSE2
2425
fbarchard@google.com810cd912012-04-20 20:15:27 +00002426#ifdef HAS_ARGBATTENUATE_SSSE3
2427// Shuffle table duplicating alpha
2428CONST uvec8 kShuffleAlpha0 = {
2429 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2430};
2431CONST uvec8 kShuffleAlpha1 = {
2432 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2433 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2434};
2435// Attenuate 4 pixels at a time.
2436// src_argb and dst_argb must be aligned to 16 bytes.
2437void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2438 asm volatile (
2439 "sub %0,%1 \n"
2440 "pcmpeqb %%xmm3,%%xmm3 \n"
2441 "pslld $0x18,%%xmm3 \n"
2442 "movdqa %3,%%xmm4 \n"
2443 "movdqa %4,%%xmm5 \n"
2444
2445 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002446 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002447 "1: \n"
2448 "movdqa (%0),%%xmm0 \n"
2449 "pshufb %%xmm4,%%xmm0 \n"
2450 "movdqa (%0),%%xmm1 \n"
2451 "punpcklbw %%xmm1,%%xmm1 \n"
2452 "pmulhuw %%xmm1,%%xmm0 \n"
2453 "movdqa (%0),%%xmm1 \n"
2454 "pshufb %%xmm5,%%xmm1 \n"
2455 "movdqa (%0),%%xmm2 \n"
2456 "punpckhbw %%xmm2,%%xmm2 \n"
2457 "pmulhuw %%xmm2,%%xmm1 \n"
2458 "movdqa (%0),%%xmm2 \n"
2459 "pand %%xmm3,%%xmm2 \n"
2460 "psrlw $0x8,%%xmm0 \n"
2461 "psrlw $0x8,%%xmm1 \n"
2462 "packuswb %%xmm1,%%xmm0 \n"
2463 "por %%xmm2,%%xmm0 \n"
2464 "sub $0x4,%2 \n"
2465 "movdqa %%xmm0,(%0,%1,1) \n"
2466 "lea 0x10(%0),%0 \n"
2467 "jg 1b \n"
2468 : "+r"(src_argb), // %0
2469 "+r"(dst_argb), // %1
2470 "+r"(width) // %2
2471 : "m"(kShuffleAlpha0), // %3
2472 "m"(kShuffleAlpha1) // %4
2473 : "memory", "cc"
2474#if defined(__SSE2__)
2475 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2476#endif
2477 );
2478}
2479#endif // HAS_ARGBATTENUATE_SSSE3
2480
2481#ifdef HAS_ARGBUNATTENUATE_SSE2
2482// Divide source RGB by alpha and store to destination.
2483// b = (b * 255 + (a / 2)) / a;
2484// g = (g * 255 + (a / 2)) / a;
2485// r = (r * 255 + (a / 2)) / a;
2486// The reciprocal method is off by 1 for some values, e.g. 125.
2487// 8.16 fixed point inverse table
2488#define T(a) (0x10000 / (a))
2489CONST uint32 fixed_invtbl8[256] = {
2490 0x100, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
2491 T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
2492 T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
2493 T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
2494 T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
2495 T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
2496 T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
2497 T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
2498 T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
2499 T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
2500 T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
2501 T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
2502 T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
2503 T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
2504 T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
2505 T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
2506 T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
2507 T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
2508 T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
2509 T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
2510 T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
2511 T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
2512 T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
2513 T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
2514 T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
2515 T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
2516 T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
2517 T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
2518 T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
2519 T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
2520 T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
2521 T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x100 };
2522#undef T
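
// Scalar sketch of how the table is used by ARGBUnattenuateRow below: each
// color channel is scaled by the 8.16 fixed-point reciprocal 0x10000 / a,
// which approximates the exact un-premultiply c * 255 / a, with the small
// rounding difference noted above. Illustrative only; the *_Sketch name is
// hypothetical.
static inline void ARGBUnattenuatePixel_Sketch(const uint8* src_argb,
                                               uint8* dst_argb) {
  unsigned int a = src_argb[3];
  unsigned int inv = fixed_invtbl8[a];          // ~65536 / a.
  for (int c = 0; c < 3; ++c) {
    unsigned int v = (src_argb[c] * inv) >> 8;  // ~c * 256 / a.
    dst_argb[c] = static_cast<uint8>(v > 255 ? 255 : v);
  }
  dst_argb[3] = static_cast<uint8>(a);
}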
2523
2524// Unattenuate 4 pixels at a time.
2525// src_argb and dst_argb must be aligned to 16 bytes.
2526void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2527 int width) {
2528 uintptr_t alpha = 0;
2529 asm volatile (
2530 "sub %0,%1 \n"
2531 "pcmpeqb %%xmm4,%%xmm4 \n"
2532 "pslld $0x18,%%xmm4 \n"
2533
2534 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002535 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002536 "1: \n"
2537 "movdqa (%0),%%xmm0 \n"
2538 "movzb 0x3(%0),%3 \n"
2539 "punpcklbw %%xmm0,%%xmm0 \n"
2540 "movd 0x0(%4,%3,4),%%xmm2 \n"
2541 "movzb 0x7(%0),%3 \n"
2542 "movd 0x0(%4,%3,4),%%xmm3 \n"
2543 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2544 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2545 "movlhps %%xmm3,%%xmm2 \n"
2546 "pmulhuw %%xmm2,%%xmm0 \n"
2547 "movdqa (%0),%%xmm1 \n"
2548 "movzb 0xb(%0),%3 \n"
2549 "punpckhbw %%xmm1,%%xmm1 \n"
2550 "movd 0x0(%4,%3,4),%%xmm2 \n"
2551 "movzb 0xf(%0),%3 \n"
2552 "movd 0x0(%4,%3,4),%%xmm3 \n"
2553 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2554 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2555 "movlhps %%xmm3,%%xmm2 \n"
2556 "pmulhuw %%xmm2,%%xmm1 \n"
2557 "movdqa (%0),%%xmm2 \n"
2558 "pand %%xmm4,%%xmm2 \n"
2559 "packuswb %%xmm1,%%xmm0 \n"
2560 "por %%xmm2,%%xmm0 \n"
2561 "sub $0x4,%2 \n"
2562 "movdqa %%xmm0,(%0,%1,1) \n"
2563 "lea 0x10(%0),%0 \n"
2564 "jg 1b \n"
2565 : "+r"(src_argb), // %0
2566 "+r"(dst_argb), // %1
2567 "+r"(width), // %2
2568 "+r"(alpha) // %3
2569 : "r"(fixed_invtbl8) // %4
2570 : "memory", "cc"
2571#if defined(__SSE2__)
2572 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2573#endif
2574 );
2575}
2576#endif // HAS_ARGBUNATTENUATE_SSE2
2577
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002578#endif // defined(__x86_64__) || defined(__i386__)
2579
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002580#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002581} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002582} // namespace libyuv
2583#endif