/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "source/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64
#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Constants for BGRA
CONST vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

CONST vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

CONST vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
CONST vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

CONST vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

CONST vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
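
// The coefficient tables above are applied with pmaddubsw/phaddw followed by
// a shift and the kAddY16/kAddUV128 offsets. A scalar sketch of the same
// fixed point math (reference only, not part of the build; the RGBToY/U/V
// names are illustrative, and the UV functions below additionally average
// 2x2 pixel blocks with pavgb before applying these formulas):
#if 0
static inline uint8 RGBToY(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((33 * r + 65 * g + 13 * b) >> 7) + 16);
}
static inline uint8 RGBToU(uint8 r, uint8 g, uint8 b) {
  // Arithmetic right shift assumed, matching psraw.
  return static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static inline uint8 RGBToV(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}
#endif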

// Shuffle table for converting RGB24 to ARGB.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ARGB to RGB24.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

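// Each shuffle table is consumed by pshufb, which for every output byte i
// copies src[mask[i] & 0x0f], or writes zero when bit 7 of mask[i] is set
// (the 128u entries above). A scalar sketch of that behaviour (reference
// only, not part of the build; the Shuffle16 name is illustrative):
#if 0
static void Shuffle16(const uint8* src, const uvec8 mask, uint8* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0u : src[mask[i] & 0x0f];
  }
}
#endif
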
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    ".p2align 4 \n"
    "1: \n"
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "lea 0x20(%1),%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
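
// Scalar equivalent of the row above (reference only, not part of the build;
// the function name is illustrative): replicate the grey value into B, G and
// R and force alpha to 0xff, which the SSE2 code does with punpck and por.
#if 0
static void I400ToARGBRow_Reference(const uint8* src_y, uint8* dst_argb,
                                    int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8 y = src_y[i];
    dst_argb[0] = y;     // B
    dst_argb[1] = y;     // G
    dst_argb[2] = y;     // R
    dst_argb[3] = 255u;  // A
    dst_argb += 4;
  }
}
#endif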

void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"

  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20082008,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
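
// Scalar sketch of the RGB565 expansion above (reference only, not part of
// the build; the function name is illustrative). Each 5- or 6-bit field is
// widened to 8 bits by replicating its top bits, which is what the pmulhuw
// multiplies by 0x0108 and 0x2008 accomplish per 16-bit lane.
#if 0
static void RGB565ToARGBPixel(uint16 p, uint8* dst_argb) {
  uint8 b = p & 0x1f;
  uint8 g = (p >> 5) & 0x3f;
  uint8 r = (p >> 11) & 0x1f;
  dst_argb[0] = static_cast<uint8>((b << 3) | (b >> 2));  // B
  dst_argb[1] = static_cast<uint8>((g << 2) | (g >> 4));  // G
  dst_argb[2] = static_cast<uint8>((r << 3) | (r >> 2));  // R
  dst_argb[3] = 255u;                                     // A
}
#endif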

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%1,%0,2) \n"
    "movdqa %%xmm1,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if stack frame is disabled. Doing 2 assembly blocks is a workaround
// and considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I420TOARGBROW_SSSE3
#define UB 127 /* 2.018 * 64 = 129, clamped to 127 to fit int8 */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

struct {
  vec8 kUVToB;
  vec8 kUVToG;
  vec8 kUVToR;
  vec16 kUVBiasB;
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;
  vec16 kYToRgb;
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};
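
// Scalar sketch of the fixed point math the YUVTORGB macro below performs
// per pixel (reference only, not part of the build; the YuvPixel name is
// illustrative). The BT.601 coefficients above are scaled by 64, so results
// are shifted right by 6 and saturated to 8 bits, which packuswb does in
// the SIMD version. Arithmetic right shift on negatives is assumed.
#if 0
static void YuvPixel(uint8 y, uint8 u, uint8 v,
                     uint8* b, uint8* g, uint8* r) {
  int y1 = (static_cast<int>(y) - 16) * YG;
  int b1 = (y1 + (static_cast<int>(u) - 128) * UB) >> 6;
  int g1 = (y1 + (static_cast<int>(u) - 128) * UG +
                 (static_cast<int>(v) - 128) * VG) >> 6;
  int r1 = (y1 + (static_cast<int>(v) - 128) * VR) >> 6;
  *b = static_cast<uint8>(b1 < 0 ? 0 : (b1 > 255 ? 255 : b1));
  *g = static_cast<uint8>(g1 < 0 ? 0 : (g1 > 255 ? 255 : g1));
  *r = static_cast<uint8>(r1 < 0 ? 0 : (r1 > 255 ? 255 : r1));
}
#endif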

// Convert 8 pixels
#define YUVTORGB \
    "movd (%1),%%xmm0 \n" \
    "movd (%1,%2,1),%%xmm1 \n" \
    "lea 0x4(%1),%1 \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%5),%%xmm0 \n" \
    "pmaddubsw 16(%5),%%xmm1 \n" \
    "pmaddubsw 32(%5),%%xmm2 \n" \
    "psubw 48(%5),%%xmm0 \n" \
    "psubw 64(%5),%%xmm1 \n" \
    "psubw 80(%5),%%xmm2 \n" \
    "movq (%0),%%xmm3 \n" \
    "lea 0x8(%0),%0 \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%5),%%xmm3 \n" \
    "pmullw 112(%5),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5,(%3) \n"
    "movdqa %%xmm0,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%3) \n"
    "movdqu %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqu %%xmm5,(%3) \n"
    "movdqu %%xmm0,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqu %%xmm2,(%3) \n"
    "movdqu %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

fbarchard@google.comf1b60632012-02-17 19:27:20 +00001496void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1497 const uint8* u_buf,
1498 const uint8* v_buf,
1499 uint8* rgb_buf,
1500 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001501 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001502 "sub %1,%2 \n"
1503 "pcmpeqb %%xmm5,%%xmm5 \n"
1504 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001505 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001506 "1: \n"
1507 "movd (%1),%%xmm0 \n"
1508 "movd (%1,%2,1),%%xmm1 \n"
1509 "lea 0x4(%1),%1 \n"
1510 "punpcklbw %%xmm1,%%xmm0 \n"
1511 "movdqa %%xmm0,%%xmm1 \n"
1512 "movdqa %%xmm0,%%xmm2 \n"
1513 "pmaddubsw (%5),%%xmm0 \n"
1514 "pmaddubsw 16(%5),%%xmm1 \n"
1515 "pmaddubsw 32(%5),%%xmm2 \n"
1516 "psubw 48(%5),%%xmm0 \n"
1517 "psubw 64(%5),%%xmm1 \n"
1518 "psubw 80(%5),%%xmm2 \n"
1519 "movd (%0),%%xmm3 \n"
1520 "lea 0x4(%0),%0 \n"
1521 "punpcklbw %%xmm4,%%xmm3 \n"
1522 "psubsw 96(%5),%%xmm3 \n"
1523 "pmullw 112(%5),%%xmm3 \n"
1524 "paddsw %%xmm3,%%xmm0 \n"
1525 "paddsw %%xmm3,%%xmm1 \n"
1526 "paddsw %%xmm3,%%xmm2 \n"
1527 "psraw $0x6,%%xmm0 \n"
1528 "psraw $0x6,%%xmm1 \n"
1529 "psraw $0x6,%%xmm2 \n"
1530 "packuswb %%xmm0,%%xmm0 \n"
1531 "packuswb %%xmm1,%%xmm1 \n"
1532 "packuswb %%xmm2,%%xmm2 \n"
1533 "punpcklbw %%xmm1,%%xmm0 \n"
1534 "punpcklbw %%xmm5,%%xmm2 \n"
1535 "punpcklwd %%xmm2,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001536 "sub $0x4,%4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001537 "movdqa %%xmm0,(%3) \n"
1538 "lea 0x10(%3),%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001539 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001540 : "+r"(y_buf), // %0
1541 "+r"(u_buf), // %1
1542 "+r"(v_buf), // %2
1543 "+r"(rgb_buf), // %3
1544 "+rm"(width) // %4
1545 : "r"(&kYuvConstants.kUVToB) // %5
1546 : "memory", "cc"
1547#if defined(__SSE2__)
1548 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1549#endif
1550 );
1551}
1552#endif
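// Illustrative scalar sketch (not part of the SIMD path or the library API):
// the YUVTORGB macro above applies a BT.601-style fixed-point conversion using
// the coefficient table referenced through kYuvConstants, followed by an
// arithmetic shift and saturation. The rounded integer coefficients below are
// stand-ins for the table values, so rounding may differ slightly.
static inline uint8 Clamp255_Sketch(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : static_cast<uint8>(v));
}
static void YuvToArgbPixel_Sketch(uint8 y, uint8 u, uint8 v, uint8* argb) {
  int c = static_cast<int>(y) - 16;
  int d = static_cast<int>(u) - 128;
  int e = static_cast<int>(v) - 128;
  argb[0] = Clamp255_Sketch((298 * c + 516 * d + 128) >> 8);            // B
  argb[1] = Clamp255_Sketch((298 * c - 100 * d - 208 * e + 128) >> 8);  // G
  argb[2] = Clamp255_Sketch((298 * c + 409 * e + 128) >> 8);            // R
  argb[3] = 255;                                                        // A
}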
1553
1554#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001555void YToARGBRow_SSE2(const uint8* y_buf,
1556 uint8* rgb_buf,
1557 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001558 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001559 "pcmpeqb %%xmm4,%%xmm4 \n"
1560 "pslld $0x18,%%xmm4 \n"
1561 "mov $0x10001000,%%eax \n"
1562 "movd %%eax,%%xmm3 \n"
1563 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1564 "mov $0x012a012a,%%eax \n"
1565 "movd %%eax,%%xmm2 \n"
1566 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001567 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001568 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001569 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001570 "movq (%0),%%xmm0 \n"
1571 "lea 0x8(%0),%0 \n"
1572 "punpcklbw %%xmm0,%%xmm0 \n"
1573 "psubusw %%xmm3,%%xmm0 \n"
1574 "pmulhuw %%xmm2,%%xmm0 \n"
1575 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001576
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001577 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001578 "punpcklbw %%xmm0,%%xmm0 \n"
1579 "movdqa %%xmm0,%%xmm1 \n"
1580 "punpcklwd %%xmm0,%%xmm0 \n"
1581 "punpckhwd %%xmm1,%%xmm1 \n"
1582 "por %%xmm4,%%xmm0 \n"
1583 "por %%xmm4,%%xmm1 \n"
1584 "movdqa %%xmm0,(%1) \n"
1585 "movdqa %%xmm1,16(%1) \n"
1586 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001587
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001588 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001589 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00001590 : "+r"(y_buf), // %0
1591 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00001592 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001593 :
1594 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001595#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001596 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001597#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001598 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001599}
fbarchard@google.comb6149762011-11-07 21:58:52 +00001600#endif
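// Illustrative scalar sketch (not part of the SIMD path or the library API):
// YToARGBRow expands a luma-only row into gray ARGB, i.e. the (y - 16) * 1.164
// value from Step 1 is replicated into B, G and R with alpha set to 255.
static void YToArgbPixel_Sketch(uint8 y, uint8* argb) {
  int g = ((static_cast<int>(y) - 16) * 298) >> 8;  // 298/256 ~= 1.164
  g = g < 0 ? 0 : (g > 255 ? 255 : g);
  argb[0] = argb[1] = argb[2] = static_cast<uint8>(g);
  argb[3] = 255;
}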
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001601
fbarchard@google.com42831e02012-01-21 02:54:17 +00001602#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001603// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001604CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001605 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1606};
1607
fbarchard@google.com42831e02012-01-21 02:54:17 +00001608void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001609 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001610 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001611 "movdqa %3,%%xmm5 \n"
1612 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001613 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001614 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001615 "movdqa (%0,%2),%%xmm0 \n"
1616 "pshufb %%xmm5,%%xmm0 \n"
1617 "sub $0x10,%2 \n"
1618 "movdqa %%xmm0,(%1) \n"
1619 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001620 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00001621 : "+r"(src), // %0
1622 "+r"(dst), // %1
1623 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001624 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001625 : "memory", "cc"
1626#if defined(__SSE2__)
1627 , "xmm0", "xmm5"
1628#endif
1629 );
1630}
1631#endif
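// Illustrative scalar sketch (not part of the SIMD path or the library API):
// mirroring writes the row in reverse byte order, dst[i] = src[width - 1 - i];
// the SIMD versions do the same 16 bytes at a time with pshufb or word shuffles.
static void MirrorRow_Sketch(const uint8* src, uint8* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}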
1632
fbarchard@google.com42831e02012-01-21 02:54:17 +00001633#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001634void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001635 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001636 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001637 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001638 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001639 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001640 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001641 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001642 "psllw $0x8,%%xmm0 \n"
1643 "psrlw $0x8,%%xmm1 \n"
1644 "por %%xmm1,%%xmm0 \n"
1645 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
1646 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
1647 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
1648 "sub $0x10,%2 \n"
1649 "movdqu %%xmm0,(%1) \n"
1650 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001651 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001652 : "+r"(src), // %0
1653 "+r"(dst), // %1
1654 "+r"(temp_width) // %2
1655 :
1656 : "memory", "cc"
1657#if defined(__SSE2__)
1658 , "xmm0", "xmm1"
1659#endif
1660 );
1661}
1662#endif
1663
fbarchard@google.com16a96642012-03-02 22:38:09 +00001664#ifdef HAS_MIRRORROW_UV_SSSE3
1665// Shuffle table for reversing the bytes of UV channels.
1666CONST uvec8 kShuffleMirrorUV = {
1667 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
1668};
1669void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
1670 int width) {
1671 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001672 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00001673 "movdqa %4,%%xmm1 \n"
1674 "lea -16(%0,%3,2),%0 \n"
1675 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001676 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001677 "1: \n"
1678 "movdqa (%0),%%xmm0 \n"
1679 "lea -16(%0),%0 \n"
1680 "pshufb %%xmm1,%%xmm0 \n"
1681 "sub $8,%3 \n"
1682 "movlpd %%xmm0,(%1) \n"
1683 "movhpd %%xmm0,(%1,%2) \n"
1684 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001685 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001686 : "+r"(src), // %0
1687 "+r"(dst_u), // %1
1688 "+r"(dst_v), // %2
1689 "+r"(temp_width) // %3
1690 : "m"(kShuffleMirrorUV) // %4
1691 : "memory", "cc"
1692#if defined(__SSE2__)
1693 , "xmm0", "xmm1"
1694#endif
1695 );
1696}
1697#endif
1698
fbarchard@google.com55663022012-04-26 00:01:41 +00001699#ifdef HAS_ADDROW_SSE2
 1700// dst must be 16-byte aligned; width must be a multiple of 16.
1701void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
1702 asm volatile (
1703 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001704 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001705 "1: \n"
1706 "movdqu (%0),%%xmm2 \n"
1707 "lea 0x10(%0),%0 \n"
1708 "movdqa (%1),%%xmm0 \n"
1709 "movdqa 0x10(%1),%%xmm1 \n"
1710 "movdqa %%xmm2,%%xmm3 \n"
1711 "punpcklbw %%xmm4,%%xmm2 \n"
1712 "punpckhbw %%xmm4,%%xmm3 \n"
1713 "paddusw %%xmm2,%%xmm0 \n"
1714 "paddusw %%xmm3,%%xmm1 \n"
1715 "sub $0x10,%2 \n"
1716 "movdqa %%xmm0,(%1) \n"
1717 "movdqa %%xmm1,0x10(%1) \n"
1718 "lea 0x20(%1),%1 \n"
1719 "jg 1b \n"
1720 : "+r"(src), // %0
1721 "+r"(dst), // %1
1722 "+r"(width) // %2
1723 :
1724 : "memory", "cc"
1725#if defined(__SSE2__)
1726 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1727#endif
1728 );
1729}
1730
 1731// dst must be 16-byte aligned; width must be a multiple of 16.
1732void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
1733 asm volatile (
1734 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001735 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001736 "1: \n"
1737 "movdqu (%0),%%xmm2 \n"
1738 "lea 0x10(%0),%0 \n"
1739 "movdqa (%1),%%xmm0 \n"
1740 "movdqa 0x10(%1),%%xmm1 \n"
1741 "movdqa %%xmm2,%%xmm3 \n"
1742 "punpcklbw %%xmm4,%%xmm2 \n"
1743 "punpckhbw %%xmm4,%%xmm3 \n"
1744 "psubusw %%xmm2,%%xmm0 \n"
1745 "psubusw %%xmm3,%%xmm1 \n"
1746 "sub $0x10,%2 \n"
1747 "movdqa %%xmm0,(%1) \n"
1748 "movdqa %%xmm1,0x10(%1) \n"
1749 "lea 0x20(%1),%1 \n"
1750 "jg 1b \n"
1751 : "+r"(src), // %0
1752 "+r"(dst), // %1
1753 "+r"(width) // %2
1754 :
1755 : "memory", "cc"
1756#if defined(__SSE2__)
1757 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1758#endif
1759 );
1760}
1761#endif // HAS_ADDROW_SSE2
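// Illustrative scalar sketch (not part of the SIMD path or the library API):
// AddRow/SubRow above widen each 8-bit source sample and accumulate into (or
// subtract from) the 16-bit destination with unsigned saturation, matching
// the paddusw/psubusw instructions.
static void AddRow_Sketch(const uint8* src, uint16* dst, int width) {
  for (int i = 0; i < width; ++i) {
    unsigned int sum = static_cast<unsigned int>(dst[i]) + src[i];
    dst[i] = sum > 65535u ? 65535u : static_cast<uint16>(sum);  // saturate
  }
}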
1762
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001763#ifdef HAS_SPLITUV_SSE2
1764void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001765 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001766 "pcmpeqb %%xmm5,%%xmm5 \n"
1767 "psrlw $0x8,%%xmm5 \n"
1768 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001769 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001770 "1: \n"
1771 "movdqa (%0),%%xmm0 \n"
1772 "movdqa 0x10(%0),%%xmm1 \n"
1773 "lea 0x20(%0),%0 \n"
1774 "movdqa %%xmm0,%%xmm2 \n"
1775 "movdqa %%xmm1,%%xmm3 \n"
1776 "pand %%xmm5,%%xmm0 \n"
1777 "pand %%xmm5,%%xmm1 \n"
1778 "packuswb %%xmm1,%%xmm0 \n"
1779 "psrlw $0x8,%%xmm2 \n"
1780 "psrlw $0x8,%%xmm3 \n"
1781 "packuswb %%xmm3,%%xmm2 \n"
1782 "movdqa %%xmm0,(%1) \n"
1783 "movdqa %%xmm2,(%1,%2) \n"
1784 "lea 0x10(%1),%1 \n"
1785 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001786 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001787 : "+r"(src_uv), // %0
1788 "+r"(dst_u), // %1
1789 "+r"(dst_v), // %2
1790 "+r"(pix) // %3
1791 :
1792 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001793#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001794 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001795#endif
1796 );
1797}
1798#endif
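// Illustrative scalar sketch (not part of the SIMD path or the library API):
// SplitUV de-interleaves packed UV pairs (U0 V0 U1 V1 ...) into separate
// U and V planes, which is what the pand/psrlw/packuswb sequence above does
// 16 pixels at a time.
static void SplitUV_Sketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                           int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[2 * i + 0];  // even bytes are U
    dst_v[i] = src_uv[2 * i + 1];  // odd bytes are V
  }
}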
1799
fbarchard@google.com19932f82012-02-16 22:19:14 +00001800#ifdef HAS_COPYROW_SSE2
1801void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001802 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001803 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00001804 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001805 "1: \n"
1806 "movdqa (%0),%%xmm0 \n"
1807 "movdqa 0x10(%0),%%xmm1 \n"
1808 "movdqa %%xmm0,(%0,%1) \n"
1809 "movdqa %%xmm1,0x10(%0,%1) \n"
1810 "lea 0x20(%0),%0 \n"
1811 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001812 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001813 : "+r"(src), // %0
1814 "+r"(dst), // %1
1815 "+r"(count) // %2
1816 :
1817 : "memory", "cc"
1818#if defined(__SSE2__)
1819 , "xmm0", "xmm1"
1820#endif
1821 );
1822}
1823#endif // HAS_COPYROW_SSE2
1824
1825#ifdef HAS_COPYROW_X86
1826void CopyRow_X86(const uint8* src, uint8* dst, int width) {
1827 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001828 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00001829 "shr $0x2,%2 \n"
1830 "rep movsl \n"
1831 : "+S"(src), // %0
1832 "+D"(dst), // %1
1833 "+c"(width_tmp) // %2
1834 :
1835 : "memory", "cc"
1836 );
1837}
1838#endif
1839
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001840#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001841void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001842 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001843 "pcmpeqb %%xmm5,%%xmm5 \n"
1844 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001845 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001846 "1: \n"
1847 "movdqa (%0),%%xmm0 \n"
1848 "movdqa 0x10(%0),%%xmm1 \n"
1849 "lea 0x20(%0),%0 \n"
1850 "pand %%xmm5,%%xmm0 \n"
1851 "pand %%xmm5,%%xmm1 \n"
1852 "packuswb %%xmm1,%%xmm0 \n"
1853 "movdqa %%xmm0,(%1) \n"
1854 "lea 0x10(%1),%1 \n"
1855 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001856 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001857 : "+r"(src_yuy2), // %0
1858 "+r"(dst_y), // %1
1859 "+r"(pix) // %2
1860 :
1861 : "memory", "cc"
1862#if defined(__SSE2__)
1863 , "xmm0", "xmm1", "xmm5"
1864#endif
1865 );
1866}
1867
1868void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
 1869 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001870 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001871 "pcmpeqb %%xmm5,%%xmm5 \n"
1872 "psrlw $0x8,%%xmm5 \n"
1873 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001874 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001875 "1: \n"
1876 "movdqa (%0),%%xmm0 \n"
1877 "movdqa 0x10(%0),%%xmm1 \n"
1878 "movdqa (%0,%4,1),%%xmm2 \n"
1879 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1880 "lea 0x20(%0),%0 \n"
1881 "pavgb %%xmm2,%%xmm0 \n"
1882 "pavgb %%xmm3,%%xmm1 \n"
1883 "psrlw $0x8,%%xmm0 \n"
1884 "psrlw $0x8,%%xmm1 \n"
1885 "packuswb %%xmm1,%%xmm0 \n"
1886 "movdqa %%xmm0,%%xmm1 \n"
1887 "pand %%xmm5,%%xmm0 \n"
1888 "packuswb %%xmm0,%%xmm0 \n"
1889 "psrlw $0x8,%%xmm1 \n"
1890 "packuswb %%xmm1,%%xmm1 \n"
1891 "movq %%xmm0,(%1) \n"
1892 "movq %%xmm1,(%1,%2) \n"
1893 "lea 0x8(%1),%1 \n"
1894 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001895 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001896 : "+r"(src_yuy2), // %0
1897 "+r"(dst_u), // %1
1898 "+r"(dst_y), // %2
1899 "+r"(pix) // %3
1900 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1901 : "memory", "cc"
1902#if defined(__SSE2__)
1903 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1904#endif
1905 );
1906}
1907
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00001908
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001909void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
1910 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001911 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001912 "pcmpeqb %%xmm5,%%xmm5 \n"
1913 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001914 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001915 "1: \n"
1916 "movdqu (%0),%%xmm0 \n"
1917 "movdqu 0x10(%0),%%xmm1 \n"
1918 "lea 0x20(%0),%0 \n"
1919 "pand %%xmm5,%%xmm0 \n"
1920 "pand %%xmm5,%%xmm1 \n"
1921 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001922 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001923 "movdqu %%xmm0,(%1) \n"
1924 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001925 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001926 : "+r"(src_yuy2), // %0
1927 "+r"(dst_y), // %1
1928 "+r"(pix) // %2
1929 :
1930 : "memory", "cc"
1931#if defined(__SSE2__)
1932 , "xmm0", "xmm1", "xmm5"
1933#endif
1934 );
1935}
1936
1937void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
1938 int stride_yuy2,
 1939 uint8* dst_u, uint8* dst_v,
1940 int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001941 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001942 "pcmpeqb %%xmm5,%%xmm5 \n"
1943 "psrlw $0x8,%%xmm5 \n"
1944 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001945 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001946 "1: \n"
1947 "movdqu (%0),%%xmm0 \n"
1948 "movdqu 0x10(%0),%%xmm1 \n"
1949 "movdqu (%0,%4,1),%%xmm2 \n"
1950 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1951 "lea 0x20(%0),%0 \n"
1952 "pavgb %%xmm2,%%xmm0 \n"
1953 "pavgb %%xmm3,%%xmm1 \n"
1954 "psrlw $0x8,%%xmm0 \n"
1955 "psrlw $0x8,%%xmm1 \n"
1956 "packuswb %%xmm1,%%xmm0 \n"
1957 "movdqa %%xmm0,%%xmm1 \n"
1958 "pand %%xmm5,%%xmm0 \n"
1959 "packuswb %%xmm0,%%xmm0 \n"
1960 "psrlw $0x8,%%xmm1 \n"
1961 "packuswb %%xmm1,%%xmm1 \n"
1962 "movq %%xmm0,(%1) \n"
1963 "movq %%xmm1,(%1,%2) \n"
1964 "lea 0x8(%1),%1 \n"
1965 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001966 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001967 : "+r"(src_yuy2), // %0
1968 "+r"(dst_u), // %1
1969 "+r"(dst_y), // %2
1970 "+r"(pix) // %3
1971 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1972 : "memory", "cc"
1973#if defined(__SSE2__)
1974 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1975#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001976 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001977}
1978
1979void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001980 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001981 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001982 "1: \n"
1983 "movdqa (%0),%%xmm0 \n"
1984 "movdqa 0x10(%0),%%xmm1 \n"
1985 "lea 0x20(%0),%0 \n"
1986 "psrlw $0x8,%%xmm0 \n"
1987 "psrlw $0x8,%%xmm1 \n"
1988 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001989 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001990 "movdqa %%xmm0,(%1) \n"
1991 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001992 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001993 : "+r"(src_uyvy), // %0
1994 "+r"(dst_y), // %1
1995 "+r"(pix) // %2
1996 :
1997 : "memory", "cc"
1998#if defined(__SSE2__)
1999 , "xmm0", "xmm1"
2000#endif
2001 );
2002}
2003
2004void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
 2005 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002006 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002007 "pcmpeqb %%xmm5,%%xmm5 \n"
2008 "psrlw $0x8,%%xmm5 \n"
2009 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002010 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002011 "1: \n"
2012 "movdqa (%0),%%xmm0 \n"
2013 "movdqa 0x10(%0),%%xmm1 \n"
2014 "movdqa (%0,%4,1),%%xmm2 \n"
2015 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2016 "lea 0x20(%0),%0 \n"
2017 "pavgb %%xmm2,%%xmm0 \n"
2018 "pavgb %%xmm3,%%xmm1 \n"
2019 "pand %%xmm5,%%xmm0 \n"
2020 "pand %%xmm5,%%xmm1 \n"
2021 "packuswb %%xmm1,%%xmm0 \n"
2022 "movdqa %%xmm0,%%xmm1 \n"
2023 "pand %%xmm5,%%xmm0 \n"
2024 "packuswb %%xmm0,%%xmm0 \n"
2025 "psrlw $0x8,%%xmm1 \n"
2026 "packuswb %%xmm1,%%xmm1 \n"
2027 "movq %%xmm0,(%1) \n"
2028 "movq %%xmm1,(%1,%2) \n"
2029 "lea 0x8(%1),%1 \n"
2030 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002031 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002032 : "+r"(src_uyvy), // %0
2033 "+r"(dst_u), // %1
2034 "+r"(dst_y), // %2
2035 "+r"(pix) // %3
2036 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2037 : "memory", "cc"
2038#if defined(__SSE2__)
2039 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2040#endif
2041 );
2042}
2043
2044void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2045 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002046 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002047 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002048 "1: \n"
2049 "movdqu (%0),%%xmm0 \n"
2050 "movdqu 0x10(%0),%%xmm1 \n"
2051 "lea 0x20(%0),%0 \n"
2052 "psrlw $0x8,%%xmm0 \n"
2053 "psrlw $0x8,%%xmm1 \n"
2054 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002055 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002056 "movdqu %%xmm0,(%1) \n"
2057 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002058 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002059 : "+r"(src_uyvy), // %0
2060 "+r"(dst_y), // %1
2061 "+r"(pix) // %2
2062 :
2063 : "memory", "cc"
2064#if defined(__SSE2__)
2065 , "xmm0", "xmm1"
2066#endif
2067 );
2068}
2069
2070void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
 2071 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002072 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002073 "pcmpeqb %%xmm5,%%xmm5 \n"
2074 "psrlw $0x8,%%xmm5 \n"
2075 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002076 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002077 "1: \n"
2078 "movdqu (%0),%%xmm0 \n"
2079 "movdqu 0x10(%0),%%xmm1 \n"
2080 "movdqu (%0,%4,1),%%xmm2 \n"
2081 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2082 "lea 0x20(%0),%0 \n"
2083 "pavgb %%xmm2,%%xmm0 \n"
2084 "pavgb %%xmm3,%%xmm1 \n"
2085 "pand %%xmm5,%%xmm0 \n"
2086 "pand %%xmm5,%%xmm1 \n"
2087 "packuswb %%xmm1,%%xmm0 \n"
2088 "movdqa %%xmm0,%%xmm1 \n"
2089 "pand %%xmm5,%%xmm0 \n"
2090 "packuswb %%xmm0,%%xmm0 \n"
2091 "psrlw $0x8,%%xmm1 \n"
2092 "packuswb %%xmm1,%%xmm1 \n"
2093 "movq %%xmm0,(%1) \n"
2094 "movq %%xmm1,(%1,%2) \n"
2095 "lea 0x8(%1),%1 \n"
2096 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002097 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002098 : "+r"(src_uyvy), // %0
2099 "+r"(dst_u), // %1
2100 "+r"(dst_y), // %2
2101 "+r"(pix) // %3
2102 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2103 : "memory", "cc"
2104#if defined(__SSE2__)
2105 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2106#endif
2107 );
2108}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002109#endif // HAS_YUY2TOYROW_SSE2
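// Illustrative scalar sketch (not part of the SIMD path or the library API):
// YUY2 packs pixels as Y0 U0 Y1 V0 and UYVY as U0 Y0 V0 Y1. The Y rows above
// keep every other byte; the UV rows average two source rows (pavgb) and then
// split the chroma bytes into U and V planes.
static void YUY2ToYRow_Sketch(const uint8* src_yuy2, uint8* dst_y, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_y[i] = src_yuy2[2 * i];  // Y bytes sit at even offsets in YUY2
  }
}
static void UYVYToYRow_Sketch(const uint8* src_uyvy, uint8* dst_y, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_y[i] = src_uyvy[2 * i + 1];  // Y bytes sit at odd offsets in UYVY
  }
}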
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002110
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002111#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002112// Blend 8 pixels at a time.
 2113// src_argb0 and src_argb1 may be unaligned.
 2114// dst_argb must be aligned to 16 bytes.
 2115// width must be a multiple of 4 pixels.
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002116void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.comc757f302012-04-03 00:49:16 +00002117 uint8* dst_argb, int width) {
2118 asm volatile (
2119 "pcmpeqb %%xmm7,%%xmm7 \n"
2120 "psrlw $0xf,%%xmm7 \n"
2121 "pcmpeqb %%xmm6,%%xmm6 \n"
2122 "psrlw $0x8,%%xmm6 \n"
2123 "pcmpeqb %%xmm5,%%xmm5 \n"
2124 "psllw $0x8,%%xmm5 \n"
2125 "pcmpeqb %%xmm4,%%xmm4 \n"
2126 "pslld $0x18,%%xmm4 \n"
2127
2128 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002129 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002130 "1: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002131 "movdqu (%0),%%xmm3 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002132 "movdqa %%xmm3,%%xmm0 \n"
2133 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002134 "movdqu (%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002135 "psrlw $0x8,%%xmm3 \n"
2136 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2137 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2138 "pand %%xmm6,%%xmm2 \n"
2139 "paddw %%xmm7,%%xmm3 \n"
2140 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002141 "movdqu (%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002142 "psrlw $0x8,%%xmm1 \n"
2143 "por %%xmm4,%%xmm0 \n"
2144 "pmullw %%xmm3,%%xmm1 \n"
2145 "movdqu 0x10(%0),%%xmm3 \n"
2146 "lea 0x20(%0),%0 \n"
2147 "psrlw $0x8,%%xmm2 \n"
2148 "paddusb %%xmm2,%%xmm0 \n"
2149 "pand %%xmm5,%%xmm1 \n"
2150 "paddusb %%xmm1,%%xmm0 \n"
2151 "sub $0x4,%3 \n"
2152 "movdqa %%xmm0,(%2) \n"
2153 "jle 9f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002154 "movdqa %%xmm3,%%xmm0 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002155 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002156 "movdqu 0x10(%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002157 "psrlw $0x8,%%xmm3 \n"
2158 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2159 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2160 "pand %%xmm6,%%xmm2 \n"
2161 "paddw %%xmm7,%%xmm3 \n"
2162 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002163 "movdqu 0x10(%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002164 "lea 0x20(%1),%1 \n"
2165 "psrlw $0x8,%%xmm1 \n"
2166 "por %%xmm4,%%xmm0 \n"
2167 "pmullw %%xmm3,%%xmm1 \n"
2168 "psrlw $0x8,%%xmm2 \n"
2169 "paddusb %%xmm2,%%xmm0 \n"
2170 "pand %%xmm5,%%xmm1 \n"
2171 "paddusb %%xmm1,%%xmm0 \n"
2172 "sub $0x4,%3 \n"
2173 "movdqa %%xmm0,0x10(%2) \n"
2174 "lea 0x20(%2),%2 \n"
2175 "jg 1b \n"
2176 "9: \n"
2177 : "+r"(src_argb0), // %0
2178 "+r"(src_argb1), // %1
2179 "+r"(dst_argb), // %2
2180 "+r"(width) // %3
2181 :
2182 : "memory", "cc"
2183#if defined(__SSE2__)
2184 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2185#endif
2186 );
2187}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002188#endif // HAS_ARGBBLENDROW_SSE2
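// Illustrative scalar sketch (not part of the SIMD path or the library API):
// the blend kernels above compute, per B/G/R channel,
//   dst = saturate(src0 + src1 * (256 - alpha0) / 256)
// with the destination alpha forced to 255, which is the usual "over"
// composite when src_argb0 already carries premultiplied (attenuated) color.
static void ARGBBlendPixel_Sketch(const uint8* src0, const uint8* src1,
                                  uint8* dst) {
  int a = src0[3];
  for (int c = 0; c < 3; ++c) {  // B, G, R
    int blended = src0[c] + ((src1[c] * (256 - a)) >> 8);
    dst[c] = blended > 255 ? 255 : static_cast<uint8>(blended);
  }
  dst[3] = 255;  // opaque result
}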
fbarchard@google.comc757f302012-04-03 00:49:16 +00002189
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002190#ifdef HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002191// Blend 1 pixel at a time, unaligned
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002192void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com96af8702012-04-06 18:22:27 +00002193 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002194 asm volatile (
2195 "pcmpeqb %%xmm7,%%xmm7 \n"
2196 "psrlw $0xf,%%xmm7 \n"
2197 "pcmpeqb %%xmm6,%%xmm6 \n"
2198 "psrlw $0x8,%%xmm6 \n"
2199 "pcmpeqb %%xmm5,%%xmm5 \n"
2200 "psllw $0x8,%%xmm5 \n"
2201 "pcmpeqb %%xmm4,%%xmm4 \n"
2202 "pslld $0x18,%%xmm4 \n"
2203
2204 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002205 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002206 "1: \n"
2207 "movd (%0),%%xmm3 \n"
2208 "lea 0x4(%0),%0 \n"
2209 "movdqa %%xmm3,%%xmm0 \n"
2210 "pxor %%xmm4,%%xmm3 \n"
2211 "movd (%1),%%xmm2 \n"
2212 "psrlw $0x8,%%xmm3 \n"
2213 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2214 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2215 "pand %%xmm6,%%xmm2 \n"
2216 "paddw %%xmm7,%%xmm3 \n"
2217 "pmullw %%xmm3,%%xmm2 \n"
2218 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002219 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002220 "psrlw $0x8,%%xmm1 \n"
2221 "por %%xmm4,%%xmm0 \n"
2222 "pmullw %%xmm3,%%xmm1 \n"
2223 "psrlw $0x8,%%xmm2 \n"
2224 "paddusb %%xmm2,%%xmm0 \n"
2225 "pand %%xmm5,%%xmm1 \n"
2226 "paddusb %%xmm1,%%xmm0 \n"
2227 "sub $0x1,%3 \n"
2228 "movd %%xmm0,(%2) \n"
2229 "lea 0x4(%2),%2 \n"
2230 "jg 1b \n"
2231 : "+r"(src_argb0), // %0
2232 "+r"(src_argb1), // %1
2233 "+r"(dst_argb), // %2
2234 "+r"(width) // %3
2235 :
2236 : "memory", "cc"
2237#if defined(__SSE2__)
2238 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2239#endif
2240 );
2241}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002242#endif // HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002243
fbarchard@google.com96af8702012-04-06 18:22:27 +00002244#ifdef HAS_ARGBBLENDROW_SSSE3
 2245// Shuffle table for duplicating each pixel's alpha into 16-bit lanes.
2246CONST uvec8 kShuffleAlpha = {
2247 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2248 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2249};
2250void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002251 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002252 asm volatile (
2253 "pcmpeqb %%xmm7,%%xmm7 \n"
2254 "psrlw $0xf,%%xmm7 \n"
2255 "pcmpeqb %%xmm6,%%xmm6 \n"
2256 "psrlw $0x8,%%xmm6 \n"
2257 "pcmpeqb %%xmm5,%%xmm5 \n"
2258 "psllw $0x8,%%xmm5 \n"
2259 "pcmpeqb %%xmm4,%%xmm4 \n"
2260 "pslld $0x18,%%xmm4 \n"
2261
2262 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002263 ".p2align 4 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002264 "1: \n"
2265 "movdqu (%0),%%xmm3 \n"
2266 "movdqa %%xmm3,%%xmm0 \n"
2267 "pxor %%xmm4,%%xmm3 \n"
2268 "pshufb %4,%%xmm3 \n"
2269 "movdqu (%1),%%xmm2 \n"
2270 "pand %%xmm6,%%xmm2 \n"
2271 "paddw %%xmm7,%%xmm3 \n"
2272 "pmullw %%xmm3,%%xmm2 \n"
2273 "movdqu (%1),%%xmm1 \n"
2274 "psrlw $0x8,%%xmm1 \n"
2275 "por %%xmm4,%%xmm0 \n"
2276 "pmullw %%xmm3,%%xmm1 \n"
2277 "movdqu 0x10(%0),%%xmm3 \n"
2278 "lea 0x20(%0),%0 \n"
2279 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002280 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002281 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002282 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002283 "sub $0x4,%3 \n"
2284 "movdqa %%xmm0,(%2) \n"
2285 "jle 9f \n"
2286 "movdqa %%xmm3,%%xmm0 \n"
2287 "pxor %%xmm4,%%xmm3 \n"
2288 "movdqu 0x10(%1),%%xmm2 \n"
2289 "pshufb %4,%%xmm3 \n"
2290 "pand %%xmm6,%%xmm2 \n"
2291 "paddw %%xmm7,%%xmm3 \n"
2292 "pmullw %%xmm3,%%xmm2 \n"
2293 "movdqu 0x10(%1),%%xmm1 \n"
2294 "lea 0x20(%1),%1 \n"
2295 "psrlw $0x8,%%xmm1 \n"
2296 "por %%xmm4,%%xmm0 \n"
2297 "pmullw %%xmm3,%%xmm1 \n"
2298 "psrlw $0x8,%%xmm2 \n"
2299 "paddusb %%xmm2,%%xmm0 \n"
2300 "pand %%xmm5,%%xmm1 \n"
2301 "paddusb %%xmm1,%%xmm0 \n"
2302 "sub $0x4,%3 \n"
2303 "movdqa %%xmm0,0x10(%2) \n"
2304 "lea 0x20(%2),%2 \n"
2305 "jg 1b \n"
2306 "9: \n"
2307 : "+r"(src_argb0), // %0
2308 "+r"(src_argb1), // %1
2309 "+r"(dst_argb), // %2
2310 "+r"(width) // %3
2311 : "m"(kShuffleAlpha) // %4
2312 : "memory", "cc"
2313#if defined(__SSE2__)
2314 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2315#endif
2316 );
2317}
2318#endif // HAS_ARGBBLENDROW_SSSE3
2319
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002320
2321#ifdef HAS_ARGBBLENDROW1_SSSE3
2322// Blend 1 pixel at a time, unaligned
2323void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2324 uint8* dst_argb, int width) {
2325 asm volatile (
2326 "pcmpeqb %%xmm7,%%xmm7 \n"
2327 "psrlw $0xf,%%xmm7 \n"
2328 "pcmpeqb %%xmm6,%%xmm6 \n"
2329 "psrlw $0x8,%%xmm6 \n"
2330 "pcmpeqb %%xmm5,%%xmm5 \n"
2331 "psllw $0x8,%%xmm5 \n"
2332 "pcmpeqb %%xmm4,%%xmm4 \n"
2333 "pslld $0x18,%%xmm4 \n"
2334
2335 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002336 ".p2align 4 \n"
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002337 "1: \n"
2338 "movd (%0),%%xmm3 \n"
2339 "lea 0x4(%0),%0 \n"
2340 "movdqa %%xmm3,%%xmm0 \n"
2341 "pxor %%xmm4,%%xmm3 \n"
2342 "movd (%1),%%xmm2 \n"
2343 "pshufb %4,%%xmm3 \n"
2344 "pand %%xmm6,%%xmm2 \n"
2345 "paddw %%xmm7,%%xmm3 \n"
2346 "pmullw %%xmm3,%%xmm2 \n"
2347 "movd (%1),%%xmm1 \n"
2348 "lea 0x4(%1),%1 \n"
2349 "psrlw $0x8,%%xmm1 \n"
2350 "por %%xmm4,%%xmm0 \n"
2351 "pmullw %%xmm3,%%xmm1 \n"
2352 "psrlw $0x8,%%xmm2 \n"
2353 "paddusb %%xmm2,%%xmm0 \n"
2354 "pand %%xmm5,%%xmm1 \n"
2355 "paddusb %%xmm1,%%xmm0 \n"
2356 "sub $0x1,%3 \n"
2357 "movd %%xmm0,(%2) \n"
2358 "lea 0x4(%2),%2 \n"
2359 "jg 1b \n"
2360 : "+r"(src_argb0), // %0
2361 "+r"(src_argb1), // %1
2362 "+r"(dst_argb), // %2
2363 "+r"(width) // %3
2364 : "m"(kShuffleAlpha) // %4
2365 : "memory", "cc"
2366#if defined(__SSE2__)
2367 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2368#endif
2369 );
2370}
2371#endif // HAS_ARGBBLENDROW1_SSSE3
2372
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002373#ifdef HAS_ARGBATTENUATE_SSE2
2374// Attenuate 4 pixels at a time.
 2375// src_argb and dst_argb must be aligned to 16 bytes.
2376void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2377 asm volatile (
2378 "sub %0,%1 \n"
2379 "pcmpeqb %%xmm4,%%xmm4 \n"
2380 "pslld $0x18,%%xmm4 \n"
2381 "pcmpeqb %%xmm5,%%xmm5 \n"
2382 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002383
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002384 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002385 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002386 "1: \n"
2387 "movdqa (%0),%%xmm0 \n"
2388 "punpcklbw %%xmm0,%%xmm0 \n"
2389 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2390 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2391 "pmulhuw %%xmm2,%%xmm0 \n"
2392 "movdqa (%0),%%xmm1 \n"
2393 "punpckhbw %%xmm1,%%xmm1 \n"
2394 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2395 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2396 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002397 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002398 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002399 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002400 "psrlw $0x8,%%xmm1 \n"
2401 "packuswb %%xmm1,%%xmm0 \n"
2402 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002403 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002404 "sub $0x4,%2 \n"
2405 "movdqa %%xmm0,(%0,%1,1) \n"
2406 "lea 0x10(%0),%0 \n"
2407 "jg 1b \n"
2408 : "+r"(src_argb), // %0
2409 "+r"(dst_argb), // %1
2410 "+r"(width) // %2
2411 :
2412 : "memory", "cc"
2413#if defined(__SSE2__)
2414 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2415#endif
2416 );
2417}
2418#endif // HAS_ARGBATTENUATE_SSE2
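// Illustrative scalar sketch (not part of the SIMD path or the library API):
// attenuation premultiplies each color channel by the pixel's alpha and leaves
// alpha itself unchanged; the SIMD above approximates dst = src * alpha / 255
// in 16-bit fixed point (pmulhuw on byte-duplicated values).
static void ARGBAttenuatePixel_Sketch(const uint8* src, uint8* dst) {
  int a = src[3];
  dst[0] = static_cast<uint8>(src[0] * a / 255);  // B
  dst[1] = static_cast<uint8>(src[1] * a / 255);  // G
  dst[2] = static_cast<uint8>(src[2] * a / 255);  // R
  dst[3] = static_cast<uint8>(a);                 // A preserved
}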
2419
fbarchard@google.com810cd912012-04-20 20:15:27 +00002420#ifdef HAS_ARGBATTENUATE_SSSE3
2421// Shuffle table duplicating alpha
2422CONST uvec8 kShuffleAlpha0 = {
2423 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2424};
2425CONST uvec8 kShuffleAlpha1 = {
2426 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2427 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2428};
2429// Attenuate 4 pixels at a time.
 2430// src_argb and dst_argb must be aligned to 16 bytes.
2431void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2432 asm volatile (
2433 "sub %0,%1 \n"
2434 "pcmpeqb %%xmm3,%%xmm3 \n"
2435 "pslld $0x18,%%xmm3 \n"
2436 "movdqa %3,%%xmm4 \n"
2437 "movdqa %4,%%xmm5 \n"
2438
2439 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002440 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002441 "1: \n"
2442 "movdqa (%0),%%xmm0 \n"
2443 "pshufb %%xmm4,%%xmm0 \n"
2444 "movdqa (%0),%%xmm1 \n"
2445 "punpcklbw %%xmm1,%%xmm1 \n"
2446 "pmulhuw %%xmm1,%%xmm0 \n"
2447 "movdqa (%0),%%xmm1 \n"
2448 "pshufb %%xmm5,%%xmm1 \n"
2449 "movdqa (%0),%%xmm2 \n"
2450 "punpckhbw %%xmm2,%%xmm2 \n"
2451 "pmulhuw %%xmm2,%%xmm1 \n"
2452 "movdqa (%0),%%xmm2 \n"
2453 "pand %%xmm3,%%xmm2 \n"
2454 "psrlw $0x8,%%xmm0 \n"
2455 "psrlw $0x8,%%xmm1 \n"
2456 "packuswb %%xmm1,%%xmm0 \n"
2457 "por %%xmm2,%%xmm0 \n"
2458 "sub $0x4,%2 \n"
2459 "movdqa %%xmm0,(%0,%1,1) \n"
2460 "lea 0x10(%0),%0 \n"
2461 "jg 1b \n"
2462 : "+r"(src_argb), // %0
2463 "+r"(dst_argb), // %1
2464 "+r"(width) // %2
2465 : "m"(kShuffleAlpha0), // %3
2466 "m"(kShuffleAlpha1) // %4
2467 : "memory", "cc"
2468#if defined(__SSE2__)
2469 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2470#endif
2471 );
2472}
2473#endif // HAS_ARGBATTENUATE_SSSE3
2474
2475#ifdef HAS_ARGBUNATTENUATE_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00002476// Unattenuate 4 pixels at a time.
 2477// src_argb and dst_argb must be aligned to 16 bytes.
2478void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2479 int width) {
2480 uintptr_t alpha = 0;
2481 asm volatile (
2482 "sub %0,%1 \n"
2483 "pcmpeqb %%xmm4,%%xmm4 \n"
2484 "pslld $0x18,%%xmm4 \n"
2485
2486 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002487 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002488 "1: \n"
2489 "movdqa (%0),%%xmm0 \n"
2490 "movzb 0x3(%0),%3 \n"
2491 "punpcklbw %%xmm0,%%xmm0 \n"
2492 "movd 0x0(%4,%3,4),%%xmm2 \n"
2493 "movzb 0x7(%0),%3 \n"
2494 "movd 0x0(%4,%3,4),%%xmm3 \n"
2495 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2496 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2497 "movlhps %%xmm3,%%xmm2 \n"
2498 "pmulhuw %%xmm2,%%xmm0 \n"
2499 "movdqa (%0),%%xmm1 \n"
2500 "movzb 0xb(%0),%3 \n"
2501 "punpckhbw %%xmm1,%%xmm1 \n"
2502 "movd 0x0(%4,%3,4),%%xmm2 \n"
2503 "movzb 0xf(%0),%3 \n"
2504 "movd 0x0(%4,%3,4),%%xmm3 \n"
2505 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2506 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2507 "movlhps %%xmm3,%%xmm2 \n"
2508 "pmulhuw %%xmm2,%%xmm1 \n"
2509 "movdqa (%0),%%xmm2 \n"
2510 "pand %%xmm4,%%xmm2 \n"
2511 "packuswb %%xmm1,%%xmm0 \n"
2512 "por %%xmm2,%%xmm0 \n"
2513 "sub $0x4,%2 \n"
2514 "movdqa %%xmm0,(%0,%1,1) \n"
2515 "lea 0x10(%0),%0 \n"
2516 "jg 1b \n"
2517 : "+r"(src_argb), // %0
2518 "+r"(dst_argb), // %1
2519 "+r"(width), // %2
2520 "+r"(alpha) // %3
2521 : "r"(fixed_invtbl8) // %4
2522 : "memory", "cc"
2523#if defined(__SSE2__)
2524 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2525#endif
2526 );
2527}
2528#endif // HAS_ARGBUNATTENUATE_SSE2
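// Illustrative scalar sketch (not part of the SIMD path or the library API):
// unattenuation undoes the premultiply by scaling each color channel back up
// by 255/alpha. The SIMD above looks a fixed-point reciprocal up in
// fixed_invtbl8 (defined elsewhere) instead of dividing, so rounding differs
// slightly from this reference.
static void ARGBUnattenuatePixel_Sketch(const uint8* src, uint8* dst) {
  int a = src[3];
  for (int c = 0; c < 3; ++c) {  // B, G, R
    int v = a ? (src[c] * 255 + a / 2) / a : src[c];
    dst[c] = v > 255 ? 255 : static_cast<uint8>(v);
  }
  dst[3] = static_cast<uint8>(a);
}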
2529
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002530#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00002531// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
2532CONST vec8 kARGBToGray = {
2533 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
2534};
2535
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002536// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2537void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) {
2538 asm volatile (
2539 "movdqa %2,%%xmm4 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002540 // 8 pixel loop
2541 ".p2align 4 \n"
2542 "1: \n"
2543 "movdqa (%0),%%xmm0 \n"
2544 "movdqa 0x10(%0),%%xmm1 \n"
2545 "pmaddubsw %%xmm4,%%xmm0 \n"
2546 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002547 "phaddw %%xmm1,%%xmm0 \n"
2548 "psrlw $0x7,%%xmm0 \n"
2549 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002550 "movdqa (%0),%%xmm2 \n"
2551 "movdqa 0x10(%0),%%xmm3 \n"
2552 "psrld $0x18,%%xmm2 \n"
2553 "psrld $0x18,%%xmm3 \n"
2554 "packuswb %%xmm3,%%xmm2 \n"
2555 "packuswb %%xmm2,%%xmm2 \n"
2556 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002557 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002558 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002559 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002560 "punpcklwd %%xmm3,%%xmm0 \n"
2561 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002562 "sub $0x8,%1 \n"
2563 "movdqa %%xmm0,(%0) \n"
2564 "movdqa %%xmm1,0x10(%0) \n"
2565 "lea 0x20(%0),%0 \n"
2566 "jg 1b \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00002567 : "+r"(dst_argb), // %0
2568 "+r"(width) // %1
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002569 : "m"(kARGBToGray) // %2
2570 : "memory", "cc"
2571#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00002572 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00002573#endif
2574 );
2575}
2576#endif // HAS_ARGBGRAYROW_SSSE3
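// Illustrative scalar sketch (not part of the SIMD path or the library API):
// the gray kernel replaces B, G and R with the same weighted luminance while
// preserving alpha. The kARGBToGray weights are 7-bit fixed point (they sum
// to 128), so 14/76/38 correspond to roughly 0.11/0.59/0.30.
static void ARGBGrayPixel_Sketch(uint8* argb) {
  int gray = (argb[0] * 14 + argb[1] * 76 + argb[2] * 38) >> 7;
  argb[0] = argb[1] = argb[2] = static_cast<uint8>(gray);  // alpha untouched
}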
fbarchard@google.com221e6022012-05-21 22:24:41 +00002577
2578#ifdef HAS_ARGBSEPIAROW_SSSE3
2579// b = (r * 35 + g * 68 + b * 17) >> 7
2580// g = (r * 45 + g * 88 + b * 22) >> 7
2581// r = (r * 50 + g * 98 + b * 24) >> 7
2582// Constant for ARGB color to sepia tone
2583CONST vec8 kARGBToSepiaB = {
2584 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
2585};
2586
2587CONST vec8 kARGBToSepiaG = {
2588 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
2589};
2590
2591CONST vec8 kARGBToSepiaR = {
2592 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
2593};
2594
 2595// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels
2596void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
2597 asm volatile (
2598 "movdqa %2,%%xmm2 \n"
2599 "movdqa %3,%%xmm3 \n"
2600 "movdqa %4,%%xmm4 \n"
 2601 // 8 pixel loop
2602 ".p2align 4 \n"
2603 "1: \n"
2604 "movdqa (%0),%%xmm0 \n"
2605 "movdqa 0x10(%0),%%xmm6 \n"
2606 "pmaddubsw %%xmm2,%%xmm0 \n"
2607 "pmaddubsw %%xmm2,%%xmm6 \n"
2608 "phaddw %%xmm6,%%xmm0 \n"
2609 "psrlw $0x7,%%xmm0 \n"
2610 "packuswb %%xmm0,%%xmm0 \n"
2611 "movdqa (%0),%%xmm5 \n"
2612 "movdqa 0x10(%0),%%xmm1 \n"
2613 "pmaddubsw %%xmm3,%%xmm5 \n"
2614 "pmaddubsw %%xmm3,%%xmm1 \n"
2615 "phaddw %%xmm1,%%xmm5 \n"
2616 "psrlw $0x7,%%xmm5 \n"
2617 "packuswb %%xmm5,%%xmm5 \n"
2618 "punpcklbw %%xmm5,%%xmm0 \n"
2619 "movdqa (%0),%%xmm5 \n"
2620 "movdqa 0x10(%0),%%xmm1 \n"
2621 "pmaddubsw %%xmm4,%%xmm5 \n"
2622 "pmaddubsw %%xmm4,%%xmm1 \n"
2623 "phaddw %%xmm1,%%xmm5 \n"
2624 "psrlw $0x7,%%xmm5 \n"
2625 "packuswb %%xmm5,%%xmm5 \n"
2626 "movdqa (%0),%%xmm6 \n"
2627 "movdqa 0x10(%0),%%xmm1 \n"
2628 "psrld $0x18,%%xmm6 \n"
2629 "psrld $0x18,%%xmm1 \n"
2630 "packuswb %%xmm1,%%xmm6 \n"
2631 "packuswb %%xmm6,%%xmm6 \n"
2632 "punpcklbw %%xmm6,%%xmm5 \n"
2633 "movdqa %%xmm0,%%xmm1 \n"
2634 "punpcklwd %%xmm5,%%xmm0 \n"
2635 "punpckhwd %%xmm5,%%xmm1 \n"
2636 "sub $0x8,%1 \n"
2637 "movdqa %%xmm0,(%0) \n"
2638 "movdqa %%xmm1,0x10(%0) \n"
2639 "lea 0x20(%0),%0 \n"
2640 "jg 1b \n"
2641 : "+r"(dst_argb), // %0
2642 "+r"(width) // %1
2643 : "m"(kARGBToSepiaB), // %2
2644 "m"(kARGBToSepiaG), // %3
2645 "m"(kARGBToSepiaR) // %4
2646 : "memory", "cc"
2647#if defined(__SSE2__)
2648 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2649#endif
2650 );
2651}
2652#endif // HAS_ARGBSEPIAROW_SSSE3
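// Illustrative scalar sketch (not part of the SIMD path or the library API):
// sepia applies the three weighted sums listed above to every pixel in place,
// saturating each result to 255 and preserving alpha, just as the packuswb
// steps do in the assembly.
static void ARGBSepiaPixel_Sketch(uint8* argb) {
  int b = argb[0], g = argb[1], r = argb[2];
  int b2 = (r * 35 + g * 68 + b * 17) >> 7;
  int g2 = (r * 45 + g * 88 + b * 22) >> 7;
  int r2 = (r * 50 + g * 98 + b * 24) >> 7;
  argb[0] = b2 > 255 ? 255 : static_cast<uint8>(b2);  // new B
  argb[1] = g2 > 255 ? 255 : static_cast<uint8>(g2);  // new G
  argb[2] = r2 > 255 ? 255 : static_cast<uint8>(r2);  // new R, alpha untouched
}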
2653
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002654#endif // defined(__x86_64__) || defined(__i386__)
2655
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002656#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002657} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002658} // namespace libyuv
2659#endif