/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "source/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC 4.2 on OSX has a link error when passing static or const to inline.
// TODO(fbarchard): Use static const when GCC 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Constants for BGRA
CONST vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

CONST vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

CONST vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
CONST vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

CONST vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

CONST vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
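
// The row functions below apply these vectors with pmaddubsw, a shift and
// the bias vectors above. As an illustrative scalar sketch only (these
// helpers are assumptions for exposition, not functions this file defines),
// the per-pixel math for ARGB (bytes in B, G, R, A order) is roughly:
static inline uint8 ARGBPixelToY_Sketch(uint8 b, uint8 g, uint8 r) {
  // kARGBToY = {13, 65, 33, 0}, shifted right by 7, biased by kAddY16.
  return static_cast<uint8>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}

static inline uint8 ARGBPixelToU_Sketch(uint8 b, uint8 g, uint8 r) {
  // kARGBToU = {112, -74, -38, 0}, shifted right by 8, biased by kAddUV128.
  // The UV rows first average a 2x2 block of pixels before applying this.
  return static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}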

// Shuffle table for converting RGB24 to ARGB.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ARGB to RGB24.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
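
// The tables above feed pshufb: each output byte copies the input byte whose
// index appears in the mask, and any index with the high bit set (128u)
// produces zero. A scalar model of that behavior, as an illustrative sketch
// only (not a function this file defines):
static inline void PShufB128_Sketch(const uint8 shuffle[16], const uint8 in[16],
                                    uint8 out[16]) {
  for (int i = 0; i < 16; ++i) {
    out[i] = (shuffle[i] & 0x80) ? 0u : in[shuffle[i] & 0x0f];
  }
}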

void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    ".p2align 4 \n"
  "1: \n"
    "movq (%0),%%xmm0 \n"
    "lea 0x8(%0),%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "lea 0x20(%1),%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
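
// Scalar sketch of the row above (illustrative only, not part of this file's
// API): the grey value is replicated into B, G and R and alpha is forced to
// 0xff, which is what the punpck/por sequence does 8 pixels at a time.
static inline void I400ToARGBRow_C_Sketch(const uint8* src_y, uint8* dst_argb,
                                          int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[0] = dst_argb[1] = dst_argb[2] = src_y[i];
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}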

void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"

  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm3 \n"
    "lea 0x30(%0),%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20082008,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
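
// Scalar sketch of one RGB565 pixel expansion (illustrative only): each
// channel widens to 8 bits by replicating its top bits into the low bits,
// which the multiply constants above (0x0108, 0x2008) achieve 8 pixels at a
// time.
static inline void RGB565PixelToARGB_Sketch(uint16 rgb565, uint8 argb[4]) {
  int b = rgb565 & 0x1f;
  int g = (rgb565 >> 5) & 0x3f;
  int r = (rgb565 >> 11) & 0x1f;
  argb[0] = static_cast<uint8>((b << 3) | (b >> 2));
  argb[1] = static_cast<uint8>((g << 2) | (g >> 4));
  argb[2] = static_cast<uint8>((r << 3) | (r >> 2));
  argb[3] = 255u;
}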

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,(%1,%0,2) \n"
    "movdqa %%xmm2,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%1,%0,2) \n"
    "movdqa %%xmm1,0x10(%1,%0,2) \n"
    "lea 0x10(%0),%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqa %%xmm0,(%1) \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm1,0x10(%1) \n"
    "movdqa %%xmm2,0x20(%1) \n"
    "lea 0x30(%1),%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
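
// Scalar sketch of the packing above (illustrative only): keep the top 5, 6
// and 5 bits of B, G and R and drop alpha.
static inline uint16 ARGBPixelToRGB565_Sketch(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint16>((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}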

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// TODO(fbarchard): Pass xmm constants to a single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers: "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if the stack frame is disabled. Using 2 assembly blocks is a
// workaround, but is considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
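
// Scalar sketch of the row above (illustrative only): one U and one V value
// are produced per 2x2 block of ARGB pixels - the second row is
// src_stride_argb bytes away - by averaging the block and applying the
// kARGBToU / kARGBToV weights with an 8-bit shift and a 128 bias.
static inline void ARGBBlockToUV_Sketch(const uint8* p, int stride,
                                        uint8* u, uint8* v) {
  // B, G, R are bytes 0, 1, 2 of each 4-byte pixel.
  int b = (p[0] + p[4] + p[stride + 0] + p[stride + 4] + 2) >> 2;
  int g = (p[1] + p[5] + p[stride + 1] + p[stride + 5] + 2) >> 2;
  int r = (p[2] + p[6] + p[stride + 2] + p[stride + 6] + 2) >> 2;
  *u = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = static_cast<uint8>(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}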

void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),  // %0
    "m"(kBGRAToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm6 \n"
    "pavgb (%0,%4,1),%%xmm0 \n"
    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),  // %0
    "m"(kABGRToV),  // %1
    "m"(kAddUV128)  // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
    "sub %1,%2 \n"
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "movdqu 0x20(%0),%%xmm2 \n"
    "movdqu 0x30(%0),%%xmm6 \n"
    "movdqu (%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "lea 0x40(%0),%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0,(%1) \n"
    "movhps %%xmm0,(%1,%2,1) \n"
    "lea 0x8(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I420TOARGBROW_SSSE3
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif

struct {
  vec8 kUVToB;
  vec8 kUVToG;
  vec8 kUVToR;
  vec16 kUVBiasB;
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;
  vec16 kYToRgb;
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};
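
// Scalar sketch of the fixed-point math encoded in kYuvConstants
// (illustrative only; these helpers are not part of this file): luma is
// scaled as (Y - 16) * YG, chroma is centered on 128 (subtracting the
// kUVBias rows is equivalent), and each channel is shifted right by 6 and
// saturated to 0..255, mirroring the psraw/packuswb steps below.
static inline uint8 Clamp255_Sketch(int v) {
  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static inline void YuvPixelToBGR_Sketch(uint8 y, uint8 u, uint8 v,
                                        uint8* b, uint8* g, uint8* r) {
  int y1 = (static_cast<int>(y) - 16) * YG;
  int u1 = static_cast<int>(u) - 128;
  int v1 = static_cast<int>(v) - 128;
  *b = Clamp255_Sketch((UB * u1 + VB * v1 + y1) >> 6);
  *g = Clamp255_Sketch((UG * u1 + VG * v1 + y1) >> 6);
  *r = Clamp255_Sketch((UR * u1 + VR * v1 + y1) >> 6);
}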

// Convert 8 pixels
#define YUVTORGB \
  "movd (%1),%%xmm0 \n" \
  "movd (%1,%2,1),%%xmm1 \n" \
  "lea 0x4(%1),%1 \n" \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklwd %%xmm0,%%xmm0 \n" \
  "movdqa %%xmm0,%%xmm1 \n" \
  "movdqa %%xmm0,%%xmm2 \n" \
  "pmaddubsw (%5),%%xmm0 \n" \
  "pmaddubsw 16(%5),%%xmm1 \n" \
  "pmaddubsw 32(%5),%%xmm2 \n" \
  "psubw 48(%5),%%xmm0 \n" \
  "psubw 64(%5),%%xmm1 \n" \
  "psubw 80(%5),%%xmm2 \n" \
  "movq (%0),%%xmm3 \n" \
  "lea 0x8(%0),%0 \n" \
  "punpcklbw %%xmm4,%%xmm3 \n" \
  "psubsw 96(%5),%%xmm3 \n" \
  "pmullw 112(%5),%%xmm3 \n" \
  "paddsw %%xmm3,%%xmm0 \n" \
  "paddsw %%xmm3,%%xmm1 \n" \
  "paddsw %%xmm3,%%xmm2 \n" \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"

void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5,(%3) \n"
    "movdqa %%xmm0,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,(%3) \n"
    "movdqa %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%3) \n"
    "movdqu %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqu %%xmm5,(%3) \n"
    "movdqu %%xmm0,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* rgb_buf,
                                          int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqu %%xmm2,(%3) \n"
    "movdqu %%xmm1,0x10(%3) \n"
    "lea 0x20(%3),%3 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* rgb_buf,
                                int width) {
  asm volatile (
    "sub %1,%2 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    "movd (%1),%%xmm0 \n"
    "movd (%1,%2,1),%%xmm1 \n"
    "lea 0x4(%1),%1 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pmaddubsw (%5),%%xmm0 \n"
    "pmaddubsw 16(%5),%%xmm1 \n"
    "pmaddubsw 32(%5),%%xmm2 \n"
    "psubw 48(%5),%%xmm0 \n"
    "psubw 64(%5),%%xmm1 \n"
    "psubw 80(%5),%%xmm2 \n"
    "movd (%0),%%xmm3 \n"
    "lea 0x4(%0),%0 \n"
    "punpcklbw %%xmm4,%%xmm3 \n"
    "psubsw 96(%5),%%xmm3 \n"
    "pmullw 112(%5),%%xmm3 \n"
    "paddsw %%xmm3,%%xmm0 \n"
    "paddsw %%xmm3,%%xmm1 \n"
    "paddsw %%xmm3,%%xmm2 \n"
    "psraw $0x6,%%xmm0 \n"
    "psraw $0x6,%%xmm1 \n"
    "psraw $0x6,%%xmm2 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "sub $0x4,%4 \n"
    "movdqa %%xmm0,(%3) \n"
    "lea 0x10(%3),%3 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(&kYuvConstants.kUVToB)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif

#ifdef HAS_YTOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "mov $0x10001000,%%eax \n"
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "mov $0x012a012a,%%eax \n"
    "movd %%eax,%%xmm2 \n"
1596 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001597 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001598 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001599 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001600 "movq (%0),%%xmm0 \n"
1601 "lea 0x8(%0),%0 \n"
1602 "punpcklbw %%xmm0,%%xmm0 \n"
1603 "psubusw %%xmm3,%%xmm0 \n"
1604 "pmulhuw %%xmm2,%%xmm0 \n"
1605 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001606
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001607 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001608 "punpcklbw %%xmm0,%%xmm0 \n"
1609 "movdqa %%xmm0,%%xmm1 \n"
1610 "punpcklwd %%xmm0,%%xmm0 \n"
1611 "punpckhwd %%xmm1,%%xmm1 \n"
1612 "por %%xmm4,%%xmm0 \n"
1613 "por %%xmm4,%%xmm1 \n"
1614 "movdqa %%xmm0,(%1) \n"
1615 "movdqa %%xmm1,16(%1) \n"
1616 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001617
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001618 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001619 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00001620 : "+r"(y_buf), // %0
1621 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00001622 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001623 :
1624 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001625#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001626 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001627#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001628 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001629}
fbarchard@google.comb6149762011-11-07 21:58:52 +00001630#endif
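
// The two steps above amount to the following per-pixel computation. This is
// an illustrative scalar sketch only; the name is hypothetical and not part
// of the original file or the libyuv API.
static void YToARGBRow_Sketch_C(const uint8* y_buf, uint8* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    // (y - 16) * 1.164 in fixed point, mirroring punpcklbw/psubusw/pmulhuw.
    uint32 y16 = y_buf[x] * 257u;                  // byte duplicated into a word
    y16 = (y16 > 0x1000u) ? (y16 - 0x1000u) : 0u;  // saturating subtract of 16
    uint32 g = (y16 * 0x012au) >> 16;              // scale by ~1.164
    if (g > 255u) {
      g = 255u;                                    // packuswb saturation
    }
    rgb_buf[x * 4 + 0] = static_cast<uint8>(g);    // B
    rgb_buf[x * 4 + 1] = static_cast<uint8>(g);    // G
    rgb_buf[x * 4 + 2] = static_cast<uint8>(g);    // R
    rgb_buf[x * 4 + 3] = 255;                      // A forced opaque (por xmm4)
  }
}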
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001631
fbarchard@google.com42831e02012-01-21 02:54:17 +00001632#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001633// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001634CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001635 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1636};
1637
fbarchard@google.com42831e02012-01-21 02:54:17 +00001638void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001639 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001640 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001641 "movdqa %3,%%xmm5 \n"
1642 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001643 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001644 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001645 "movdqa (%0,%2),%%xmm0 \n"
1646 "pshufb %%xmm5,%%xmm0 \n"
1647 "sub $0x10,%2 \n"
1648 "movdqa %%xmm0,(%1) \n"
1649 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001650 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00001651 : "+r"(src), // %0
1652 "+r"(dst), // %1
1653 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001654 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001655 : "memory", "cc"
1656#if defined(__SSE2__)
1657 , "xmm0", "xmm5"
1658#endif
1659 );
1660}
1661#endif
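
// Scalar sketch of the row mirroring done with pshufb above: the row is simply
// reversed byte by byte (illustrative only; the name is hypothetical).
static void MirrorRow_Sketch_C(const uint8* src, uint8* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}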
1662
fbarchard@google.com42831e02012-01-21 02:54:17 +00001663#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001664void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001665 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001666 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001667 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001668 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001669 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001670 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001671 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001672 "psllw $0x8,%%xmm0 \n"
1673 "psrlw $0x8,%%xmm1 \n"
1674 "por %%xmm1,%%xmm0 \n"
1675 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
1676 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
1677 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
1678 "sub $0x10,%2 \n"
1679 "movdqu %%xmm0,(%1) \n"
1680 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001681 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001682 : "+r"(src), // %0
1683 "+r"(dst), // %1
1684 "+r"(temp_width) // %2
1685 :
1686 : "memory", "cc"
1687#if defined(__SSE2__)
1688 , "xmm0", "xmm1"
1689#endif
1690 );
1691}
1692#endif
1693
fbarchard@google.com16a96642012-03-02 22:38:09 +00001694#ifdef HAS_MIRRORROW_UV_SSSE3
1695// Shuffle table for de-interleaving and reversing the U and V bytes of a UV row.
1696CONST uvec8 kShuffleMirrorUV = {
1697 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
1698};
1699void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
1700 int width) {
1701 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001702 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00001703 "movdqa %4,%%xmm1 \n"
1704 "lea -16(%0,%3,2),%0 \n"
1705 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001706 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001707 "1: \n"
1708 "movdqa (%0),%%xmm0 \n"
1709 "lea -16(%0),%0 \n"
1710 "pshufb %%xmm1,%%xmm0 \n"
1711 "sub $8,%3 \n"
1712 "movlpd %%xmm0,(%1) \n"
1713 "movhpd %%xmm0,(%1,%2) \n"
1714 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001715 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001716 : "+r"(src), // %0
1717 "+r"(dst_u), // %1
1718 "+r"(dst_v), // %2
1719 "+r"(temp_width) // %3
1720 : "m"(kShuffleMirrorUV) // %4
1721 : "memory", "cc"
1722#if defined(__SSE2__)
1723 , "xmm0", "xmm1"
1724#endif
1725 );
1726}
1727#endif
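
// Scalar sketch of MirrorRowUV_SSSE3 above: the interleaved UV row is read
// backwards and split into separate U and V outputs (illustrative only; the
// name is hypothetical).
static void MirrorRowUV_Sketch_C(const uint8* src, uint8* dst_u, uint8* dst_v,
                                 int width) {
  src += (width - 1) * 2;  // start at the last UV pair
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src[0];
    dst_v[x] = src[1];
    src -= 2;
  }
}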
1728
fbarchard@google.com55663022012-04-26 00:01:41 +00001729#ifdef HAS_ADDROW_SSE2
1730// dst and width aligned to 16
1731void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
1732 asm volatile (
1733 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001734 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001735 "1: \n"
1736 "movdqu (%0),%%xmm2 \n"
1737 "lea 0x10(%0),%0 \n"
1738 "movdqa (%1),%%xmm0 \n"
1739 "movdqa 0x10(%1),%%xmm1 \n"
1740 "movdqa %%xmm2,%%xmm3 \n"
1741 "punpcklbw %%xmm4,%%xmm2 \n"
1742 "punpckhbw %%xmm4,%%xmm3 \n"
1743 "paddusw %%xmm2,%%xmm0 \n"
1744 "paddusw %%xmm3,%%xmm1 \n"
1745 "sub $0x10,%2 \n"
1746 "movdqa %%xmm0,(%1) \n"
1747 "movdqa %%xmm1,0x10(%1) \n"
1748 "lea 0x20(%1),%1 \n"
1749 "jg 1b \n"
1750 : "+r"(src), // %0
1751 "+r"(dst), // %1
1752 "+r"(width) // %2
1753 :
1754 : "memory", "cc"
1755#if defined(__SSE2__)
1756 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1757#endif
1758 );
1759}
1760
1761// src may be unaligned. dst must be 16-byte aligned and width a multiple of 16.
1762void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
1763 asm volatile (
1764 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001765 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001766 "1: \n"
1767 "movdqu (%0),%%xmm2 \n"
1768 "lea 0x10(%0),%0 \n"
1769 "movdqa (%1),%%xmm0 \n"
1770 "movdqa 0x10(%1),%%xmm1 \n"
1771 "movdqa %%xmm2,%%xmm3 \n"
1772 "punpcklbw %%xmm4,%%xmm2 \n"
1773 "punpckhbw %%xmm4,%%xmm3 \n"
1774 "psubusw %%xmm2,%%xmm0 \n"
1775 "psubusw %%xmm3,%%xmm1 \n"
1776 "sub $0x10,%2 \n"
1777 "movdqa %%xmm0,(%1) \n"
1778 "movdqa %%xmm1,0x10(%1) \n"
1779 "lea 0x20(%1),%1 \n"
1780 "jg 1b \n"
1781 : "+r"(src), // %0
1782 "+r"(dst), // %1
1783 "+r"(width) // %2
1784 :
1785 : "memory", "cc"
1786#if defined(__SSE2__)
1787 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1788#endif
1789 );
1790}
1791#endif // HAS_ADDROW_SSE2
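
// Scalar sketch of AddRow_SSE2/SubRow_SSE2 above: each 8-bit source byte is
// added to (or subtracted from) a 16-bit accumulator with unsigned saturation,
// matching paddusw/psubusw (illustrative only; the name is hypothetical).
static void AddRow_Sketch_C(const uint8* src, uint16* dst, int width) {
  for (int x = 0; x < width; ++x) {
    uint32 sum = dst[x] + src[x];
    if (sum > 65535u) {
      sum = 65535u;
    }
    dst[x] = static_cast<uint16>(sum);
  }
}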
1792
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001793#ifdef HAS_SPLITUV_SSE2
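// De-interleaves a UV row: even bytes go to dst_u, odd bytes to dst_v.
// src_uv, dst_u and dst_v must be 16-byte aligned; pix must be a multiple of 16.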
1794void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001795 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001796 "pcmpeqb %%xmm5,%%xmm5 \n"
1797 "psrlw $0x8,%%xmm5 \n"
1798 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001799 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001800 "1: \n"
1801 "movdqa (%0),%%xmm0 \n"
1802 "movdqa 0x10(%0),%%xmm1 \n"
1803 "lea 0x20(%0),%0 \n"
1804 "movdqa %%xmm0,%%xmm2 \n"
1805 "movdqa %%xmm1,%%xmm3 \n"
1806 "pand %%xmm5,%%xmm0 \n"
1807 "pand %%xmm5,%%xmm1 \n"
1808 "packuswb %%xmm1,%%xmm0 \n"
1809 "psrlw $0x8,%%xmm2 \n"
1810 "psrlw $0x8,%%xmm3 \n"
1811 "packuswb %%xmm3,%%xmm2 \n"
1812 "movdqa %%xmm0,(%1) \n"
1813 "movdqa %%xmm2,(%1,%2) \n"
1814 "lea 0x10(%1),%1 \n"
1815 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001816 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001817 : "+r"(src_uv), // %0
1818 "+r"(dst_u), // %1
1819 "+r"(dst_v), // %2
1820 "+r"(pix) // %3
1821 :
1822 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001823#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001824 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001825#endif
1826 );
1827}
1828#endif
1829
fbarchard@google.com19932f82012-02-16 22:19:14 +00001830#ifdef HAS_COPYROW_SSE2
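// Copies a row 32 bytes per iteration. src and dst must be 16-byte aligned and
// count must be a multiple of 32.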
1831void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001832 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001833 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00001834 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001835 "1: \n"
1836 "movdqa (%0),%%xmm0 \n"
1837 "movdqa 0x10(%0),%%xmm1 \n"
1838 "movdqa %%xmm0,(%0,%1) \n"
1839 "movdqa %%xmm1,0x10(%0,%1) \n"
1840 "lea 0x20(%0),%0 \n"
1841 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001842 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001843 : "+r"(src), // %0
1844 "+r"(dst), // %1
1845 "+r"(count) // %2
1846 :
1847 : "memory", "cc"
1848#if defined(__SSE2__)
1849 , "xmm0", "xmm1"
1850#endif
1851 );
1852}
1853#endif // HAS_COPYROW_SSE2
1854
1855#ifdef HAS_COPYROW_X86
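// Copies a row 4 bytes at a time with rep movsl; width must be a multiple of 4
// (any remaining bytes are not copied).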
1856void CopyRow_X86(const uint8* src, uint8* dst, int width) {
1857 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001858 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00001859 "shr $0x2,%2 \n"
1860 "rep movsl \n"
1861 : "+S"(src), // %0
1862 "+D"(dst), // %1
1863 "+c"(width_tmp) // %2
1864 :
1865 : "memory", "cc"
1866 );
1867}
1868#endif
1869
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001870#ifdef HAS_YUY2TOYROW_SSE2
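// YUY2 is packed as Y0 U0 Y1 V0. The Y row functions keep the even (Y) bytes;
// the UV row functions average two source rows and split the odd bytes into
// U and V planes. UYVY (U0 Y0 V0 Y1) is the byte-swapped variant handled by
// the UYVY functions below.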
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001871void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001872 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001873 "pcmpeqb %%xmm5,%%xmm5 \n"
1874 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001875 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001876 "1: \n"
1877 "movdqa (%0),%%xmm0 \n"
1878 "movdqa 0x10(%0),%%xmm1 \n"
1879 "lea 0x20(%0),%0 \n"
1880 "pand %%xmm5,%%xmm0 \n"
1881 "pand %%xmm5,%%xmm1 \n"
1882 "packuswb %%xmm1,%%xmm0 \n"
1883 "movdqa %%xmm0,(%1) \n"
1884 "lea 0x10(%1),%1 \n"
1885 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001886 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001887 : "+r"(src_yuy2), // %0
1888 "+r"(dst_y), // %1
1889 "+r"(pix) // %2
1890 :
1891 : "memory", "cc"
1892#if defined(__SSE2__)
1893 , "xmm0", "xmm1", "xmm5"
1894#endif
1895 );
1896}
1897
1898void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
1899                       uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001900 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001901 "pcmpeqb %%xmm5,%%xmm5 \n"
1902 "psrlw $0x8,%%xmm5 \n"
1903 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001904 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001905 "1: \n"
1906 "movdqa (%0),%%xmm0 \n"
1907 "movdqa 0x10(%0),%%xmm1 \n"
1908 "movdqa (%0,%4,1),%%xmm2 \n"
1909 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1910 "lea 0x20(%0),%0 \n"
1911 "pavgb %%xmm2,%%xmm0 \n"
1912 "pavgb %%xmm3,%%xmm1 \n"
1913 "psrlw $0x8,%%xmm0 \n"
1914 "psrlw $0x8,%%xmm1 \n"
1915 "packuswb %%xmm1,%%xmm0 \n"
1916 "movdqa %%xmm0,%%xmm1 \n"
1917 "pand %%xmm5,%%xmm0 \n"
1918 "packuswb %%xmm0,%%xmm0 \n"
1919 "psrlw $0x8,%%xmm1 \n"
1920 "packuswb %%xmm1,%%xmm1 \n"
1921 "movq %%xmm0,(%1) \n"
1922 "movq %%xmm1,(%1,%2) \n"
1923 "lea 0x8(%1),%1 \n"
1924 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001925 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001926 : "+r"(src_yuy2), // %0
1927 "+r"(dst_u), // %1
1928     "+r"(dst_v),   // %2
1929 "+r"(pix) // %3
1930 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
1931 : "memory", "cc"
1932#if defined(__SSE2__)
1933 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1934#endif
1935 );
1936}
1937
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001939void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
1940 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001941 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001942 "pcmpeqb %%xmm5,%%xmm5 \n"
1943 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001944 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001945 "1: \n"
1946 "movdqu (%0),%%xmm0 \n"
1947 "movdqu 0x10(%0),%%xmm1 \n"
1948 "lea 0x20(%0),%0 \n"
1949 "pand %%xmm5,%%xmm0 \n"
1950 "pand %%xmm5,%%xmm1 \n"
1951 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001952 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001953 "movdqu %%xmm0,(%1) \n"
1954 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001955 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001956 : "+r"(src_yuy2), // %0
1957 "+r"(dst_y), // %1
1958 "+r"(pix) // %2
1959 :
1960 : "memory", "cc"
1961#if defined(__SSE2__)
1962 , "xmm0", "xmm1", "xmm5"
1963#endif
1964 );
1965}
1966
1967void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
1968 int stride_yuy2,
1969                                 uint8* dst_u, uint8* dst_v,
1970 int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001971 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001972 "pcmpeqb %%xmm5,%%xmm5 \n"
1973 "psrlw $0x8,%%xmm5 \n"
1974 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001975 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001976 "1: \n"
1977 "movdqu (%0),%%xmm0 \n"
1978 "movdqu 0x10(%0),%%xmm1 \n"
1979 "movdqu (%0,%4,1),%%xmm2 \n"
1980 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
1981 "lea 0x20(%0),%0 \n"
1982 "pavgb %%xmm2,%%xmm0 \n"
1983 "pavgb %%xmm3,%%xmm1 \n"
1984 "psrlw $0x8,%%xmm0 \n"
1985 "psrlw $0x8,%%xmm1 \n"
1986 "packuswb %%xmm1,%%xmm0 \n"
1987 "movdqa %%xmm0,%%xmm1 \n"
1988 "pand %%xmm5,%%xmm0 \n"
1989 "packuswb %%xmm0,%%xmm0 \n"
1990 "psrlw $0x8,%%xmm1 \n"
1991 "packuswb %%xmm1,%%xmm1 \n"
1992 "movq %%xmm0,(%1) \n"
1993 "movq %%xmm1,(%1,%2) \n"
1994 "lea 0x8(%1),%1 \n"
1995 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001996 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001997 : "+r"(src_yuy2), // %0
1998 "+r"(dst_u), // %1
1999     "+r"(dst_v),   // %2
2000 "+r"(pix) // %3
2001 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2002 : "memory", "cc"
2003#if defined(__SSE2__)
2004 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2005#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002006 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002007}
2008
2009void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002010 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002011 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002012 "1: \n"
2013 "movdqa (%0),%%xmm0 \n"
2014 "movdqa 0x10(%0),%%xmm1 \n"
2015 "lea 0x20(%0),%0 \n"
2016 "psrlw $0x8,%%xmm0 \n"
2017 "psrlw $0x8,%%xmm1 \n"
2018 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002019 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002020 "movdqa %%xmm0,(%1) \n"
2021 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002022 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002023 : "+r"(src_uyvy), // %0
2024 "+r"(dst_y), // %1
2025 "+r"(pix) // %2
2026 :
2027 : "memory", "cc"
2028#if defined(__SSE2__)
2029 , "xmm0", "xmm1"
2030#endif
2031 );
2032}
2033
2034void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2035                       uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002036 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002037 "pcmpeqb %%xmm5,%%xmm5 \n"
2038 "psrlw $0x8,%%xmm5 \n"
2039 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002040 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002041 "1: \n"
2042 "movdqa (%0),%%xmm0 \n"
2043 "movdqa 0x10(%0),%%xmm1 \n"
2044 "movdqa (%0,%4,1),%%xmm2 \n"
2045 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2046 "lea 0x20(%0),%0 \n"
2047 "pavgb %%xmm2,%%xmm0 \n"
2048 "pavgb %%xmm3,%%xmm1 \n"
2049 "pand %%xmm5,%%xmm0 \n"
2050 "pand %%xmm5,%%xmm1 \n"
2051 "packuswb %%xmm1,%%xmm0 \n"
2052 "movdqa %%xmm0,%%xmm1 \n"
2053 "pand %%xmm5,%%xmm0 \n"
2054 "packuswb %%xmm0,%%xmm0 \n"
2055 "psrlw $0x8,%%xmm1 \n"
2056 "packuswb %%xmm1,%%xmm1 \n"
2057 "movq %%xmm0,(%1) \n"
2058 "movq %%xmm1,(%1,%2) \n"
2059 "lea 0x8(%1),%1 \n"
2060 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002061 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002062 : "+r"(src_uyvy), // %0
2063 "+r"(dst_u), // %1
2064     "+r"(dst_v),   // %2
2065 "+r"(pix) // %3
2066 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2067 : "memory", "cc"
2068#if defined(__SSE2__)
2069 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2070#endif
2071 );
2072}
2073
2074void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2075 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002076 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002077 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002078 "1: \n"
2079 "movdqu (%0),%%xmm0 \n"
2080 "movdqu 0x10(%0),%%xmm1 \n"
2081 "lea 0x20(%0),%0 \n"
2082 "psrlw $0x8,%%xmm0 \n"
2083 "psrlw $0x8,%%xmm1 \n"
2084 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002085 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002086 "movdqu %%xmm0,(%1) \n"
2087 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002088 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002089 : "+r"(src_uyvy), // %0
2090 "+r"(dst_y), // %1
2091 "+r"(pix) // %2
2092 :
2093 : "memory", "cc"
2094#if defined(__SSE2__)
2095 , "xmm0", "xmm1"
2096#endif
2097 );
2098}
2099
2100void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2101                                 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002102 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002103 "pcmpeqb %%xmm5,%%xmm5 \n"
2104 "psrlw $0x8,%%xmm5 \n"
2105 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002106 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002107 "1: \n"
2108 "movdqu (%0),%%xmm0 \n"
2109 "movdqu 0x10(%0),%%xmm1 \n"
2110 "movdqu (%0,%4,1),%%xmm2 \n"
2111 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2112 "lea 0x20(%0),%0 \n"
2113 "pavgb %%xmm2,%%xmm0 \n"
2114 "pavgb %%xmm3,%%xmm1 \n"
2115 "pand %%xmm5,%%xmm0 \n"
2116 "pand %%xmm5,%%xmm1 \n"
2117 "packuswb %%xmm1,%%xmm0 \n"
2118 "movdqa %%xmm0,%%xmm1 \n"
2119 "pand %%xmm5,%%xmm0 \n"
2120 "packuswb %%xmm0,%%xmm0 \n"
2121 "psrlw $0x8,%%xmm1 \n"
2122 "packuswb %%xmm1,%%xmm1 \n"
2123 "movq %%xmm0,(%1) \n"
2124 "movq %%xmm1,(%1,%2) \n"
2125 "lea 0x8(%1),%1 \n"
2126 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002127 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002128 : "+r"(src_uyvy), // %0
2129 "+r"(dst_u), // %1
2130     "+r"(dst_v),   // %2
2131 "+r"(pix) // %3
2132 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2133 : "memory", "cc"
2134#if defined(__SSE2__)
2135 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2136#endif
2137 );
2138}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002139#endif // HAS_YUY2TOYROW_SSE2
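
// Scalar sketch of the YUY2 chroma extraction above: U and V come from the odd
// bytes of each Y0 U0 Y1 V0 group and are averaged over two rows with
// pavgb-style rounding (illustrative only; the name is hypothetical).
static void YUY2ToUVRow_Sketch_C(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* next = src_yuy2 + stride_yuy2;
  for (int x = 0; x < pix; x += 2) {
    dst_u[x / 2] = static_cast<uint8>((src_yuy2[1] + next[1] + 1) >> 1);
    dst_v[x / 2] = static_cast<uint8>((src_yuy2[3] + next[3] + 1) >> 1);
    src_yuy2 += 4;
    next += 4;
  }
}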
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002140
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002141#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002142// Blend 8 pixels at a time.
2143// src_argb0 and src_argb1 may be unaligned (loaded with movdqu).
2144// dst_argb must be aligned to 16 bytes (stored with movdqa).
2145// width must be a multiple of 4 pixels.
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002146void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.comc757f302012-04-03 00:49:16 +00002147 uint8* dst_argb, int width) {
2148 asm volatile (
2149 "pcmpeqb %%xmm7,%%xmm7 \n"
2150 "psrlw $0xf,%%xmm7 \n"
2151 "pcmpeqb %%xmm6,%%xmm6 \n"
2152 "psrlw $0x8,%%xmm6 \n"
2153 "pcmpeqb %%xmm5,%%xmm5 \n"
2154 "psllw $0x8,%%xmm5 \n"
2155 "pcmpeqb %%xmm4,%%xmm4 \n"
2156 "pslld $0x18,%%xmm4 \n"
2157
2158 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002159 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002160 "1: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002161 "movdqu (%0),%%xmm3 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002162 "movdqa %%xmm3,%%xmm0 \n"
2163 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002164 "movdqu (%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002165 "psrlw $0x8,%%xmm3 \n"
2166 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2167 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2168 "pand %%xmm6,%%xmm2 \n"
2169 "paddw %%xmm7,%%xmm3 \n"
2170 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002171 "movdqu (%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002172 "psrlw $0x8,%%xmm1 \n"
2173 "por %%xmm4,%%xmm0 \n"
2174 "pmullw %%xmm3,%%xmm1 \n"
2175 "movdqu 0x10(%0),%%xmm3 \n"
2176 "lea 0x20(%0),%0 \n"
2177 "psrlw $0x8,%%xmm2 \n"
2178 "paddusb %%xmm2,%%xmm0 \n"
2179 "pand %%xmm5,%%xmm1 \n"
2180 "paddusb %%xmm1,%%xmm0 \n"
2181 "sub $0x4,%3 \n"
2182 "movdqa %%xmm0,(%2) \n"
2183 "jle 9f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002184 "movdqa %%xmm3,%%xmm0 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002185 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002186 "movdqu 0x10(%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002187 "psrlw $0x8,%%xmm3 \n"
2188 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2189 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2190 "pand %%xmm6,%%xmm2 \n"
2191 "paddw %%xmm7,%%xmm3 \n"
2192 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002193 "movdqu 0x10(%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002194 "lea 0x20(%1),%1 \n"
2195 "psrlw $0x8,%%xmm1 \n"
2196 "por %%xmm4,%%xmm0 \n"
2197 "pmullw %%xmm3,%%xmm1 \n"
2198 "psrlw $0x8,%%xmm2 \n"
2199 "paddusb %%xmm2,%%xmm0 \n"
2200 "pand %%xmm5,%%xmm1 \n"
2201 "paddusb %%xmm1,%%xmm0 \n"
2202 "sub $0x4,%3 \n"
2203 "movdqa %%xmm0,0x10(%2) \n"
2204 "lea 0x20(%2),%2 \n"
2205 "jg 1b \n"
2206 "9: \n"
2207 : "+r"(src_argb0), // %0
2208 "+r"(src_argb1), // %1
2209 "+r"(dst_argb), // %2
2210 "+r"(width) // %3
2211 :
2212 : "memory", "cc"
2213#if defined(__SSE2__)
2214 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2215#endif
2216 );
2217}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002218#endif // HAS_ARGBBLENDROW_SSE2
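
// Scalar sketch of the blend above: each src_argb1 channel is scaled by
// (256 - alpha) / 256 and added to the corresponding src_argb0 channel, so
// src_argb0 is effectively treated as premultiplied; the sum saturates and the
// destination alpha is forced to 255 (illustrative only; hypothetical name).
static void ARGBBlendPixel_Sketch_C(const uint8* src0, const uint8* src1,
                                    uint8* dst) {
  uint32 a = src0[3];
  for (int c = 0; c < 3; ++c) {  // B, G, R
    uint32 bg = (src1[c] * (256u - a)) >> 8;               // background weight
    uint32 sum = src0[c] + bg;                             // foreground added as-is
    dst[c] = static_cast<uint8>(sum > 255u ? 255u : sum);  // paddusb saturation
  }
  dst[3] = 255;
}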
fbarchard@google.comc757f302012-04-03 00:49:16 +00002219
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002220#ifdef HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002221// Blend 1 pixel at a time, unaligned
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002222void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com96af8702012-04-06 18:22:27 +00002223 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002224 asm volatile (
2225 "pcmpeqb %%xmm7,%%xmm7 \n"
2226 "psrlw $0xf,%%xmm7 \n"
2227 "pcmpeqb %%xmm6,%%xmm6 \n"
2228 "psrlw $0x8,%%xmm6 \n"
2229 "pcmpeqb %%xmm5,%%xmm5 \n"
2230 "psllw $0x8,%%xmm5 \n"
2231 "pcmpeqb %%xmm4,%%xmm4 \n"
2232 "pslld $0x18,%%xmm4 \n"
2233
2234 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002235 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002236 "1: \n"
2237 "movd (%0),%%xmm3 \n"
2238 "lea 0x4(%0),%0 \n"
2239 "movdqa %%xmm3,%%xmm0 \n"
2240 "pxor %%xmm4,%%xmm3 \n"
2241 "movd (%1),%%xmm2 \n"
2242 "psrlw $0x8,%%xmm3 \n"
2243 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2244 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2245 "pand %%xmm6,%%xmm2 \n"
2246 "paddw %%xmm7,%%xmm3 \n"
2247 "pmullw %%xmm3,%%xmm2 \n"
2248 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002249 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002250 "psrlw $0x8,%%xmm1 \n"
2251 "por %%xmm4,%%xmm0 \n"
2252 "pmullw %%xmm3,%%xmm1 \n"
2253 "psrlw $0x8,%%xmm2 \n"
2254 "paddusb %%xmm2,%%xmm0 \n"
2255 "pand %%xmm5,%%xmm1 \n"
2256 "paddusb %%xmm1,%%xmm0 \n"
2257 "sub $0x1,%3 \n"
2258 "movd %%xmm0,(%2) \n"
2259 "lea 0x4(%2),%2 \n"
2260 "jg 1b \n"
2261 : "+r"(src_argb0), // %0
2262 "+r"(src_argb1), // %1
2263 "+r"(dst_argb), // %2
2264 "+r"(width) // %3
2265 :
2266 : "memory", "cc"
2267#if defined(__SSE2__)
2268 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2269#endif
2270 );
2271}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002272#endif // HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002273
fbarchard@google.com96af8702012-04-06 18:22:27 +00002274#ifdef HAS_ARGBBLENDROW_SSSE3
2275// Shuffle table for replicating the alpha byte of each pixel into a pair of zero-extended words.
2276CONST uvec8 kShuffleAlpha = {
2277 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2278 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2279};
2280void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002281 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002282 asm volatile (
2283 "pcmpeqb %%xmm7,%%xmm7 \n"
2284 "psrlw $0xf,%%xmm7 \n"
2285 "pcmpeqb %%xmm6,%%xmm6 \n"
2286 "psrlw $0x8,%%xmm6 \n"
2287 "pcmpeqb %%xmm5,%%xmm5 \n"
2288 "psllw $0x8,%%xmm5 \n"
2289 "pcmpeqb %%xmm4,%%xmm4 \n"
2290 "pslld $0x18,%%xmm4 \n"
2291
2292 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002293 ".p2align 4 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002294 "1: \n"
2295 "movdqu (%0),%%xmm3 \n"
2296 "movdqa %%xmm3,%%xmm0 \n"
2297 "pxor %%xmm4,%%xmm3 \n"
2298 "pshufb %4,%%xmm3 \n"
2299 "movdqu (%1),%%xmm2 \n"
2300 "pand %%xmm6,%%xmm2 \n"
2301 "paddw %%xmm7,%%xmm3 \n"
2302 "pmullw %%xmm3,%%xmm2 \n"
2303 "movdqu (%1),%%xmm1 \n"
2304 "psrlw $0x8,%%xmm1 \n"
2305 "por %%xmm4,%%xmm0 \n"
2306 "pmullw %%xmm3,%%xmm1 \n"
2307 "movdqu 0x10(%0),%%xmm3 \n"
2308 "lea 0x20(%0),%0 \n"
2309 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002310 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002311 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002312 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002313 "sub $0x4,%3 \n"
2314 "movdqa %%xmm0,(%2) \n"
2315 "jle 9f \n"
2316 "movdqa %%xmm3,%%xmm0 \n"
2317 "pxor %%xmm4,%%xmm3 \n"
2318 "movdqu 0x10(%1),%%xmm2 \n"
2319 "pshufb %4,%%xmm3 \n"
2320 "pand %%xmm6,%%xmm2 \n"
2321 "paddw %%xmm7,%%xmm3 \n"
2322 "pmullw %%xmm3,%%xmm2 \n"
2323 "movdqu 0x10(%1),%%xmm1 \n"
2324 "lea 0x20(%1),%1 \n"
2325 "psrlw $0x8,%%xmm1 \n"
2326 "por %%xmm4,%%xmm0 \n"
2327 "pmullw %%xmm3,%%xmm1 \n"
2328 "psrlw $0x8,%%xmm2 \n"
2329 "paddusb %%xmm2,%%xmm0 \n"
2330 "pand %%xmm5,%%xmm1 \n"
2331 "paddusb %%xmm1,%%xmm0 \n"
2332 "sub $0x4,%3 \n"
2333 "movdqa %%xmm0,0x10(%2) \n"
2334 "lea 0x20(%2),%2 \n"
2335 "jg 1b \n"
2336 "9: \n"
2337 : "+r"(src_argb0), // %0
2338 "+r"(src_argb1), // %1
2339 "+r"(dst_argb), // %2
2340 "+r"(width) // %3
2341 : "m"(kShuffleAlpha) // %4
2342 : "memory", "cc"
2343#if defined(__SSE2__)
2344 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2345#endif
2346 );
2347}
2348#endif // HAS_ARGBBLENDROW_SSSE3
2349
2351#ifdef HAS_ARGBBLENDROW1_SSSE3
2352// Blend 1 pixel at a time, unaligned
2353void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2354 uint8* dst_argb, int width) {
2355 asm volatile (
2356 "pcmpeqb %%xmm7,%%xmm7 \n"
2357 "psrlw $0xf,%%xmm7 \n"
2358 "pcmpeqb %%xmm6,%%xmm6 \n"
2359 "psrlw $0x8,%%xmm6 \n"
2360 "pcmpeqb %%xmm5,%%xmm5 \n"
2361 "psllw $0x8,%%xmm5 \n"
2362 "pcmpeqb %%xmm4,%%xmm4 \n"
2363 "pslld $0x18,%%xmm4 \n"
2364
2365 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002366 ".p2align 4 \n"
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002367 "1: \n"
2368 "movd (%0),%%xmm3 \n"
2369 "lea 0x4(%0),%0 \n"
2370 "movdqa %%xmm3,%%xmm0 \n"
2371 "pxor %%xmm4,%%xmm3 \n"
2372 "movd (%1),%%xmm2 \n"
2373 "pshufb %4,%%xmm3 \n"
2374 "pand %%xmm6,%%xmm2 \n"
2375 "paddw %%xmm7,%%xmm3 \n"
2376 "pmullw %%xmm3,%%xmm2 \n"
2377 "movd (%1),%%xmm1 \n"
2378 "lea 0x4(%1),%1 \n"
2379 "psrlw $0x8,%%xmm1 \n"
2380 "por %%xmm4,%%xmm0 \n"
2381 "pmullw %%xmm3,%%xmm1 \n"
2382 "psrlw $0x8,%%xmm2 \n"
2383 "paddusb %%xmm2,%%xmm0 \n"
2384 "pand %%xmm5,%%xmm1 \n"
2385 "paddusb %%xmm1,%%xmm0 \n"
2386 "sub $0x1,%3 \n"
2387 "movd %%xmm0,(%2) \n"
2388 "lea 0x4(%2),%2 \n"
2389 "jg 1b \n"
2390 : "+r"(src_argb0), // %0
2391 "+r"(src_argb1), // %1
2392 "+r"(dst_argb), // %2
2393 "+r"(width) // %3
2394 : "m"(kShuffleAlpha) // %4
2395 : "memory", "cc"
2396#if defined(__SSE2__)
2397 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2398#endif
2399 );
2400}
2401#endif // HAS_ARGBBLENDROW1_SSSE3
2402
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002403#ifdef HAS_ARGBATTENUATE_SSE2
2404// Attenuate 4 pixels at a time.
2405// src_argb and dst_argb must be 16-byte aligned; width must be a multiple of 4.
2406void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2407 asm volatile (
2408 "sub %0,%1 \n"
2409 "pcmpeqb %%xmm4,%%xmm4 \n"
2410 "pslld $0x18,%%xmm4 \n"
2411 "pcmpeqb %%xmm5,%%xmm5 \n"
2412 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002413
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002414 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002415 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002416 "1: \n"
2417 "movdqa (%0),%%xmm0 \n"
2418 "punpcklbw %%xmm0,%%xmm0 \n"
2419 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2420 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2421 "pmulhuw %%xmm2,%%xmm0 \n"
2422 "movdqa (%0),%%xmm1 \n"
2423 "punpckhbw %%xmm1,%%xmm1 \n"
2424 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2425 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2426 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002427 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002428 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002429 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002430 "psrlw $0x8,%%xmm1 \n"
2431 "packuswb %%xmm1,%%xmm0 \n"
2432 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002433 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002434 "sub $0x4,%2 \n"
2435 "movdqa %%xmm0,(%0,%1,1) \n"
2436 "lea 0x10(%0),%0 \n"
2437 "jg 1b \n"
2438 : "+r"(src_argb), // %0
2439 "+r"(dst_argb), // %1
2440 "+r"(width) // %2
2441 :
2442 : "memory", "cc"
2443#if defined(__SSE2__)
2444 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2445#endif
2446 );
2447}
2448#endif // HAS_ARGBATTENUATE_SSE2
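
// Scalar sketch of the attenuation above: each color channel is multiplied by
// its pixel's alpha using the same x257 fixed-point expansion as the
// punpcklbw/pmulhuw sequence, and alpha is passed through unchanged
// (illustrative only; the name is hypothetical).
static void ARGBAttenuateRow_Sketch_C(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  for (int x = 0; x < width; ++x) {
    uint32 a = src_argb[x * 4 + 3];
    for (int c = 0; c < 3; ++c) {  // B, G, R
      uint32 v = src_argb[x * 4 + c];
      // ((v * 257) * (a * 257) >> 16) >> 8 is approximately v * a / 255.
      dst_argb[x * 4 + c] =
          static_cast<uint8>((((v * 257u) * (a * 257u)) >> 16) >> 8);
    }
    dst_argb[x * 4 + 3] = static_cast<uint8>(a);
  }
}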
2449
fbarchard@google.com810cd912012-04-20 20:15:27 +00002450#ifdef HAS_ARGBATTENUATE_SSSE3
2451// Shuffle table duplicating alpha
2452CONST uvec8 kShuffleAlpha0 = {
2453 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2454};
2455CONST uvec8 kShuffleAlpha1 = {
2456 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2457 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2458};
2459// Attenuate 4 pixels at a time.
2460// src_argb and dst_argb must be 16-byte aligned; width must be a multiple of 4.
2461void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2462 asm volatile (
2463 "sub %0,%1 \n"
2464 "pcmpeqb %%xmm3,%%xmm3 \n"
2465 "pslld $0x18,%%xmm3 \n"
2466 "movdqa %3,%%xmm4 \n"
2467 "movdqa %4,%%xmm5 \n"
2468
2469 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002470 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002471 "1: \n"
2472 "movdqa (%0),%%xmm0 \n"
2473 "pshufb %%xmm4,%%xmm0 \n"
2474 "movdqa (%0),%%xmm1 \n"
2475 "punpcklbw %%xmm1,%%xmm1 \n"
2476 "pmulhuw %%xmm1,%%xmm0 \n"
2477 "movdqa (%0),%%xmm1 \n"
2478 "pshufb %%xmm5,%%xmm1 \n"
2479 "movdqa (%0),%%xmm2 \n"
2480 "punpckhbw %%xmm2,%%xmm2 \n"
2481 "pmulhuw %%xmm2,%%xmm1 \n"
2482 "movdqa (%0),%%xmm2 \n"
2483 "pand %%xmm3,%%xmm2 \n"
2484 "psrlw $0x8,%%xmm0 \n"
2485 "psrlw $0x8,%%xmm1 \n"
2486 "packuswb %%xmm1,%%xmm0 \n"
2487 "por %%xmm2,%%xmm0 \n"
2488 "sub $0x4,%2 \n"
2489 "movdqa %%xmm0,(%0,%1,1) \n"
2490 "lea 0x10(%0),%0 \n"
2491 "jg 1b \n"
2492 : "+r"(src_argb), // %0
2493 "+r"(dst_argb), // %1
2494 "+r"(width) // %2
2495 : "m"(kShuffleAlpha0), // %3
2496 "m"(kShuffleAlpha1) // %4
2497 : "memory", "cc"
2498#if defined(__SSE2__)
2499 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2500#endif
2501 );
2502}
2503#endif // HAS_ARGBATTENUATE_SSSE3
2504
2505#ifdef HAS_ARGBUNATTENUATE_SSE2
2506// Divide source RGB by alpha and store to destination.
2507// b = (b * 255 + (a / 2)) / a;
2508// g = (g * 255 + (a / 2)) / a;
2509// r = (r * 255 + (a / 2)) / a;
2510// The reciprocal method can be off by 1 for some values, e.g., 125.
2511// 8.16 fixed point inverse table
2512#define T(a) 0x10000 / a
2513CONST uint32 fixed_invtbl8[256] = {
2514 0x100, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
2515 T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
2516 T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
2517 T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
2518 T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
2519 T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
2520 T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
2521 T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
2522 T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
2523 T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
2524 T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
2525 T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
2526 T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
2527 T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
2528 T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
2529 T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
2530 T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
2531 T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
2532 T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
2533 T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
2534 T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
2535 T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
2536 T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
2537 T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
2538 T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
2539 T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
2540 T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
2541 T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
2542 T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
2543 T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
2544 T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
2545 T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x100 };
2546#undef T
2547
2548// Unattenuate 4 pixels at a time.
2549// src_argb and dst_argb must be 16-byte aligned; width must be a multiple of 4.
2550void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2551 int width) {
2552 uintptr_t alpha = 0;
2553 asm volatile (
2554 "sub %0,%1 \n"
2555 "pcmpeqb %%xmm4,%%xmm4 \n"
2556 "pslld $0x18,%%xmm4 \n"
2557
2558 // 4 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002559 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002560 "1: \n"
2561 "movdqa (%0),%%xmm0 \n"
2562 "movzb 0x3(%0),%3 \n"
2563 "punpcklbw %%xmm0,%%xmm0 \n"
2564 "movd 0x0(%4,%3,4),%%xmm2 \n"
2565 "movzb 0x7(%0),%3 \n"
2566 "movd 0x0(%4,%3,4),%%xmm3 \n"
2567 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2568 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2569 "movlhps %%xmm3,%%xmm2 \n"
2570 "pmulhuw %%xmm2,%%xmm0 \n"
2571 "movdqa (%0),%%xmm1 \n"
2572 "movzb 0xb(%0),%3 \n"
2573 "punpckhbw %%xmm1,%%xmm1 \n"
2574 "movd 0x0(%4,%3,4),%%xmm2 \n"
2575 "movzb 0xf(%0),%3 \n"
2576 "movd 0x0(%4,%3,4),%%xmm3 \n"
2577 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2578 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2579 "movlhps %%xmm3,%%xmm2 \n"
2580 "pmulhuw %%xmm2,%%xmm1 \n"
2581 "movdqa (%0),%%xmm2 \n"
2582 "pand %%xmm4,%%xmm2 \n"
2583 "packuswb %%xmm1,%%xmm0 \n"
2584 "por %%xmm2,%%xmm0 \n"
2585 "sub $0x4,%2 \n"
2586 "movdqa %%xmm0,(%0,%1,1) \n"
2587 "lea 0x10(%0),%0 \n"
2588 "jg 1b \n"
2589 : "+r"(src_argb), // %0
2590 "+r"(dst_argb), // %1
2591 "+r"(width), // %2
2592 "+r"(alpha) // %3
2593 : "r"(fixed_invtbl8) // %4
2594 : "memory", "cc"
2595#if defined(__SSE2__)
2596 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2597#endif
2598 );
2599}
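
// Scalar sketch of the unattenuation above: each color channel is scaled by
// the 65536/alpha entry from fixed_invtbl8, mirroring the pmulhuw sequence,
// and alpha is passed through unchanged (illustrative only; the name is
// hypothetical).
static void ARGBUnattenuateRow_Sketch_C(const uint8* src_argb, uint8* dst_argb,
                                        int width) {
  for (int x = 0; x < width; ++x) {
    uint32 a = src_argb[x * 4 + 3];
    uint32 inv = fixed_invtbl8[a];  // roughly 65536 / a
    for (int c = 0; c < 3; ++c) {   // B, G, R
      uint32 v = ((src_argb[x * 4 + c] * 257u) * inv) >> 16;
      dst_argb[x * 4 + c] = static_cast<uint8>(v > 255u ? 255u : v);
    }
    dst_argb[x * 4 + 3] = static_cast<uint8>(a);
  }
}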
2600#endif // HAS_ARGBUNATTENUATE_SSE2
2601
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002602#endif // defined(__x86_64__) || defined(__i386__)
2603
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002604#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002605} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002606} // namespace libyuv
2607#endif