blob: 28f10c040ae81b0baf50f7fe3058b2c9d0d9f5d5 [file] [log] [blame]
/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com2b9c2102012-03-22 22:36:44 +000011#include "source/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
// CONST is the qualifier prefix used when declaring the constant tables in
// this file: it expands to nothing when __APPLE__ is defined and to
// "static const" otherwise.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000101// Shuffle table for converting ARGB to RGB24.
102CONST uvec8 kShuffleMaskARGBToRGB24 = {
103 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
104};
105
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000106// Shuffle table for converting ARGB to RAW.
107CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000108 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000109};
110
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000112 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000113 "pcmpeqb %%xmm5,%%xmm5 \n"
114 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000115 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000116 "1: \n"
117 "movq (%0),%%xmm0 \n"
118 "lea 0x8(%0),%0 \n"
119 "punpcklbw %%xmm0,%%xmm0 \n"
120 "movdqa %%xmm0,%%xmm1 \n"
121 "punpcklwd %%xmm0,%%xmm0 \n"
122 "punpckhwd %%xmm1,%%xmm1 \n"
123 "por %%xmm5,%%xmm0 \n"
124 "por %%xmm5,%%xmm1 \n"
125 "movdqa %%xmm0,(%1) \n"
126 "movdqa %%xmm1,0x10(%1) \n"
127 "lea 0x20(%1),%1 \n"
128 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000129 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000130 : "+r"(src_y), // %0
131 "+r"(dst_argb), // %1
132 "+r"(pix) // %2
133 :
134 : "memory", "cc"
135#if defined(__SSE2__)
136 , "xmm0", "xmm1", "xmm5"
137#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000138 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000139}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000140
141void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000142 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000143 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000144 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000145 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "1: \n"
147 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000148 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000150 "movdqa %%xmm0,(%0,%1,1) \n"
151 "lea 0x10(%0),%0 \n"
152 "jg 1b \n"
153
fbarchard@google.comb6149762011-11-07 21:58:52 +0000154 : "+r"(src_abgr), // %0
155 "+r"(dst_argb), // %1
156 "+r"(pix) // %2
157 : "m"(kShuffleMaskABGRToARGB) // %3
158 : "memory", "cc"
159#if defined(__SSE2__)
160 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000161#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000162 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163}
164
165void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000166 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000167 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000168 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000169 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000170 "1: \n"
171 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000172 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000173 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000174 "movdqa %%xmm0,(%0,%1,1) \n"
175 "lea 0x10(%0),%0 \n"
176 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000177 : "+r"(src_bgra), // %0
178 "+r"(dst_argb), // %1
179 "+r"(pix) // %2
180 : "m"(kShuffleMaskBGRAToARGB) // %3
181 : "memory", "cc"
182#if defined(__SSE2__)
183 , "xmm0", "xmm5"
184#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000185 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000186}
187
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000188void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000189 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000190 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
191 "pslld $0x18,%%xmm5 \n"
192 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000193 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000194 "1: \n"
195 "movdqu (%0),%%xmm0 \n"
196 "movdqu 0x10(%0),%%xmm1 \n"
197 "movdqu 0x20(%0),%%xmm3 \n"
198 "lea 0x30(%0),%0 \n"
199 "movdqa %%xmm3,%%xmm2 \n"
200 "palignr $0x8,%%xmm1,%%xmm2 \n"
201 "pshufb %%xmm4,%%xmm2 \n"
202 "por %%xmm5,%%xmm2 \n"
203 "palignr $0xc,%%xmm0,%%xmm1 \n"
204 "pshufb %%xmm4,%%xmm0 \n"
205 "movdqa %%xmm2,0x20(%1) \n"
206 "por %%xmm5,%%xmm0 \n"
207 "pshufb %%xmm4,%%xmm1 \n"
208 "movdqa %%xmm0,(%1) \n"
209 "por %%xmm5,%%xmm1 \n"
210 "palignr $0x4,%%xmm3,%%xmm3 \n"
211 "pshufb %%xmm4,%%xmm3 \n"
212 "movdqa %%xmm1,0x10(%1) \n"
213 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000214 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000215 "movdqa %%xmm3,0x30(%1) \n"
216 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000217 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000218 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000219 "+r"(dst_argb), // %1
220 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000221 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000222 : "memory", "cc"
223#if defined(__SSE2__)
224 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
225#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000226 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000227}
228
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000229void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000230 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000231 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
232 "pslld $0x18,%%xmm5 \n"
233 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000234 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000235 "1: \n"
236 "movdqu (%0),%%xmm0 \n"
237 "movdqu 0x10(%0),%%xmm1 \n"
238 "movdqu 0x20(%0),%%xmm3 \n"
239 "lea 0x30(%0),%0 \n"
240 "movdqa %%xmm3,%%xmm2 \n"
241 "palignr $0x8,%%xmm1,%%xmm2 \n"
242 "pshufb %%xmm4,%%xmm2 \n"
243 "por %%xmm5,%%xmm2 \n"
244 "palignr $0xc,%%xmm0,%%xmm1 \n"
245 "pshufb %%xmm4,%%xmm0 \n"
246 "movdqa %%xmm2,0x20(%1) \n"
247 "por %%xmm5,%%xmm0 \n"
248 "pshufb %%xmm4,%%xmm1 \n"
249 "movdqa %%xmm0,(%1) \n"
250 "por %%xmm5,%%xmm1 \n"
251 "palignr $0x4,%%xmm3,%%xmm3 \n"
252 "pshufb %%xmm4,%%xmm3 \n"
253 "movdqa %%xmm1,0x10(%1) \n"
254 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000255 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000256 "movdqa %%xmm3,0x30(%1) \n"
257 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000258 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000259 : "+r"(src_raw), // %0
260 "+r"(dst_argb), // %1
261 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000262 : "m"(kShuffleMaskRAWToARGB) // %3
263 : "memory", "cc"
264#if defined(__SSE2__)
265 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
266#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000267 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000268}
269
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000270void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000271 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000272 "mov $0x1080108,%%eax \n"
273 "movd %%eax,%%xmm5 \n"
274 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000275 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000276 "movd %%eax,%%xmm6 \n"
277 "pshufd $0x0,%%xmm6,%%xmm6 \n"
278 "pcmpeqb %%xmm3,%%xmm3 \n"
279 "psllw $0xb,%%xmm3 \n"
280 "pcmpeqb %%xmm4,%%xmm4 \n"
281 "psllw $0xa,%%xmm4 \n"
282 "psrlw $0x5,%%xmm4 \n"
283 "pcmpeqb %%xmm7,%%xmm7 \n"
284 "psllw $0x8,%%xmm7 \n"
285 "sub %0,%1 \n"
286 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000287 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000288 "1: \n"
289 "movdqu (%0),%%xmm0 \n"
290 "movdqa %%xmm0,%%xmm1 \n"
291 "movdqa %%xmm0,%%xmm2 \n"
292 "pand %%xmm3,%%xmm1 \n"
293 "psllw $0xb,%%xmm2 \n"
294 "pmulhuw %%xmm5,%%xmm1 \n"
295 "pmulhuw %%xmm5,%%xmm2 \n"
296 "psllw $0x8,%%xmm1 \n"
297 "por %%xmm2,%%xmm1 \n"
298 "pand %%xmm4,%%xmm0 \n"
299 "pmulhuw %%xmm6,%%xmm0 \n"
300 "por %%xmm7,%%xmm0 \n"
301 "movdqa %%xmm1,%%xmm2 \n"
302 "punpcklbw %%xmm0,%%xmm1 \n"
303 "punpckhbw %%xmm0,%%xmm2 \n"
304 "movdqa %%xmm1,(%1,%0,2) \n"
305 "movdqa %%xmm2,0x10(%1,%0,2) \n"
306 "lea 0x10(%0),%0 \n"
307 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000308 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000309 : "+r"(src), // %0
310 "+r"(dst), // %1
311 "+r"(pix) // %2
312 :
313 : "memory", "cc", "eax"
314#if defined(__SSE2__)
315 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
316#endif
317 );
318}
319
320void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000321 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000322 "mov $0x1080108,%%eax \n"
323 "movd %%eax,%%xmm5 \n"
324 "pshufd $0x0,%%xmm5,%%xmm5 \n"
325 "mov $0x42004200,%%eax \n"
326 "movd %%eax,%%xmm6 \n"
327 "pshufd $0x0,%%xmm6,%%xmm6 \n"
328 "pcmpeqb %%xmm3,%%xmm3 \n"
329 "psllw $0xb,%%xmm3 \n"
330 "movdqa %%xmm3,%%xmm4 \n"
331 "psrlw $0x6,%%xmm4 \n"
332 "pcmpeqb %%xmm7,%%xmm7 \n"
333 "psllw $0x8,%%xmm7 \n"
334 "sub %0,%1 \n"
335 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000336 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000337 "1: \n"
338 "movdqu (%0),%%xmm0 \n"
339 "movdqa %%xmm0,%%xmm1 \n"
340 "movdqa %%xmm0,%%xmm2 \n"
341 "psllw $0x1,%%xmm1 \n"
342 "psllw $0xb,%%xmm2 \n"
343 "pand %%xmm3,%%xmm1 \n"
344 "pmulhuw %%xmm5,%%xmm2 \n"
345 "pmulhuw %%xmm5,%%xmm1 \n"
346 "psllw $0x8,%%xmm1 \n"
347 "por %%xmm2,%%xmm1 \n"
348 "movdqa %%xmm0,%%xmm2 \n"
349 "pand %%xmm4,%%xmm0 \n"
350 "psraw $0x8,%%xmm2 \n"
351 "pmulhuw %%xmm6,%%xmm0 \n"
352 "pand %%xmm7,%%xmm2 \n"
353 "por %%xmm2,%%xmm0 \n"
354 "movdqa %%xmm1,%%xmm2 \n"
355 "punpcklbw %%xmm0,%%xmm1 \n"
356 "punpckhbw %%xmm0,%%xmm2 \n"
357 "movdqa %%xmm1,(%1,%0,2) \n"
358 "movdqa %%xmm2,0x10(%1,%0,2) \n"
359 "lea 0x10(%0),%0 \n"
360 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000361 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000362 : "+r"(src), // %0
363 "+r"(dst), // %1
364 "+r"(pix) // %2
365 :
366 : "memory", "cc", "eax"
367#if defined(__SSE2__)
368 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
369#endif
370 );
371}
372
373void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000374 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000375 "mov $0xf0f0f0f,%%eax \n"
376 "movd %%eax,%%xmm4 \n"
377 "pshufd $0x0,%%xmm4,%%xmm4 \n"
378 "movdqa %%xmm4,%%xmm5 \n"
379 "pslld $0x4,%%xmm5 \n"
380 "sub %0,%1 \n"
381 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000382 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000383 "1: \n"
384 "movdqu (%0),%%xmm0 \n"
385 "movdqa %%xmm0,%%xmm2 \n"
386 "pand %%xmm4,%%xmm0 \n"
387 "pand %%xmm5,%%xmm2 \n"
388 "movdqa %%xmm0,%%xmm1 \n"
389 "movdqa %%xmm2,%%xmm3 \n"
390 "psllw $0x4,%%xmm1 \n"
391 "psrlw $0x4,%%xmm3 \n"
392 "por %%xmm1,%%xmm0 \n"
393 "por %%xmm3,%%xmm2 \n"
394 "movdqa %%xmm0,%%xmm1 \n"
395 "punpcklbw %%xmm2,%%xmm0 \n"
396 "punpckhbw %%xmm2,%%xmm1 \n"
397 "movdqa %%xmm0,(%1,%0,2) \n"
398 "movdqa %%xmm1,0x10(%1,%0,2) \n"
399 "lea 0x10(%0),%0 \n"
400 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000401 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000402 : "+r"(src), // %0
403 "+r"(dst), // %1
404 "+r"(pix) // %2
405 :
406 : "memory", "cc", "eax"
407#if defined(__SSE2__)
408 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
409#endif
410 );
411}
412
413void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000414 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000415 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000416 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000417 "1: \n"
418 "movdqa (%0),%%xmm0 \n"
419 "movdqa 0x10(%0),%%xmm1 \n"
420 "movdqa 0x20(%0),%%xmm2 \n"
421 "movdqa 0x30(%0),%%xmm3 \n"
422 "lea 0x40(%0),%0 \n"
423 "pshufb %%xmm6,%%xmm0 \n"
424 "pshufb %%xmm6,%%xmm1 \n"
425 "pshufb %%xmm6,%%xmm2 \n"
426 "pshufb %%xmm6,%%xmm3 \n"
427 "movdqa %%xmm1,%%xmm4 \n"
428 "psrldq $0x4,%%xmm1 \n"
429 "pslldq $0xc,%%xmm4 \n"
430 "movdqa %%xmm2,%%xmm5 \n"
431 "por %%xmm4,%%xmm0 \n"
432 "pslldq $0x8,%%xmm5 \n"
433 "movdqa %%xmm0,(%1) \n"
434 "por %%xmm5,%%xmm1 \n"
435 "psrldq $0x8,%%xmm2 \n"
436 "pslldq $0x4,%%xmm3 \n"
437 "por %%xmm3,%%xmm2 \n"
438 "movdqa %%xmm1,0x10(%1) \n"
439 "movdqa %%xmm2,0x20(%1) \n"
440 "lea 0x30(%1),%1 \n"
441 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000442 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000443 : "+r"(src), // %0
444 "+r"(dst), // %1
445 "+r"(pix) // %2
446 : "m"(kShuffleMaskARGBToRGB24) // %3
447 : "memory", "cc"
448#if defined(__SSE2__)
449 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
450#endif
451 );
452}
453
454void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000455 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000456 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000457 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000458 "1: \n"
459 "movdqa (%0),%%xmm0 \n"
460 "movdqa 0x10(%0),%%xmm1 \n"
461 "movdqa 0x20(%0),%%xmm2 \n"
462 "movdqa 0x30(%0),%%xmm3 \n"
463 "lea 0x40(%0),%0 \n"
464 "pshufb %%xmm6,%%xmm0 \n"
465 "pshufb %%xmm6,%%xmm1 \n"
466 "pshufb %%xmm6,%%xmm2 \n"
467 "pshufb %%xmm6,%%xmm3 \n"
468 "movdqa %%xmm1,%%xmm4 \n"
469 "psrldq $0x4,%%xmm1 \n"
470 "pslldq $0xc,%%xmm4 \n"
471 "movdqa %%xmm2,%%xmm5 \n"
472 "por %%xmm4,%%xmm0 \n"
473 "pslldq $0x8,%%xmm5 \n"
474 "movdqa %%xmm0,(%1) \n"
475 "por %%xmm5,%%xmm1 \n"
476 "psrldq $0x8,%%xmm2 \n"
477 "pslldq $0x4,%%xmm3 \n"
478 "por %%xmm3,%%xmm2 \n"
479 "movdqa %%xmm1,0x10(%1) \n"
480 "movdqa %%xmm2,0x20(%1) \n"
481 "lea 0x30(%1),%1 \n"
482 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000483 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000484 : "+r"(src), // %0
485 "+r"(dst), // %1
486 "+r"(pix) // %2
487 : "m"(kShuffleMaskARGBToRAW) // %3
488 : "memory", "cc"
489#if defined(__SSE2__)
490 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
491#endif
492 );
493}
494
495void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000496 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000497 "pcmpeqb %%xmm3,%%xmm3 \n"
498 "psrld $0x1b,%%xmm3 \n"
499 "pcmpeqb %%xmm4,%%xmm4 \n"
500 "psrld $0x1a,%%xmm4 \n"
501 "pslld $0x5,%%xmm4 \n"
502 "pcmpeqb %%xmm5,%%xmm5 \n"
503 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000504 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000505 "1: \n"
506 "movdqa (%0),%%xmm0 \n"
507 "movdqa %%xmm0,%%xmm1 \n"
508 "movdqa %%xmm0,%%xmm2 \n"
509 "pslld $0x8,%%xmm0 \n"
510 "psrld $0x3,%%xmm1 \n"
511 "psrld $0x5,%%xmm2 \n"
512 "psrad $0x10,%%xmm0 \n"
513 "pand %%xmm3,%%xmm1 \n"
514 "pand %%xmm4,%%xmm2 \n"
515 "pand %%xmm5,%%xmm0 \n"
516 "por %%xmm2,%%xmm1 \n"
517 "por %%xmm1,%%xmm0 \n"
518 "packssdw %%xmm0,%%xmm0 \n"
519 "lea 0x10(%0),%0 \n"
520 "movq %%xmm0,(%1) \n"
521 "lea 0x8(%1),%1 \n"
522 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000523 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000524 : "+r"(src), // %0
525 "+r"(dst), // %1
526 "+r"(pix) // %2
527 :
528 : "memory", "cc"
529#if defined(__SSE2__)
530 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
531#endif
532 );
533}
534
535void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000536 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000537 "pcmpeqb %%xmm4,%%xmm4 \n"
538 "psrld $0x1b,%%xmm4 \n"
539 "movdqa %%xmm4,%%xmm5 \n"
540 "pslld $0x5,%%xmm5 \n"
541 "movdqa %%xmm4,%%xmm6 \n"
542 "pslld $0xa,%%xmm6 \n"
543 "pcmpeqb %%xmm7,%%xmm7 \n"
544 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000545 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000546 "1: \n"
547 "movdqa (%0),%%xmm0 \n"
548 "movdqa %%xmm0,%%xmm1 \n"
549 "movdqa %%xmm0,%%xmm2 \n"
550 "movdqa %%xmm0,%%xmm3 \n"
551 "psrad $0x10,%%xmm0 \n"
552 "psrld $0x3,%%xmm1 \n"
553 "psrld $0x6,%%xmm2 \n"
554 "psrld $0x9,%%xmm3 \n"
555 "pand %%xmm7,%%xmm0 \n"
556 "pand %%xmm4,%%xmm1 \n"
557 "pand %%xmm5,%%xmm2 \n"
558 "pand %%xmm6,%%xmm3 \n"
559 "por %%xmm1,%%xmm0 \n"
560 "por %%xmm3,%%xmm2 \n"
561 "por %%xmm2,%%xmm0 \n"
562 "packssdw %%xmm0,%%xmm0 \n"
563 "lea 0x10(%0),%0 \n"
564 "movq %%xmm0,(%1) \n"
565 "lea 0x8(%1),%1 \n"
566 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000567 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000568 : "+r"(src), // %0
569 "+r"(dst), // %1
570 "+r"(pix) // %2
571 :
572 : "memory", "cc"
573#if defined(__SSE2__)
574 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
575#endif
576 );
577}
578
579void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000580 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000581 "pcmpeqb %%xmm4,%%xmm4 \n"
582 "psllw $0xc,%%xmm4 \n"
583 "movdqa %%xmm4,%%xmm3 \n"
584 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000585 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000586 "1: \n"
587 "movdqa (%0),%%xmm0 \n"
588 "movdqa %%xmm0,%%xmm1 \n"
589 "pand %%xmm3,%%xmm0 \n"
590 "pand %%xmm4,%%xmm1 \n"
591 "psrlq $0x4,%%xmm0 \n"
592 "psrlq $0x8,%%xmm1 \n"
593 "por %%xmm1,%%xmm0 \n"
594 "packuswb %%xmm0,%%xmm0 \n"
595 "lea 0x10(%0),%0 \n"
596 "movq %%xmm0,(%1) \n"
597 "lea 0x8(%1),%1 \n"
598 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000599 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000600 : "+r"(src), // %0
601 "+r"(dst), // %1
602 "+r"(pix) // %2
603 :
604 : "memory", "cc"
605#if defined(__SSE2__)
606 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
607#endif
608 );
609}
610
fbarchard@google.comb6149762011-11-07 21:58:52 +0000611void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000612 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000613 "movdqa %4,%%xmm5 \n"
614 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000615 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000616 "1: \n"
617 "movdqa (%0),%%xmm0 \n"
618 "movdqa 0x10(%0),%%xmm1 \n"
619 "movdqa 0x20(%0),%%xmm2 \n"
620 "movdqa 0x30(%0),%%xmm3 \n"
621 "pmaddubsw %%xmm4,%%xmm0 \n"
622 "pmaddubsw %%xmm4,%%xmm1 \n"
623 "pmaddubsw %%xmm4,%%xmm2 \n"
624 "pmaddubsw %%xmm4,%%xmm3 \n"
625 "lea 0x40(%0),%0 \n"
626 "phaddw %%xmm1,%%xmm0 \n"
627 "phaddw %%xmm3,%%xmm2 \n"
628 "psrlw $0x7,%%xmm0 \n"
629 "psrlw $0x7,%%xmm2 \n"
630 "packuswb %%xmm2,%%xmm0 \n"
631 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000632 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000633 "movdqa %%xmm0,(%1) \n"
634 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000635 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000636 : "+r"(src_argb), // %0
637 "+r"(dst_y), // %1
638 "+r"(pix) // %2
639 : "m"(kARGBToY), // %3
640 "m"(kAddY16) // %4
641 : "memory", "cc"
642#if defined(__SSE2__)
643 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
644#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000645 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000646}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000647
648void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000649 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000650 "movdqa %4,%%xmm5 \n"
651 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000652 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000653 "1: \n"
654 "movdqu (%0),%%xmm0 \n"
655 "movdqu 0x10(%0),%%xmm1 \n"
656 "movdqu 0x20(%0),%%xmm2 \n"
657 "movdqu 0x30(%0),%%xmm3 \n"
658 "pmaddubsw %%xmm4,%%xmm0 \n"
659 "pmaddubsw %%xmm4,%%xmm1 \n"
660 "pmaddubsw %%xmm4,%%xmm2 \n"
661 "pmaddubsw %%xmm4,%%xmm3 \n"
662 "lea 0x40(%0),%0 \n"
663 "phaddw %%xmm1,%%xmm0 \n"
664 "phaddw %%xmm3,%%xmm2 \n"
665 "psrlw $0x7,%%xmm0 \n"
666 "psrlw $0x7,%%xmm2 \n"
667 "packuswb %%xmm2,%%xmm0 \n"
668 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000669 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000670 "movdqu %%xmm0,(%1) \n"
671 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000672 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000673 : "+r"(src_argb), // %0
674 "+r"(dst_y), // %1
675 "+r"(pix) // %2
676 : "m"(kARGBToY), // %3
677 "m"(kAddY16) // %4
678 : "memory", "cc"
679#if defined(__SSE2__)
680 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
681#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000682 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000683}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000684
fbarchard@google.com714050a2012-02-17 22:59:56 +0000685// TODO(fbarchard): pass xmm constants to single block of assembly.
686// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
687// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
688// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
689// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000690void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
691 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000692 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000693 "movdqa %0,%%xmm4 \n"
694 "movdqa %1,%%xmm3 \n"
695 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000696 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000697 : "m"(kARGBToU), // %0
698 "m"(kARGBToV), // %1
699 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000700 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000701 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000702 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000703 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000704 "1: \n"
705 "movdqa (%0),%%xmm0 \n"
706 "movdqa 0x10(%0),%%xmm1 \n"
707 "movdqa 0x20(%0),%%xmm2 \n"
708 "movdqa 0x30(%0),%%xmm6 \n"
709 "pavgb (%0,%4,1),%%xmm0 \n"
710 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
711 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
712 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
713 "lea 0x40(%0),%0 \n"
714 "movdqa %%xmm0,%%xmm7 \n"
715 "shufps $0x88,%%xmm1,%%xmm0 \n"
716 "shufps $0xdd,%%xmm1,%%xmm7 \n"
717 "pavgb %%xmm7,%%xmm0 \n"
718 "movdqa %%xmm2,%%xmm7 \n"
719 "shufps $0x88,%%xmm6,%%xmm2 \n"
720 "shufps $0xdd,%%xmm6,%%xmm7 \n"
721 "pavgb %%xmm7,%%xmm2 \n"
722 "movdqa %%xmm0,%%xmm1 \n"
723 "movdqa %%xmm2,%%xmm6 \n"
724 "pmaddubsw %%xmm4,%%xmm0 \n"
725 "pmaddubsw %%xmm4,%%xmm2 \n"
726 "pmaddubsw %%xmm3,%%xmm1 \n"
727 "pmaddubsw %%xmm3,%%xmm6 \n"
728 "phaddw %%xmm2,%%xmm0 \n"
729 "phaddw %%xmm6,%%xmm1 \n"
730 "psraw $0x8,%%xmm0 \n"
731 "psraw $0x8,%%xmm1 \n"
732 "packsswb %%xmm1,%%xmm0 \n"
733 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000734 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000735 "movlps %%xmm0,(%1) \n"
736 "movhps %%xmm0,(%1,%2,1) \n"
737 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000738 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000739 : "+r"(src_argb0), // %0
740 "+r"(dst_u), // %1
741 "+r"(dst_v), // %2
742 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000743 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000744 : "memory", "cc"
745#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000746 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000747#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000748 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000749}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000750
751void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
752 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000753 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000754 "movdqa %0,%%xmm4 \n"
755 "movdqa %1,%%xmm3 \n"
756 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000757 :
758 : "m"(kARGBToU), // %0
759 "m"(kARGBToV), // %1
760 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000761 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000762 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000763 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000764 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000765 "1: \n"
766 "movdqu (%0),%%xmm0 \n"
767 "movdqu 0x10(%0),%%xmm1 \n"
768 "movdqu 0x20(%0),%%xmm2 \n"
769 "movdqu 0x30(%0),%%xmm6 \n"
770 "movdqu (%0,%4,1),%%xmm7 \n"
771 "pavgb %%xmm7,%%xmm0 \n"
772 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
773 "pavgb %%xmm7,%%xmm1 \n"
774 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
775 "pavgb %%xmm7,%%xmm2 \n"
776 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
777 "pavgb %%xmm7,%%xmm6 \n"
778 "lea 0x40(%0),%0 \n"
779 "movdqa %%xmm0,%%xmm7 \n"
780 "shufps $0x88,%%xmm1,%%xmm0 \n"
781 "shufps $0xdd,%%xmm1,%%xmm7 \n"
782 "pavgb %%xmm7,%%xmm0 \n"
783 "movdqa %%xmm2,%%xmm7 \n"
784 "shufps $0x88,%%xmm6,%%xmm2 \n"
785 "shufps $0xdd,%%xmm6,%%xmm7 \n"
786 "pavgb %%xmm7,%%xmm2 \n"
787 "movdqa %%xmm0,%%xmm1 \n"
788 "movdqa %%xmm2,%%xmm6 \n"
789 "pmaddubsw %%xmm4,%%xmm0 \n"
790 "pmaddubsw %%xmm4,%%xmm2 \n"
791 "pmaddubsw %%xmm3,%%xmm1 \n"
792 "pmaddubsw %%xmm3,%%xmm6 \n"
793 "phaddw %%xmm2,%%xmm0 \n"
794 "phaddw %%xmm6,%%xmm1 \n"
795 "psraw $0x8,%%xmm0 \n"
796 "psraw $0x8,%%xmm1 \n"
797 "packsswb %%xmm1,%%xmm0 \n"
798 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000799 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000800 "movlps %%xmm0,(%1) \n"
801 "movhps %%xmm0,(%1,%2,1) \n"
802 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000803 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000804 : "+r"(src_argb0), // %0
805 "+r"(dst_u), // %1
806 "+r"(dst_v), // %2
807 "+rm"(width) // %3
808 : "r"(static_cast<intptr_t>(src_stride_argb))
809 : "memory", "cc"
810#if defined(__SSE2__)
811 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
812#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000813 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000814}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000815
fbarchard@google.com714050a2012-02-17 22:59:56 +0000816void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000817 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000818 "movdqa %4,%%xmm5 \n"
819 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000820 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000821 "1: \n"
822 "movdqa (%0),%%xmm0 \n"
823 "movdqa 0x10(%0),%%xmm1 \n"
824 "movdqa 0x20(%0),%%xmm2 \n"
825 "movdqa 0x30(%0),%%xmm3 \n"
826 "pmaddubsw %%xmm4,%%xmm0 \n"
827 "pmaddubsw %%xmm4,%%xmm1 \n"
828 "pmaddubsw %%xmm4,%%xmm2 \n"
829 "pmaddubsw %%xmm4,%%xmm3 \n"
830 "lea 0x40(%0),%0 \n"
831 "phaddw %%xmm1,%%xmm0 \n"
832 "phaddw %%xmm3,%%xmm2 \n"
833 "psrlw $0x7,%%xmm0 \n"
834 "psrlw $0x7,%%xmm2 \n"
835 "packuswb %%xmm2,%%xmm0 \n"
836 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000837 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000838 "movdqa %%xmm0,(%1) \n"
839 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000840 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000841 : "+r"(src_bgra), // %0
842 "+r"(dst_y), // %1
843 "+r"(pix) // %2
844 : "m"(kBGRAToY), // %3
845 "m"(kAddY16) // %4
846 : "memory", "cc"
847#if defined(__SSE2__)
848 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000849#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000850 );
851}
852
853void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000854 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000855 "movdqa %4,%%xmm5 \n"
856 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000857 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000858 "1: \n"
859 "movdqu (%0),%%xmm0 \n"
860 "movdqu 0x10(%0),%%xmm1 \n"
861 "movdqu 0x20(%0),%%xmm2 \n"
862 "movdqu 0x30(%0),%%xmm3 \n"
863 "pmaddubsw %%xmm4,%%xmm0 \n"
864 "pmaddubsw %%xmm4,%%xmm1 \n"
865 "pmaddubsw %%xmm4,%%xmm2 \n"
866 "pmaddubsw %%xmm4,%%xmm3 \n"
867 "lea 0x40(%0),%0 \n"
868 "phaddw %%xmm1,%%xmm0 \n"
869 "phaddw %%xmm3,%%xmm2 \n"
870 "psrlw $0x7,%%xmm0 \n"
871 "psrlw $0x7,%%xmm2 \n"
872 "packuswb %%xmm2,%%xmm0 \n"
873 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000874 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000875 "movdqu %%xmm0,(%1) \n"
876 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000877 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000878 : "+r"(src_bgra), // %0
879 "+r"(dst_y), // %1
880 "+r"(pix) // %2
881 : "m"(kBGRAToY), // %3
882 "m"(kAddY16) // %4
883 : "memory", "cc"
884#if defined(__SSE2__)
885 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
886#endif
887 );
888}
889
890void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
891 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000892 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000893 "movdqa %0,%%xmm4 \n"
894 "movdqa %1,%%xmm3 \n"
895 "movdqa %2,%%xmm5 \n"
896 :
897 : "m"(kBGRAToU), // %0
898 "m"(kBGRAToV), // %1
899 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000900 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000901 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000902 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000903 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000904 "1: \n"
905 "movdqa (%0),%%xmm0 \n"
906 "movdqa 0x10(%0),%%xmm1 \n"
907 "movdqa 0x20(%0),%%xmm2 \n"
908 "movdqa 0x30(%0),%%xmm6 \n"
909 "pavgb (%0,%4,1),%%xmm0 \n"
910 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
911 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
912 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
913 "lea 0x40(%0),%0 \n"
914 "movdqa %%xmm0,%%xmm7 \n"
915 "shufps $0x88,%%xmm1,%%xmm0 \n"
916 "shufps $0xdd,%%xmm1,%%xmm7 \n"
917 "pavgb %%xmm7,%%xmm0 \n"
918 "movdqa %%xmm2,%%xmm7 \n"
919 "shufps $0x88,%%xmm6,%%xmm2 \n"
920 "shufps $0xdd,%%xmm6,%%xmm7 \n"
921 "pavgb %%xmm7,%%xmm2 \n"
922 "movdqa %%xmm0,%%xmm1 \n"
923 "movdqa %%xmm2,%%xmm6 \n"
924 "pmaddubsw %%xmm4,%%xmm0 \n"
925 "pmaddubsw %%xmm4,%%xmm2 \n"
926 "pmaddubsw %%xmm3,%%xmm1 \n"
927 "pmaddubsw %%xmm3,%%xmm6 \n"
928 "phaddw %%xmm2,%%xmm0 \n"
929 "phaddw %%xmm6,%%xmm1 \n"
930 "psraw $0x8,%%xmm0 \n"
931 "psraw $0x8,%%xmm1 \n"
932 "packsswb %%xmm1,%%xmm0 \n"
933 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000934 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000935 "movlps %%xmm0,(%1) \n"
936 "movhps %%xmm0,(%1,%2,1) \n"
937 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000938 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000939 : "+r"(src_bgra0), // %0
940 "+r"(dst_u), // %1
941 "+r"(dst_v), // %2
942 "+rm"(width) // %3
943 : "r"(static_cast<intptr_t>(src_stride_bgra))
944 : "memory", "cc"
945#if defined(__SSE2__)
946 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
947#endif
948 );
949}
950
951void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
952 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000953 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000954 "movdqa %0,%%xmm4 \n"
955 "movdqa %1,%%xmm3 \n"
956 "movdqa %2,%%xmm5 \n"
957 :
958 : "m"(kBGRAToU), // %0
959 "m"(kBGRAToV), // %1
960 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000961 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000962 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000963 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000964 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000965 "1: \n"
966 "movdqu (%0),%%xmm0 \n"
967 "movdqu 0x10(%0),%%xmm1 \n"
968 "movdqu 0x20(%0),%%xmm2 \n"
969 "movdqu 0x30(%0),%%xmm6 \n"
970 "movdqu (%0,%4,1),%%xmm7 \n"
971 "pavgb %%xmm7,%%xmm0 \n"
972 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
973 "pavgb %%xmm7,%%xmm1 \n"
974 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
975 "pavgb %%xmm7,%%xmm2 \n"
976 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
977 "pavgb %%xmm7,%%xmm6 \n"
978 "lea 0x40(%0),%0 \n"
979 "movdqa %%xmm0,%%xmm7 \n"
980 "shufps $0x88,%%xmm1,%%xmm0 \n"
981 "shufps $0xdd,%%xmm1,%%xmm7 \n"
982 "pavgb %%xmm7,%%xmm0 \n"
983 "movdqa %%xmm2,%%xmm7 \n"
984 "shufps $0x88,%%xmm6,%%xmm2 \n"
985 "shufps $0xdd,%%xmm6,%%xmm7 \n"
986 "pavgb %%xmm7,%%xmm2 \n"
987 "movdqa %%xmm0,%%xmm1 \n"
988 "movdqa %%xmm2,%%xmm6 \n"
989 "pmaddubsw %%xmm4,%%xmm0 \n"
990 "pmaddubsw %%xmm4,%%xmm2 \n"
991 "pmaddubsw %%xmm3,%%xmm1 \n"
992 "pmaddubsw %%xmm3,%%xmm6 \n"
993 "phaddw %%xmm2,%%xmm0 \n"
994 "phaddw %%xmm6,%%xmm1 \n"
995 "psraw $0x8,%%xmm0 \n"
996 "psraw $0x8,%%xmm1 \n"
997 "packsswb %%xmm1,%%xmm0 \n"
998 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000999 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001000 "movlps %%xmm0,(%1) \n"
1001 "movhps %%xmm0,(%1,%2,1) \n"
1002 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001003 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001004 : "+r"(src_bgra0), // %0
1005 "+r"(dst_u), // %1
1006 "+r"(dst_v), // %2
1007 "+rm"(width) // %3
1008 : "r"(static_cast<intptr_t>(src_stride_bgra))
1009 : "memory", "cc"
1010#if defined(__SSE2__)
1011 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1012#endif
1013 );
1014}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001015
1016void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001017 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001018 "movdqa %4,%%xmm5 \n"
1019 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001020 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001021 "1: \n"
1022 "movdqa (%0),%%xmm0 \n"
1023 "movdqa 0x10(%0),%%xmm1 \n"
1024 "movdqa 0x20(%0),%%xmm2 \n"
1025 "movdqa 0x30(%0),%%xmm3 \n"
1026 "pmaddubsw %%xmm4,%%xmm0 \n"
1027 "pmaddubsw %%xmm4,%%xmm1 \n"
1028 "pmaddubsw %%xmm4,%%xmm2 \n"
1029 "pmaddubsw %%xmm4,%%xmm3 \n"
1030 "lea 0x40(%0),%0 \n"
1031 "phaddw %%xmm1,%%xmm0 \n"
1032 "phaddw %%xmm3,%%xmm2 \n"
1033 "psrlw $0x7,%%xmm0 \n"
1034 "psrlw $0x7,%%xmm2 \n"
1035 "packuswb %%xmm2,%%xmm0 \n"
1036 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001037 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001038 "movdqa %%xmm0,(%1) \n"
1039 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001040 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001041 : "+r"(src_abgr), // %0
1042 "+r"(dst_y), // %1
1043 "+r"(pix) // %2
1044 : "m"(kABGRToY), // %3
1045 "m"(kAddY16) // %4
1046 : "memory", "cc"
1047#if defined(__SSE2__)
1048 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1049#endif
1050 );
1051}
1052
1053void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001054 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001055 "movdqa %4,%%xmm5 \n"
1056 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001057 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001058 "1: \n"
1059 "movdqu (%0),%%xmm0 \n"
1060 "movdqu 0x10(%0),%%xmm1 \n"
1061 "movdqu 0x20(%0),%%xmm2 \n"
1062 "movdqu 0x30(%0),%%xmm3 \n"
1063 "pmaddubsw %%xmm4,%%xmm0 \n"
1064 "pmaddubsw %%xmm4,%%xmm1 \n"
1065 "pmaddubsw %%xmm4,%%xmm2 \n"
1066 "pmaddubsw %%xmm4,%%xmm3 \n"
1067 "lea 0x40(%0),%0 \n"
1068 "phaddw %%xmm1,%%xmm0 \n"
1069 "phaddw %%xmm3,%%xmm2 \n"
1070 "psrlw $0x7,%%xmm0 \n"
1071 "psrlw $0x7,%%xmm2 \n"
1072 "packuswb %%xmm2,%%xmm0 \n"
1073 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001074 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001075 "movdqu %%xmm0,(%1) \n"
1076 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001077 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001078 : "+r"(src_abgr), // %0
1079 "+r"(dst_y), // %1
1080 "+r"(pix) // %2
1081 : "m"(kABGRToY), // %3
1082 "m"(kAddY16) // %4
1083 : "memory", "cc"
1084#if defined(__SSE2__)
1085 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1086#endif
1087 );
1088}
1089
1090void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1091 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001092 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001093 "movdqa %0,%%xmm4 \n"
1094 "movdqa %1,%%xmm3 \n"
1095 "movdqa %2,%%xmm5 \n"
1096 :
1097 : "m"(kABGRToU), // %0
1098 "m"(kABGRToV), // %1
1099 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001100 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001101 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001102 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001103 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001104 "1: \n"
1105 "movdqa (%0),%%xmm0 \n"
1106 "movdqa 0x10(%0),%%xmm1 \n"
1107 "movdqa 0x20(%0),%%xmm2 \n"
1108 "movdqa 0x30(%0),%%xmm6 \n"
1109 "pavgb (%0,%4,1),%%xmm0 \n"
1110 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1111 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1112 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1113 "lea 0x40(%0),%0 \n"
1114 "movdqa %%xmm0,%%xmm7 \n"
1115 "shufps $0x88,%%xmm1,%%xmm0 \n"
1116 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1117 "pavgb %%xmm7,%%xmm0 \n"
1118 "movdqa %%xmm2,%%xmm7 \n"
1119 "shufps $0x88,%%xmm6,%%xmm2 \n"
1120 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1121 "pavgb %%xmm7,%%xmm2 \n"
1122 "movdqa %%xmm0,%%xmm1 \n"
1123 "movdqa %%xmm2,%%xmm6 \n"
1124 "pmaddubsw %%xmm4,%%xmm0 \n"
1125 "pmaddubsw %%xmm4,%%xmm2 \n"
1126 "pmaddubsw %%xmm3,%%xmm1 \n"
1127 "pmaddubsw %%xmm3,%%xmm6 \n"
1128 "phaddw %%xmm2,%%xmm0 \n"
1129 "phaddw %%xmm6,%%xmm1 \n"
1130 "psraw $0x8,%%xmm0 \n"
1131 "psraw $0x8,%%xmm1 \n"
1132 "packsswb %%xmm1,%%xmm0 \n"
1133 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001134 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001135 "movlps %%xmm0,(%1) \n"
1136 "movhps %%xmm0,(%1,%2,1) \n"
1137 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001138 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001139 : "+r"(src_abgr0), // %0
1140 "+r"(dst_u), // %1
1141 "+r"(dst_v), // %2
1142 "+rm"(width) // %3
1143 : "r"(static_cast<intptr_t>(src_stride_abgr))
1144 : "memory", "cc"
1145#if defined(__SSE2__)
1146 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1147#endif
1148 );
1149}
1150
1151void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1152 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001153 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001154 "movdqa %0,%%xmm4 \n"
1155 "movdqa %1,%%xmm3 \n"
1156 "movdqa %2,%%xmm5 \n"
1157 :
1158 : "m"(kABGRToU), // %0
1159 "m"(kABGRToV), // %1
1160 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001161 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001162 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001163 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001164 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001165 "1: \n"
1166 "movdqu (%0),%%xmm0 \n"
1167 "movdqu 0x10(%0),%%xmm1 \n"
1168 "movdqu 0x20(%0),%%xmm2 \n"
1169 "movdqu 0x30(%0),%%xmm6 \n"
1170 "movdqu (%0,%4,1),%%xmm7 \n"
1171 "pavgb %%xmm7,%%xmm0 \n"
1172 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1173 "pavgb %%xmm7,%%xmm1 \n"
1174 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1175 "pavgb %%xmm7,%%xmm2 \n"
1176 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1177 "pavgb %%xmm7,%%xmm6 \n"
1178 "lea 0x40(%0),%0 \n"
1179 "movdqa %%xmm0,%%xmm7 \n"
1180 "shufps $0x88,%%xmm1,%%xmm0 \n"
1181 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1182 "pavgb %%xmm7,%%xmm0 \n"
1183 "movdqa %%xmm2,%%xmm7 \n"
1184 "shufps $0x88,%%xmm6,%%xmm2 \n"
1185 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1186 "pavgb %%xmm7,%%xmm2 \n"
1187 "movdqa %%xmm0,%%xmm1 \n"
1188 "movdqa %%xmm2,%%xmm6 \n"
1189 "pmaddubsw %%xmm4,%%xmm0 \n"
1190 "pmaddubsw %%xmm4,%%xmm2 \n"
1191 "pmaddubsw %%xmm3,%%xmm1 \n"
1192 "pmaddubsw %%xmm3,%%xmm6 \n"
1193 "phaddw %%xmm2,%%xmm0 \n"
1194 "phaddw %%xmm6,%%xmm1 \n"
1195 "psraw $0x8,%%xmm0 \n"
1196 "psraw $0x8,%%xmm1 \n"
1197 "packsswb %%xmm1,%%xmm0 \n"
1198 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001199 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001200 "movlps %%xmm0,(%1) \n"
1201 "movhps %%xmm0,(%1,%2,1) \n"
1202 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001203 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001204 : "+r"(src_abgr0), // %0
1205 "+r"(dst_u), // %1
1206 "+r"(dst_v), // %2
1207 "+rm"(width) // %3
1208 : "r"(static_cast<intptr_t>(src_stride_abgr))
1209 : "memory", "cc"
1210#if defined(__SSE2__)
1211 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1212#endif
1213 );
1214}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001215#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001216
fbarchard@google.come214fe32012-06-04 23:47:11 +00001217#ifdef HAS_I422TOARGBROW_SSSE3
// Fixed-point (6 fractional bits) YUV to RGB coefficients. Each value must fit
// in a signed byte because it is stored in a vec8 table.
// Every expansion is parenthesized so the macros stay correct inside larger
// expressions such as BB * 2 or -BG.
#define UB (127) /* 2.018 * 64 = 129, clamped to the int8 maximum of 127 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR (0)

#define VB (0)
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR (102) /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: what the U and V products contribute when both chroma bytes are 128.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG (74) /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001232
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001233struct {
1234 vec8 kUVToB;
1235 vec8 kUVToG;
1236 vec8 kUVToR;
1237 vec16 kUVBiasB;
1238 vec16 kUVBiasG;
1239 vec16 kUVBiasR;
1240 vec16 kYSub16;
1241 vec16 kYToRgb;
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001242} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001243 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1244 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1245 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1246 { BB, BB, BB, BB, BB, BB, BB, BB },
1247 { BG, BG, BG, BG, BG, BG, BG, BG },
1248 { BR, BR, BR, BR, BR, BR, BR, BR },
1249 { 16, 16, 16, 16, 16, 16, 16, 16 },
1250 { YG, YG, YG, YG, YG, YG, YG, YG }
1251};
1252
// Read 8 UV from 444: 8 U bytes from (%1) and 8 V bytes from (%1,%2,1),
// interleaved into xmm0 as U,V byte pairs. Advances %1 by 8.
// Expects the caller to have turned %2 into the offset from U to V.
// The last line carries no continuation backslash, so the following source
// line can never be spliced into the macro.
#define READYUV444                                                             \
    "movq       (%1),%%xmm0                     \n"                            \
    "movq       (%1,%2,1),%%xmm1                \n"                            \
    "lea        0x8(%1),%1                      \n"                            \
    "punpcklbw  %%xmm1,%%xmm0                   \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001259
// Read 4 UV from 422, upsample to 8 UV: 4 U bytes from (%1) and 4 V bytes
// from (%1,%2,1) are interleaved into U,V pairs and each pair is duplicated.
// Advances %1 by 4.
// Expects the caller to have turned %2 into the offset from U to V.
// The last line carries no continuation backslash, so the following source
// line can never be spliced into the macro.
#define READYUV422                                                             \
    "movd       (%1),%%xmm0                     \n"                            \
    "movd       (%1,%2,1),%%xmm1                \n"                            \
    "lea        0x4(%1),%1                      \n"                            \
    "punpcklbw  %%xmm1,%%xmm0                   \n"                            \
    "punpcklwd  %%xmm0,%%xmm0                   \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001267
// Read 2 UV from 411, upsample to 8 UV: 2 U bytes from (%1) and 2 V bytes
// from (%1,%2,1) are interleaved into U,V pairs and each pair is repeated
// four times. Advances %1 by 2.
// Expects the caller to have turned %2 into the offset from U to V.
//
// pinsrw loads exactly the 16 bits consumed per pass. The previous movd
// loaded 32 bits, which on the final pass reads two bytes beyond the end of
// each chroma row. The stale upper lanes left in xmm0/xmm1 never reach the
// result: the three unpacks only propagate bytes 0-1 of each register.
// The last line carries no continuation backslash, so the following source
// line can never be spliced into the macro.
#define READYUV411                                                             \
    "pinsrw     $0x0,(%1),%%xmm0                \n"                            \
    "pinsrw     $0x0,(%1,%2,1),%%xmm1           \n"                            \
    "lea        0x2(%1),%1                      \n"                            \
    "punpcklbw  %%xmm1,%%xmm0                   \n"                            \
    "punpcklwd  %%xmm0,%%xmm0                   \n"                            \
    "punpckldq  %%xmm0,%%xmm0                   \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001276
// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 interleaved U,V byte pairs, %0 = Y pointer, %5 = base of the
//      constant table, xmm4 = zero (used to widen Y bytes to words).
// Out: low 8 bytes of xmm0, xmm1, xmm2 hold the results of the first, second
//      and third weight rows of the table (B, G, R per the table's field
//      names). %0 is advanced by 8. xmm3 is scratch.
#define YUVTORGB                                                               \
    /* One copy of the UV pairs per colour channel. */                         \
    "movdqa     %%xmm0,%%xmm1                   \n"                            \
    "movdqa     %%xmm0,%%xmm2                   \n"                            \
    /* Chroma contribution: weights at 0/16/32, biases at 48/64/80. */         \
    "pmaddubsw  (%5),%%xmm0                     \n"                            \
    "pmaddubsw  16(%5),%%xmm1                   \n"                            \
    "pmaddubsw  32(%5),%%xmm2                   \n"                            \
    "psubw      48(%5),%%xmm0                   \n"                            \
    "psubw      64(%5),%%xmm1                   \n"                            \
    "psubw      80(%5),%%xmm2                   \n"                            \
    /* Luma contribution: widen 8 Y bytes, subtract 96(%5), scale by 112(%5). */ \
    "movq       (%0),%%xmm3                     \n"                            \
    "lea        0x8(%0),%0                      \n"                            \
    "punpcklbw  %%xmm4,%%xmm3                   \n"                            \
    "psubsw     96(%5),%%xmm3                   \n"                            \
    "pmullw     112(%5),%%xmm3                  \n"                            \
    /* Combine, drop the 6 fractional bits, saturate to unsigned bytes. */     \
    "paddsw     %%xmm3,%%xmm0                   \n"                            \
    "paddsw     %%xmm3,%%xmm1                   \n"                            \
    "paddsw     %%xmm3,%%xmm2                   \n"                            \
    "psraw      $0x6,%%xmm0                     \n"                            \
    "psraw      $0x6,%%xmm1                     \n"                            \
    "psraw      $0x6,%%xmm2                     \n"                            \
    "packuswb   %%xmm0,%%xmm0                   \n"                            \
    "packuswb   %%xmm1,%%xmm1                   \n"                            \
    "packuswb   %%xmm2,%%xmm2                   \n"
1301
1302void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001303 const uint8* u_buf,
1304 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001305 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001306 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001307 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001308 "sub %1,%2 \n"
1309 "pcmpeqb %%xmm5,%%xmm5 \n"
1310 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001311 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001312 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001313 READYUV444
1314 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001315 "punpcklbw %%xmm1,%%xmm0 \n"
1316 "punpcklbw %%xmm5,%%xmm2 \n"
1317 "movdqa %%xmm0,%%xmm1 \n"
1318 "punpcklwd %%xmm2,%%xmm0 \n"
1319 "punpckhwd %%xmm2,%%xmm1 \n"
1320 "movdqa %%xmm0,(%3) \n"
1321 "movdqa %%xmm1,0x10(%3) \n"
1322 "lea 0x20(%3),%3 \n"
1323 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001324 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001325 : "+r"(y_buf), // %0
1326 "+r"(u_buf), // %1
1327 "+r"(v_buf), // %2
fbarchard@google.come214fe32012-06-04 23:47:11 +00001328 "+r"(argb_buf), // %3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001329 "+rm"(width) // %4
1330 : "r"(&kYuvConstants.kUVToB) // %5
1331 : "memory", "cc"
1332#if defined(__SSE2__)
1333 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1334#endif
1335 );
1336}
1337
fbarchard@google.come214fe32012-06-04 23:47:11 +00001338void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001339 const uint8* u_buf,
1340 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001341 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001342 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001343 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001344 "sub %1,%2 \n"
1345 "pcmpeqb %%xmm5,%%xmm5 \n"
1346 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001347 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001348 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001349 READYUV422
1350 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001351 "punpcklbw %%xmm1,%%xmm0 \n"
1352 "punpcklbw %%xmm5,%%xmm2 \n"
1353 "movdqa %%xmm0,%%xmm1 \n"
1354 "punpcklwd %%xmm2,%%xmm0 \n"
1355 "punpckhwd %%xmm2,%%xmm1 \n"
1356 "movdqa %%xmm0,(%3) \n"
1357 "movdqa %%xmm1,0x10(%3) \n"
1358 "lea 0x20(%3),%3 \n"
1359 "sub $0x8,%4 \n"
1360 "jg 1b \n"
1361 : "+r"(y_buf), // %0
1362 "+r"(u_buf), // %1
1363 "+r"(v_buf), // %2
1364 "+r"(argb_buf), // %3
1365 "+rm"(width) // %4
1366 : "r"(&kYuvConstants.kUVToB) // %5
1367 : "memory", "cc"
1368#if defined(__SSE2__)
1369 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1370#endif
1371 );
1372}
1373
1374void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1375 const uint8* u_buf,
1376 const uint8* v_buf,
1377 uint8* argb_buf,
1378 int width) {
1379 asm volatile (
1380 "sub %1,%2 \n"
1381 "pcmpeqb %%xmm5,%%xmm5 \n"
1382 "pxor %%xmm4,%%xmm4 \n"
1383 ".p2align 4 \n"
1384 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001385 READYUV411
1386 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001387 "punpcklbw %%xmm1,%%xmm0 \n"
1388 "punpcklbw %%xmm5,%%xmm2 \n"
1389 "movdqa %%xmm0,%%xmm1 \n"
1390 "punpcklwd %%xmm2,%%xmm0 \n"
1391 "punpckhwd %%xmm2,%%xmm1 \n"
1392 "movdqa %%xmm0,(%3) \n"
1393 "movdqa %%xmm1,0x10(%3) \n"
1394 "lea 0x20(%3),%3 \n"
1395 "sub $0x8,%4 \n"
1396 "jg 1b \n"
1397 : "+r"(y_buf), // %0
1398 "+r"(u_buf), // %1
1399 "+r"(v_buf), // %2
1400 "+r"(argb_buf), // %3
1401 "+rm"(width) // %4
1402 : "r"(&kYuvConstants.kUVToB) // %5
1403 : "memory", "cc"
1404#if defined(__SSE2__)
1405 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1406#endif
1407 );
1408}
1409
1410void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1411 const uint8* u_buf,
1412 const uint8* v_buf,
1413 uint8* argb_buf,
1414 int width) {
1415 asm volatile (
1416 "sub %1,%2 \n"
1417 "pcmpeqb %%xmm5,%%xmm5 \n"
1418 "pxor %%xmm4,%%xmm4 \n"
1419 ".p2align 4 \n"
1420 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001421 READYUV444
1422 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001423 "punpcklbw %%xmm1,%%xmm0 \n"
1424 "punpcklbw %%xmm5,%%xmm2 \n"
1425 "movdqa %%xmm0,%%xmm1 \n"
1426 "punpcklwd %%xmm2,%%xmm0 \n"
1427 "punpckhwd %%xmm2,%%xmm1 \n"
1428 "movdqu %%xmm0,(%3) \n"
1429 "movdqu %%xmm1,0x10(%3) \n"
1430 "lea 0x20(%3),%3 \n"
1431 "sub $0x8,%4 \n"
1432 "jg 1b \n"
1433 : "+r"(y_buf), // %0
1434 "+r"(u_buf), // %1
1435 "+r"(v_buf), // %2
1436 "+r"(argb_buf), // %3
1437 "+rm"(width) // %4
1438 : "r"(&kYuvConstants.kUVToB) // %5
1439 : "memory", "cc"
1440#if defined(__SSE2__)
1441 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1442#endif
1443 );
1444}
1445
1446void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1447 const uint8* u_buf,
1448 const uint8* v_buf,
1449 uint8* argb_buf,
1450 int width) {
1451 asm volatile (
1452 "sub %1,%2 \n"
1453 "pcmpeqb %%xmm5,%%xmm5 \n"
1454 "pxor %%xmm4,%%xmm4 \n"
1455 ".p2align 4 \n"
1456 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001457 READYUV422
1458 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001459 "punpcklbw %%xmm1,%%xmm0 \n"
1460 "punpcklbw %%xmm5,%%xmm2 \n"
1461 "movdqa %%xmm0,%%xmm1 \n"
1462 "punpcklwd %%xmm2,%%xmm0 \n"
1463 "punpckhwd %%xmm2,%%xmm1 \n"
1464 "movdqu %%xmm0,(%3) \n"
1465 "movdqu %%xmm1,0x10(%3) \n"
1466 "lea 0x20(%3),%3 \n"
1467 "sub $0x8,%4 \n"
1468 "jg 1b \n"
1469 : "+r"(y_buf), // %0
1470 "+r"(u_buf), // %1
1471 "+r"(v_buf), // %2
1472 "+r"(argb_buf), // %3
1473 "+rm"(width) // %4
1474 : "r"(&kYuvConstants.kUVToB) // %5
1475 : "memory", "cc"
1476#if defined(__SSE2__)
1477 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1478#endif
1479 );
1480}
1481
1482void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1483 const uint8* u_buf,
1484 const uint8* v_buf,
1485 uint8* argb_buf,
1486 int width) {
1487 asm volatile (
1488 "sub %1,%2 \n"
1489 "pcmpeqb %%xmm5,%%xmm5 \n"
1490 "pxor %%xmm4,%%xmm4 \n"
1491 ".p2align 4 \n"
1492 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001493 READYUV411
1494 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001495 "punpcklbw %%xmm1,%%xmm0 \n"
1496 "punpcklbw %%xmm5,%%xmm2 \n"
1497 "movdqa %%xmm0,%%xmm1 \n"
1498 "punpcklwd %%xmm2,%%xmm0 \n"
1499 "punpckhwd %%xmm2,%%xmm1 \n"
1500 "movdqu %%xmm0,(%3) \n"
1501 "movdqu %%xmm1,0x10(%3) \n"
1502 "lea 0x20(%3),%3 \n"
1503 "sub $0x8,%4 \n"
1504 "jg 1b \n"
1505 : "+r"(y_buf), // %0
1506 "+r"(u_buf), // %1
1507 "+r"(v_buf), // %2
1508 "+r"(argb_buf), // %3
1509 "+rm"(width) // %4
1510 : "r"(&kYuvConstants.kUVToB) // %5
1511 : "memory", "cc"
1512#if defined(__SSE2__)
1513 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1514#endif
1515 );
1516}
1517
1518void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1519 const uint8* u_buf,
1520 const uint8* v_buf,
1521 uint8* bgra_buf,
1522 int width) {
1523 asm volatile (
1524 "sub %1,%2 \n"
1525 "pcmpeqb %%xmm5,%%xmm5 \n"
1526 "pxor %%xmm4,%%xmm4 \n"
1527 ".p2align 4 \n"
1528 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001529 READYUV422
1530 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001531 "pcmpeqb %%xmm5,%%xmm5 \n"
1532 "punpcklbw %%xmm0,%%xmm1 \n"
1533 "punpcklbw %%xmm2,%%xmm5 \n"
1534 "movdqa %%xmm5,%%xmm0 \n"
1535 "punpcklwd %%xmm1,%%xmm5 \n"
1536 "punpckhwd %%xmm1,%%xmm0 \n"
1537 "movdqa %%xmm5,(%3) \n"
1538 "movdqa %%xmm0,0x10(%3) \n"
1539 "lea 0x20(%3),%3 \n"
1540 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001541 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001542 : "+r"(y_buf), // %0
1543 "+r"(u_buf), // %1
1544 "+r"(v_buf), // %2
fbarchard@google.come214fe32012-06-04 23:47:11 +00001545 "+r"(bgra_buf), // %3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001546 "+rm"(width) // %4
1547 : "r"(&kYuvConstants.kUVToB) // %5
1548 : "memory", "cc"
1549#if defined(__SSE2__)
1550 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1551#endif
1552 );
1553}
1554
fbarchard@google.come214fe32012-06-04 23:47:11 +00001555void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001556 const uint8* u_buf,
1557 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001558 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001559 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001560 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001561 "sub %1,%2 \n"
1562 "pcmpeqb %%xmm5,%%xmm5 \n"
1563 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001564 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001565 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001566 READYUV422
1567 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001568 "punpcklbw %%xmm1,%%xmm2 \n"
1569 "punpcklbw %%xmm5,%%xmm0 \n"
1570 "movdqa %%xmm2,%%xmm1 \n"
1571 "punpcklwd %%xmm0,%%xmm2 \n"
1572 "punpckhwd %%xmm0,%%xmm1 \n"
1573 "movdqa %%xmm2,(%3) \n"
1574 "movdqa %%xmm1,0x10(%3) \n"
1575 "lea 0x20(%3),%3 \n"
1576 "sub $0x8,%4 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001577 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001578 : "+r"(y_buf), // %0
1579 "+r"(u_buf), // %1
1580 "+r"(v_buf), // %2
fbarchard@google.come214fe32012-06-04 23:47:11 +00001581 "+r"(abgr_buf), // %3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001582 "+rm"(width) // %4
1583 : "r"(&kYuvConstants.kUVToB) // %5
1584 : "memory", "cc"
1585#if defined(__SSE2__)
1586 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1587#endif
1588 );
1589}
1590
fbarchard@google.come214fe32012-06-04 23:47:11 +00001591void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001592 const uint8* u_buf,
1593 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001594 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001595 int width) {
1596 asm volatile (
1597 "sub %1,%2 \n"
1598 "pcmpeqb %%xmm5,%%xmm5 \n"
1599 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001600 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001601 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001602 READYUV422
1603 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001604 "pcmpeqb %%xmm5,%%xmm5 \n"
1605 "punpcklbw %%xmm0,%%xmm1 \n"
1606 "punpcklbw %%xmm2,%%xmm5 \n"
1607 "movdqa %%xmm5,%%xmm0 \n"
1608 "punpcklwd %%xmm1,%%xmm5 \n"
1609 "punpckhwd %%xmm1,%%xmm0 \n"
1610 "movdqu %%xmm5,(%3) \n"
1611 "movdqu %%xmm0,0x10(%3) \n"
1612 "lea 0x20(%3),%3 \n"
1613 "sub $0x8,%4 \n"
1614 "jg 1b \n"
1615 : "+r"(y_buf), // %0
1616 "+r"(u_buf), // %1
1617 "+r"(v_buf), // %2
fbarchard@google.come214fe32012-06-04 23:47:11 +00001618 "+r"(bgra_buf), // %3
fbarchard@google.com952a5072012-03-30 18:10:50 +00001619 "+rm"(width) // %4
1620 : "r"(&kYuvConstants.kUVToB) // %5
1621 : "memory", "cc"
1622#if defined(__SSE2__)
1623 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1624#endif
1625 );
1626}
1627
fbarchard@google.come214fe32012-06-04 23:47:11 +00001628void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001629 const uint8* u_buf,
1630 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001631 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001632 int width) {
1633 asm volatile (
1634 "sub %1,%2 \n"
1635 "pcmpeqb %%xmm5,%%xmm5 \n"
1636 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001637 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001638 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001639 READYUV422
1640 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00001641 "punpcklbw %%xmm1,%%xmm2 \n"
1642 "punpcklbw %%xmm5,%%xmm0 \n"
1643 "movdqa %%xmm2,%%xmm1 \n"
1644 "punpcklwd %%xmm0,%%xmm2 \n"
1645 "punpckhwd %%xmm0,%%xmm1 \n"
1646 "movdqu %%xmm2,(%3) \n"
1647 "movdqu %%xmm1,0x10(%3) \n"
1648 "lea 0x20(%3),%3 \n"
1649 "sub $0x8,%4 \n"
1650 "jg 1b \n"
1651 : "+r"(y_buf), // %0
1652 "+r"(u_buf), // %1
1653 "+r"(v_buf), // %2
fbarchard@google.come214fe32012-06-04 23:47:11 +00001654 "+r"(abgr_buf), // %3
fbarchard@google.com952a5072012-03-30 18:10:50 +00001655 "+rm"(width) // %4
1656 : "r"(&kYuvConstants.kUVToB) // %5
1657 : "memory", "cc"
1658#if defined(__SSE2__)
1659 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1660#endif
1661 );
1662}
1663
fbarchard@google.come214fe32012-06-04 23:47:11 +00001664#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001665
1666#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001667void YToARGBRow_SSE2(const uint8* y_buf,
1668 uint8* rgb_buf,
1669 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001670 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001671 "pcmpeqb %%xmm4,%%xmm4 \n"
1672 "pslld $0x18,%%xmm4 \n"
1673 "mov $0x10001000,%%eax \n"
1674 "movd %%eax,%%xmm3 \n"
1675 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1676 "mov $0x012a012a,%%eax \n"
1677 "movd %%eax,%%xmm2 \n"
1678 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001679 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001680 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001681 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001682 "movq (%0),%%xmm0 \n"
1683 "lea 0x8(%0),%0 \n"
1684 "punpcklbw %%xmm0,%%xmm0 \n"
1685 "psubusw %%xmm3,%%xmm0 \n"
1686 "pmulhuw %%xmm2,%%xmm0 \n"
1687 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001688
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001689 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001690 "punpcklbw %%xmm0,%%xmm0 \n"
1691 "movdqa %%xmm0,%%xmm1 \n"
1692 "punpcklwd %%xmm0,%%xmm0 \n"
1693 "punpckhwd %%xmm1,%%xmm1 \n"
1694 "por %%xmm4,%%xmm0 \n"
1695 "por %%xmm4,%%xmm1 \n"
1696 "movdqa %%xmm0,(%1) \n"
1697 "movdqa %%xmm1,16(%1) \n"
1698 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001699
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001700 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001701 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00001702 : "+r"(y_buf), // %0
1703 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00001704 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001705 :
1706 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001707#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00001708 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001709#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001710 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00001711}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001712#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001713
fbarchard@google.com42831e02012-01-21 02:54:17 +00001714#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001715// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00001716CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001717 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1718};
1719
fbarchard@google.com42831e02012-01-21 02:54:17 +00001720void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00001721 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001722 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001723 "movdqa %3,%%xmm5 \n"
1724 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001725 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001726 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001727 "movdqa (%0,%2),%%xmm0 \n"
1728 "pshufb %%xmm5,%%xmm0 \n"
1729 "sub $0x10,%2 \n"
1730 "movdqa %%xmm0,(%1) \n"
1731 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001732 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00001733 : "+r"(src), // %0
1734 "+r"(dst), // %1
1735 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001736 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001737 : "memory", "cc"
1738#if defined(__SSE2__)
1739 , "xmm0", "xmm5"
1740#endif
1741 );
1742}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001743#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00001744
fbarchard@google.com42831e02012-01-21 02:54:17 +00001745#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00001746void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001747 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001748 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001749 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001750 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001751 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001752 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001753 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001754 "psllw $0x8,%%xmm0 \n"
1755 "psrlw $0x8,%%xmm1 \n"
1756 "por %%xmm1,%%xmm0 \n"
1757 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
1758 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
1759 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
1760 "sub $0x10,%2 \n"
1761 "movdqu %%xmm0,(%1) \n"
1762 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001763 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001764 : "+r"(src), // %0
1765 "+r"(dst), // %1
1766 "+r"(temp_width) // %2
1767 :
1768 : "memory", "cc"
1769#if defined(__SSE2__)
1770 , "xmm0", "xmm1"
1771#endif
1772 );
1773}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001774#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001775
fbarchard@google.com16a96642012-03-02 22:38:09 +00001776#ifdef HAS_MIRRORROW_UV_SSSE3
1777// Shuffle table for reversing the bytes of UV channels.
1778CONST uvec8 kShuffleMirrorUV = {
1779 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
1780};
1781void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
1782 int width) {
1783 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001784 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00001785 "movdqa %4,%%xmm1 \n"
1786 "lea -16(%0,%3,2),%0 \n"
1787 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001788 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001789 "1: \n"
1790 "movdqa (%0),%%xmm0 \n"
1791 "lea -16(%0),%0 \n"
1792 "pshufb %%xmm1,%%xmm0 \n"
1793 "sub $8,%3 \n"
1794 "movlpd %%xmm0,(%1) \n"
1795 "movhpd %%xmm0,(%1,%2) \n"
1796 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001797 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00001798 : "+r"(src), // %0
1799 "+r"(dst_u), // %1
1800 "+r"(dst_v), // %2
1801 "+r"(temp_width) // %3
1802 : "m"(kShuffleMirrorUV) // %4
1803 : "memory", "cc"
1804#if defined(__SSE2__)
1805 , "xmm0", "xmm1"
1806#endif
1807 );
1808}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001809#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00001810
fbarchard@google.com55663022012-04-26 00:01:41 +00001811#ifdef HAS_ADDROW_SSE2
1812// dst and width aligned to 16
1813void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
1814 asm volatile (
1815 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001816 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001817 "1: \n"
1818 "movdqu (%0),%%xmm2 \n"
1819 "lea 0x10(%0),%0 \n"
1820 "movdqa (%1),%%xmm0 \n"
1821 "movdqa 0x10(%1),%%xmm1 \n"
1822 "movdqa %%xmm2,%%xmm3 \n"
1823 "punpcklbw %%xmm4,%%xmm2 \n"
1824 "punpckhbw %%xmm4,%%xmm3 \n"
1825 "paddusw %%xmm2,%%xmm0 \n"
1826 "paddusw %%xmm3,%%xmm1 \n"
1827 "sub $0x10,%2 \n"
1828 "movdqa %%xmm0,(%1) \n"
1829 "movdqa %%xmm1,0x10(%1) \n"
1830 "lea 0x20(%1),%1 \n"
1831 "jg 1b \n"
1832 : "+r"(src), // %0
1833 "+r"(dst), // %1
1834 "+r"(width) // %2
1835 :
1836 : "memory", "cc"
1837#if defined(__SSE2__)
1838 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1839#endif
1840 );
1841}
1842
1843// dst and width aligned to 16
1844void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
1845 asm volatile (
1846 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001847 ".p2align 4 \n"
fbarchard@google.com55663022012-04-26 00:01:41 +00001848 "1: \n"
1849 "movdqu (%0),%%xmm2 \n"
1850 "lea 0x10(%0),%0 \n"
1851 "movdqa (%1),%%xmm0 \n"
1852 "movdqa 0x10(%1),%%xmm1 \n"
1853 "movdqa %%xmm2,%%xmm3 \n"
1854 "punpcklbw %%xmm4,%%xmm2 \n"
1855 "punpckhbw %%xmm4,%%xmm3 \n"
1856 "psubusw %%xmm2,%%xmm0 \n"
1857 "psubusw %%xmm3,%%xmm1 \n"
1858 "sub $0x10,%2 \n"
1859 "movdqa %%xmm0,(%1) \n"
1860 "movdqa %%xmm1,0x10(%1) \n"
1861 "lea 0x20(%1),%1 \n"
1862 "jg 1b \n"
1863 : "+r"(src), // %0
1864 "+r"(dst), // %1
1865 "+r"(width) // %2
1866 :
1867 : "memory", "cc"
1868#if defined(__SSE2__)
1869 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1870#endif
1871 );
1872}
1873#endif // HAS_ADDROW_SSE2
1874
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001875#ifdef HAS_SPLITUV_SSE2
1876void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001877 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001878 "pcmpeqb %%xmm5,%%xmm5 \n"
1879 "psrlw $0x8,%%xmm5 \n"
1880 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001881 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001882 "1: \n"
1883 "movdqa (%0),%%xmm0 \n"
1884 "movdqa 0x10(%0),%%xmm1 \n"
1885 "lea 0x20(%0),%0 \n"
1886 "movdqa %%xmm0,%%xmm2 \n"
1887 "movdqa %%xmm1,%%xmm3 \n"
1888 "pand %%xmm5,%%xmm0 \n"
1889 "pand %%xmm5,%%xmm1 \n"
1890 "packuswb %%xmm1,%%xmm0 \n"
1891 "psrlw $0x8,%%xmm2 \n"
1892 "psrlw $0x8,%%xmm3 \n"
1893 "packuswb %%xmm3,%%xmm2 \n"
1894 "movdqa %%xmm0,(%1) \n"
1895 "movdqa %%xmm2,(%1,%2) \n"
1896 "lea 0x10(%1),%1 \n"
1897 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001898 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001899 : "+r"(src_uv), // %0
1900 "+r"(dst_u), // %1
1901 "+r"(dst_v), // %2
1902 "+r"(pix) // %3
1903 :
1904 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001905#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001906 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001907#endif
1908 );
1909}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001910#endif // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00001911
fbarchard@google.com19932f82012-02-16 22:19:14 +00001912#ifdef HAS_COPYROW_SSE2
1913void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001914 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001915 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00001916 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001917 "1: \n"
1918 "movdqa (%0),%%xmm0 \n"
1919 "movdqa 0x10(%0),%%xmm1 \n"
1920 "movdqa %%xmm0,(%0,%1) \n"
1921 "movdqa %%xmm1,0x10(%0,%1) \n"
1922 "lea 0x20(%0),%0 \n"
1923 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001924 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00001925 : "+r"(src), // %0
1926 "+r"(dst), // %1
1927 "+r"(count) // %2
1928 :
1929 : "memory", "cc"
1930#if defined(__SSE2__)
1931 , "xmm0", "xmm1"
1932#endif
1933 );
1934}
1935#endif // HAS_COPYROW_SSE2
1936
1937#ifdef HAS_COPYROW_X86
1938void CopyRow_X86(const uint8* src, uint8* dst, int width) {
1939 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001940 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00001941 "shr $0x2,%2 \n"
1942 "rep movsl \n"
1943 : "+S"(src), // %0
1944 "+D"(dst), // %1
1945 "+c"(width_tmp) // %2
1946 :
1947 : "memory", "cc"
1948 );
1949}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001950#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00001951
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00001952#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001953void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001954 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001955 "pcmpeqb %%xmm5,%%xmm5 \n"
1956 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001957 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001958 "1: \n"
1959 "movdqa (%0),%%xmm0 \n"
1960 "movdqa 0x10(%0),%%xmm1 \n"
1961 "lea 0x20(%0),%0 \n"
1962 "pand %%xmm5,%%xmm0 \n"
1963 "pand %%xmm5,%%xmm1 \n"
1964 "packuswb %%xmm1,%%xmm0 \n"
1965 "movdqa %%xmm0,(%1) \n"
1966 "lea 0x10(%1),%1 \n"
1967 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001968 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001969 : "+r"(src_yuy2), // %0
1970 "+r"(dst_y), // %1
1971 "+r"(pix) // %2
1972 :
1973 : "memory", "cc"
1974#if defined(__SSE2__)
1975 , "xmm0", "xmm1", "xmm5"
1976#endif
1977 );
1978}
1979
1980void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
1981 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001982 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001983 "pcmpeqb %%xmm5,%%xmm5 \n"
1984 "psrlw $0x8,%%xmm5 \n"
1985 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001986 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001987 "1: \n"
1988 "movdqa (%0),%%xmm0 \n"
1989 "movdqa 0x10(%0),%%xmm1 \n"
1990 "movdqa (%0,%4,1),%%xmm2 \n"
1991 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1992 "lea 0x20(%0),%0 \n"
1993 "pavgb %%xmm2,%%xmm0 \n"
1994 "pavgb %%xmm3,%%xmm1 \n"
1995 "psrlw $0x8,%%xmm0 \n"
1996 "psrlw $0x8,%%xmm1 \n"
1997 "packuswb %%xmm1,%%xmm0 \n"
1998 "movdqa %%xmm0,%%xmm1 \n"
1999 "pand %%xmm5,%%xmm0 \n"
2000 "packuswb %%xmm0,%%xmm0 \n"
2001 "psrlw $0x8,%%xmm1 \n"
2002 "packuswb %%xmm1,%%xmm1 \n"
2003 "movq %%xmm0,(%1) \n"
2004 "movq %%xmm1,(%1,%2) \n"
2005 "lea 0x8(%1),%1 \n"
2006 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002007 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002008 : "+r"(src_yuy2), // %0
2009 "+r"(dst_u), // %1
2010 "+r"(dst_y), // %2
2011 "+r"(pix) // %3
2012 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2013 : "memory", "cc"
2014#if defined(__SSE2__)
2015 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2016#endif
2017 );
2018}
2019
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002020
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002021void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2022 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002023 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002024 "pcmpeqb %%xmm5,%%xmm5 \n"
2025 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002026 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002027 "1: \n"
2028 "movdqu (%0),%%xmm0 \n"
2029 "movdqu 0x10(%0),%%xmm1 \n"
2030 "lea 0x20(%0),%0 \n"
2031 "pand %%xmm5,%%xmm0 \n"
2032 "pand %%xmm5,%%xmm1 \n"
2033 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002034 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002035 "movdqu %%xmm0,(%1) \n"
2036 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002037 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002038 : "+r"(src_yuy2), // %0
2039 "+r"(dst_y), // %1
2040 "+r"(pix) // %2
2041 :
2042 : "memory", "cc"
2043#if defined(__SSE2__)
2044 , "xmm0", "xmm1", "xmm5"
2045#endif
2046 );
2047}
2048
2049void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2050 int stride_yuy2,
2051 uint8* dst_u, uint8* dst_y,
2052 int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002053 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002054 "pcmpeqb %%xmm5,%%xmm5 \n"
2055 "psrlw $0x8,%%xmm5 \n"
2056 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002057 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002058 "1: \n"
2059 "movdqu (%0),%%xmm0 \n"
2060 "movdqu 0x10(%0),%%xmm1 \n"
2061 "movdqu (%0,%4,1),%%xmm2 \n"
2062 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2063 "lea 0x20(%0),%0 \n"
2064 "pavgb %%xmm2,%%xmm0 \n"
2065 "pavgb %%xmm3,%%xmm1 \n"
2066 "psrlw $0x8,%%xmm0 \n"
2067 "psrlw $0x8,%%xmm1 \n"
2068 "packuswb %%xmm1,%%xmm0 \n"
2069 "movdqa %%xmm0,%%xmm1 \n"
2070 "pand %%xmm5,%%xmm0 \n"
2071 "packuswb %%xmm0,%%xmm0 \n"
2072 "psrlw $0x8,%%xmm1 \n"
2073 "packuswb %%xmm1,%%xmm1 \n"
2074 "movq %%xmm0,(%1) \n"
2075 "movq %%xmm1,(%1,%2) \n"
2076 "lea 0x8(%1),%1 \n"
2077 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002078 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002079 : "+r"(src_yuy2), // %0
2080 "+r"(dst_u), // %1
2081 "+r"(dst_y), // %2
2082 "+r"(pix) // %3
2083 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2084 : "memory", "cc"
2085#if defined(__SSE2__)
2086 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2087#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002088 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002089}
2090
2091void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002092 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002093 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002094 "1: \n"
2095 "movdqa (%0),%%xmm0 \n"
2096 "movdqa 0x10(%0),%%xmm1 \n"
2097 "lea 0x20(%0),%0 \n"
2098 "psrlw $0x8,%%xmm0 \n"
2099 "psrlw $0x8,%%xmm1 \n"
2100 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002101 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002102 "movdqa %%xmm0,(%1) \n"
2103 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002104 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002105 : "+r"(src_uyvy), // %0
2106 "+r"(dst_y), // %1
2107 "+r"(pix) // %2
2108 :
2109 : "memory", "cc"
2110#if defined(__SSE2__)
2111 , "xmm0", "xmm1"
2112#endif
2113 );
2114}
2115
2116void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2117 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002118 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002119 "pcmpeqb %%xmm5,%%xmm5 \n"
2120 "psrlw $0x8,%%xmm5 \n"
2121 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002122 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002123 "1: \n"
2124 "movdqa (%0),%%xmm0 \n"
2125 "movdqa 0x10(%0),%%xmm1 \n"
2126 "movdqa (%0,%4,1),%%xmm2 \n"
2127 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2128 "lea 0x20(%0),%0 \n"
2129 "pavgb %%xmm2,%%xmm0 \n"
2130 "pavgb %%xmm3,%%xmm1 \n"
2131 "pand %%xmm5,%%xmm0 \n"
2132 "pand %%xmm5,%%xmm1 \n"
2133 "packuswb %%xmm1,%%xmm0 \n"
2134 "movdqa %%xmm0,%%xmm1 \n"
2135 "pand %%xmm5,%%xmm0 \n"
2136 "packuswb %%xmm0,%%xmm0 \n"
2137 "psrlw $0x8,%%xmm1 \n"
2138 "packuswb %%xmm1,%%xmm1 \n"
2139 "movq %%xmm0,(%1) \n"
2140 "movq %%xmm1,(%1,%2) \n"
2141 "lea 0x8(%1),%1 \n"
2142 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002143 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002144 : "+r"(src_uyvy), // %0
2145 "+r"(dst_u), // %1
2146 "+r"(dst_y), // %2
2147 "+r"(pix) // %3
2148 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2149 : "memory", "cc"
2150#if defined(__SSE2__)
2151 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2152#endif
2153 );
2154}
2155
2156void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2157 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002158 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002159 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002160 "1: \n"
2161 "movdqu (%0),%%xmm0 \n"
2162 "movdqu 0x10(%0),%%xmm1 \n"
2163 "lea 0x20(%0),%0 \n"
2164 "psrlw $0x8,%%xmm0 \n"
2165 "psrlw $0x8,%%xmm1 \n"
2166 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002167 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002168 "movdqu %%xmm0,(%1) \n"
2169 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002170 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002171 : "+r"(src_uyvy), // %0
2172 "+r"(dst_y), // %1
2173 "+r"(pix) // %2
2174 :
2175 : "memory", "cc"
2176#if defined(__SSE2__)
2177 , "xmm0", "xmm1"
2178#endif
2179 );
2180}
2181
2182void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2183 uint8* dst_u, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002184 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002185 "pcmpeqb %%xmm5,%%xmm5 \n"
2186 "psrlw $0x8,%%xmm5 \n"
2187 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002188 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002189 "1: \n"
2190 "movdqu (%0),%%xmm0 \n"
2191 "movdqu 0x10(%0),%%xmm1 \n"
2192 "movdqu (%0,%4,1),%%xmm2 \n"
2193 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2194 "lea 0x20(%0),%0 \n"
2195 "pavgb %%xmm2,%%xmm0 \n"
2196 "pavgb %%xmm3,%%xmm1 \n"
2197 "pand %%xmm5,%%xmm0 \n"
2198 "pand %%xmm5,%%xmm1 \n"
2199 "packuswb %%xmm1,%%xmm0 \n"
2200 "movdqa %%xmm0,%%xmm1 \n"
2201 "pand %%xmm5,%%xmm0 \n"
2202 "packuswb %%xmm0,%%xmm0 \n"
2203 "psrlw $0x8,%%xmm1 \n"
2204 "packuswb %%xmm1,%%xmm1 \n"
2205 "movq %%xmm0,(%1) \n"
2206 "movq %%xmm1,(%1,%2) \n"
2207 "lea 0x8(%1),%1 \n"
2208 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002209 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002210 : "+r"(src_uyvy), // %0
2211 "+r"(dst_u), // %1
2212 "+r"(dst_y), // %2
2213 "+r"(pix) // %3
2214 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2215 : "memory", "cc"
2216#if defined(__SSE2__)
2217 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2218#endif
2219 );
2220}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002221#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002222
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002223#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002224// Blend 8 pixels at a time.
2225// src_argb0 unaligned.
2226// src_argb1 and dst_argb aligned to 16 bytes.
2227// width must be multiple of 4 pixels.
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002228void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.comc757f302012-04-03 00:49:16 +00002229 uint8* dst_argb, int width) {
2230 asm volatile (
2231 "pcmpeqb %%xmm7,%%xmm7 \n"
2232 "psrlw $0xf,%%xmm7 \n"
2233 "pcmpeqb %%xmm6,%%xmm6 \n"
2234 "psrlw $0x8,%%xmm6 \n"
2235 "pcmpeqb %%xmm5,%%xmm5 \n"
2236 "psllw $0x8,%%xmm5 \n"
2237 "pcmpeqb %%xmm4,%%xmm4 \n"
2238 "pslld $0x18,%%xmm4 \n"
2239
2240 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002241 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002242 "1: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002243 "movdqu (%0),%%xmm3 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002244 "movdqa %%xmm3,%%xmm0 \n"
2245 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002246 "movdqu (%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002247 "psrlw $0x8,%%xmm3 \n"
2248 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2249 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2250 "pand %%xmm6,%%xmm2 \n"
2251 "paddw %%xmm7,%%xmm3 \n"
2252 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002253 "movdqu (%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002254 "psrlw $0x8,%%xmm1 \n"
2255 "por %%xmm4,%%xmm0 \n"
2256 "pmullw %%xmm3,%%xmm1 \n"
2257 "movdqu 0x10(%0),%%xmm3 \n"
2258 "lea 0x20(%0),%0 \n"
2259 "psrlw $0x8,%%xmm2 \n"
2260 "paddusb %%xmm2,%%xmm0 \n"
2261 "pand %%xmm5,%%xmm1 \n"
2262 "paddusb %%xmm1,%%xmm0 \n"
2263 "sub $0x4,%3 \n"
2264 "movdqa %%xmm0,(%2) \n"
2265 "jle 9f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002266 "movdqa %%xmm3,%%xmm0 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002267 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002268 "movdqu 0x10(%1),%%xmm2 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002269 "psrlw $0x8,%%xmm3 \n"
2270 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2271 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2272 "pand %%xmm6,%%xmm2 \n"
2273 "paddw %%xmm7,%%xmm3 \n"
2274 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.com1702ec72012-04-05 01:15:12 +00002275 "movdqu 0x10(%1),%%xmm1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002276 "lea 0x20(%1),%1 \n"
2277 "psrlw $0x8,%%xmm1 \n"
2278 "por %%xmm4,%%xmm0 \n"
2279 "pmullw %%xmm3,%%xmm1 \n"
2280 "psrlw $0x8,%%xmm2 \n"
2281 "paddusb %%xmm2,%%xmm0 \n"
2282 "pand %%xmm5,%%xmm1 \n"
2283 "paddusb %%xmm1,%%xmm0 \n"
2284 "sub $0x4,%3 \n"
2285 "movdqa %%xmm0,0x10(%2) \n"
2286 "lea 0x20(%2),%2 \n"
2287 "jg 1b \n"
2288 "9: \n"
2289 : "+r"(src_argb0), // %0
2290 "+r"(src_argb1), // %1
2291 "+r"(dst_argb), // %2
2292 "+r"(width) // %3
2293 :
2294 : "memory", "cc"
2295#if defined(__SSE2__)
2296 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2297#endif
2298 );
2299}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002300#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002301
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002302#ifdef HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002303// Blend 1 pixel at a time, unaligned
fbarchard@google.comd2f44132012-04-04 21:53:27 +00002304void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com96af8702012-04-06 18:22:27 +00002305 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002306 asm volatile (
2307 "pcmpeqb %%xmm7,%%xmm7 \n"
2308 "psrlw $0xf,%%xmm7 \n"
2309 "pcmpeqb %%xmm6,%%xmm6 \n"
2310 "psrlw $0x8,%%xmm6 \n"
2311 "pcmpeqb %%xmm5,%%xmm5 \n"
2312 "psllw $0x8,%%xmm5 \n"
2313 "pcmpeqb %%xmm4,%%xmm4 \n"
2314 "pslld $0x18,%%xmm4 \n"
2315
2316 // 1 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002317 ".p2align 4 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002318 "1: \n"
2319 "movd (%0),%%xmm3 \n"
2320 "lea 0x4(%0),%0 \n"
2321 "movdqa %%xmm3,%%xmm0 \n"
2322 "pxor %%xmm4,%%xmm3 \n"
2323 "movd (%1),%%xmm2 \n"
2324 "psrlw $0x8,%%xmm3 \n"
2325 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2326 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2327 "pand %%xmm6,%%xmm2 \n"
2328 "paddw %%xmm7,%%xmm3 \n"
2329 "pmullw %%xmm3,%%xmm2 \n"
2330 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002331 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002332 "psrlw $0x8,%%xmm1 \n"
2333 "por %%xmm4,%%xmm0 \n"
2334 "pmullw %%xmm3,%%xmm1 \n"
2335 "psrlw $0x8,%%xmm2 \n"
2336 "paddusb %%xmm2,%%xmm0 \n"
2337 "pand %%xmm5,%%xmm1 \n"
2338 "paddusb %%xmm1,%%xmm0 \n"
2339 "sub $0x1,%3 \n"
2340 "movd %%xmm0,(%2) \n"
2341 "lea 0x4(%2),%2 \n"
2342 "jg 1b \n"
2343 : "+r"(src_argb0), // %0
2344 "+r"(src_argb1), // %1
2345 "+r"(dst_argb), // %2
2346 "+r"(width) // %3
2347 :
2348 : "memory", "cc"
2349#if defined(__SSE2__)
2350 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2351#endif
2352 );
2353}
fbarchard@google.comda5cc422012-04-24 06:01:32 +00002354#endif // HAS_ARGBBLENDROW1_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002355
fbarchard@google.com96af8702012-04-06 18:22:27 +00002356#ifdef HAS_ARGBBLENDROW_SSSE3
2357// Shuffle table for reversing the bytes.
2358CONST uvec8 kShuffleAlpha = {
2359 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2360 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2361};
2362void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
fbarchard@google.com8ed54222012-04-18 17:07:07 +00002363 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002364 asm volatile (
2365 "pcmpeqb %%xmm7,%%xmm7 \n"
2366 "psrlw $0xf,%%xmm7 \n"
2367 "pcmpeqb %%xmm6,%%xmm6 \n"
2368 "psrlw $0x8,%%xmm6 \n"
2369 "pcmpeqb %%xmm5,%%xmm5 \n"
2370 "psllw $0x8,%%xmm5 \n"
2371 "pcmpeqb %%xmm4,%%xmm4 \n"
2372 "pslld $0x18,%%xmm4 \n"
2373
2374 // 8 pixel loop
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002375 ".p2align 4 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002376 "1: \n"
2377 "movdqu (%0),%%xmm3 \n"
2378 "movdqa %%xmm3,%%xmm0 \n"
2379 "pxor %%xmm4,%%xmm3 \n"
2380 "pshufb %4,%%xmm3 \n"
2381 "movdqu (%1),%%xmm2 \n"
2382 "pand %%xmm6,%%xmm2 \n"
2383 "paddw %%xmm7,%%xmm3 \n"
2384 "pmullw %%xmm3,%%xmm2 \n"
2385 "movdqu (%1),%%xmm1 \n"
2386 "psrlw $0x8,%%xmm1 \n"
2387 "por %%xmm4,%%xmm0 \n"
2388 "pmullw %%xmm3,%%xmm1 \n"
2389 "movdqu 0x10(%0),%%xmm3 \n"
2390 "lea 0x20(%0),%0 \n"
2391 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002392 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002393 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002394 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002395 "sub $0x4,%3 \n"
2396 "movdqa %%xmm0,(%2) \n"
2397 "jle 9f \n"
2398 "movdqa %%xmm3,%%xmm0 \n"
2399 "pxor %%xmm4,%%xmm3 \n"
2400 "movdqu 0x10(%1),%%xmm2 \n"
2401 "pshufb %4,%%xmm3 \n"
2402 "pand %%xmm6,%%xmm2 \n"
2403 "paddw %%xmm7,%%xmm3 \n"
2404 "pmullw %%xmm3,%%xmm2 \n"
2405 "movdqu 0x10(%1),%%xmm1 \n"
2406 "lea 0x20(%1),%1 \n"
2407 "psrlw $0x8,%%xmm1 \n"
2408 "por %%xmm4,%%xmm0 \n"
2409 "pmullw %%xmm3,%%xmm1 \n"
2410 "psrlw $0x8,%%xmm2 \n"
2411 "paddusb %%xmm2,%%xmm0 \n"
2412 "pand %%xmm5,%%xmm1 \n"
2413 "paddusb %%xmm1,%%xmm0 \n"
2414 "sub $0x4,%3 \n"
2415 "movdqa %%xmm0,0x10(%2) \n"
2416 "lea 0x20(%2),%2 \n"
2417 "jg 1b \n"
2418 "9: \n"
2419 : "+r"(src_argb0), // %0
2420 "+r"(src_argb1), // %1
2421 "+r"(dst_argb), // %2
2422 "+r"(width) // %3
2423 : "m"(kShuffleAlpha) // %4
2424 : "memory", "cc"
2425#if defined(__SSE2__)
2426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2427#endif
2428 );
2429}
2430#endif // HAS_ARGBBLENDROW_SSSE3
2431
fbarchard@google.com5ff3a8f2012-04-24 19:43:45 +00002432
#ifdef HAS_ARGBBLENDROW1_SSSE3
// Blend 1 pixel at a time, unaligned.
//
// Each iteration reads one 4 byte pixel from src_argb0 (foreground) and one
// from src_argb1 (background) with movd, so neither pointer needs any
// alignment, and writes one 4 byte pixel to dst_argb.
//
// Per pixel the code computes, with saturating byte adds:
//   dst = (src_argb0 | 0xff000000) + ((src_argb1 * f) >> 8)
// where f is derived from the inverted alpha byte of src_argb0 plus 1.
// NOTE(review): kShuffleAlpha is defined elsewhere in this file; presumably
// it replicates the alpha byte into the 16 bit lanes so that f == 256 - alpha.
// Confirm against its definition.
//
// The loop is a do/while: width is decremented after the first pixel has
// already been processed, so callers must pass width > 0.
void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
  asm volatile (
    // Build constant masks without touching memory.
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"  // xmm7 = 0x0001 per word
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"  // xmm6 = 0x00ff per word
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"  // xmm5 = 0xff00 per word
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 per dword

    // 1 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    "movd      (%0),%%xmm3                     \n"  // foreground pixel
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"  // invert the alpha byte
    "movd      (%1),%%xmm2                     \n"  // background pixel
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"  // even bytes of background
    "paddw     %%xmm7,%%xmm3                   \n"  // + 1
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"  // background pixel again
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"  // odd bytes of background
    "por       %%xmm4,%%xmm0                   \n"  // force result alpha to 0xff
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    // sub sets the flags consumed by jg; movd and lea leave them unchanged.
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW1_SSSE3
2484
#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuate 4 pixels at a time: multiply the three color bytes of every
// ARGB pixel by that pixel's alpha byte and keep the alpha byte unchanged.
//
// src_argb and dst_argb are accessed with movdqa and must therefore be
// aligned to 16 bytes. The loop is a do/while that handles 4 pixels (16 bytes)
// per iteration, so width must be > 0; a width that is not a multiple of 4
// is rounded up by the loop.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Turn dst into an offset from src so one pointer increment serves both.
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 (alpha)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ffffff (color)

    // 4 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    // Low 2 pixels: widen each byte v to the word v * 0x0101, broadcast the
    // alpha word across each pixel, and take the high half of the product.
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // High 2 pixels, same steps.
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Keep the original alpha bytes and merge them over the scaled colors.
    "movdqa    (%0),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    // sub sets the flags consumed by jg; movdqa and lea leave them unchanged.
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    // Only the registers written above; xmm3 is not used by this routine.
    , "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATE_SSE2
2531
#ifdef HAS_ARGBATTENUATE_SSSE3
// Shuffle tables duplicating alpha.
// For each pixel the alpha byte (index 3, 7, 11 or 15) is copied into both
// bytes of the three color words; index 128 has bit 7 set, which makes pshufb
// write zero, so the word in the alpha position becomes 0.
// kShuffleAlpha0 covers pixels 0-1, kShuffleAlpha1 covers pixels 2-3.
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};

// Attenuate 4 pixels at a time: multiply the three color bytes of every
// ARGB pixel by that pixel's alpha byte and keep the alpha byte unchanged.
//
// src_argb and dst_argb are accessed with movdqa and must therefore be
// aligned to 16 bytes. The loop is a do/while that handles 4 pixels (16 bytes)
// per iteration, so width must be > 0.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Turn dst into an offset from src so one pointer increment serves both.
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"  // xmm3 = 0xff000000 (alpha)
    "movdqa    %3,%%xmm4                       \n"  // shuffle for pixels 0-1
    "movdqa    %4,%%xmm5                       \n"  // shuffle for pixels 2-3

    // 4 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    // Low 2 pixels: alpha words times widened color words, high half kept.
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    // High 2 pixels, same steps.
    "movdqa    (%0),%%xmm1                     \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // The alpha lanes were multiplied by 0, so OR the original alpha back in.
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    // sub sets the flags consumed by jg; movdqa and lea leave them unchanged.
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATE_SSSE3
2586
#ifdef HAS_ARGBUNATTENUATE_SSE2
// Unattenuate 4 pixels at a time.
//
// For every pixel the alpha byte is used as an index into fixed_invtbl8
// (4 byte entries); the low 16 bits of the entry multiply the three color
// channels and the alpha byte is copied through unchanged.
// NOTE(review): fixed_invtbl8 is defined elsewhere in this file; presumably
// it holds fixed point reciprocals of alpha. Confirm against its definition.
//
// src_argb and dst_argb are accessed with movdqa and must therefore be
// aligned to 16 bytes. The loop is a do/while that handles 4 pixels (16 bytes)
// per iteration, so width must be > 0.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  // Scratch general purpose register that receives each pixel's alpha byte.
  uintptr_t alpha = 0;
  asm volatile (
    // Turn dst into an offset from src so one pointer increment serves both.
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 (alpha)

    // 4 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    // Low 2 pixels. pshuflw $0xc0 copies word 0 of the table entry into the
    // three color lanes and leaves 0 in the alpha lane.
    "movdqa    (%0),%%xmm0                     \n"
    "movzb     0x3(%0),%3                      \n"  // alpha of pixel 0
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0x7(%0),%3                      \n"  // alpha of pixel 1
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // High 2 pixels, same steps.
    "movdqa    (%0),%%xmm1                     \n"
    "movzb     0xb(%0),%3                      \n"  // alpha of pixel 2
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0xf(%0),%3                      \n"  // alpha of pixel 3
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // The alpha lanes were multiplied by 0, so OR the original alpha back in.
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    // sub sets the flags consumed by jg; movdqa and lea leave them unchanged.
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "+r"(width),        // %2
    "+r"(alpha)         // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    // Only the registers written above; xmm5 is not used by this routine.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBUNATTENUATE_SSE2
2641
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale.  0.11 * B + 0.59 * G + 0.30 * R
// The weights are scaled by 128 (14 + 76 + 38 == 128) and stored in memory
// order B, G, R, A; the alpha weight is 0.
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels, in place.
//
// Each pixel's B, G and R bytes are all replaced by
// (14 * B + 76 * G + 38 * R) >> 7; the alpha byte is preserved.
//
// dst_argb is read and written with movdqa and must therefore be aligned to
// 16 bytes. The loop is a do/while that handles 8 pixels per iteration, so
// width must be > 0.
void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm4                       \n"  // gray weights

    // 8 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    // Weighted sum per pixel -> 8 gray bytes in the low half of xmm0.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Extract the 8 alpha bytes into the low half of xmm2.
    "movdqa    (%0),%%xmm2                     \n"
    "movdqa    0x10(%0),%%xmm3                 \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Interleave to gray, gray, gray, alpha per pixel.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    // sub sets the flags consumed by jg; movdqa and lea leave them unchanged.
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+r"(width)       // %1
  : "m"(kARGBToGray)  // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00002689
#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone.
// Weights are stored in memory order B, G, R, A; the alpha weight is 0.
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
//
// Each output channel is the weighted sum of the input B, G and R bytes
// shifted right by 7; packuswb saturates the result to 255. The alpha byte is
// preserved.
//
// dst_argb is read and written with movdqa and must therefore be aligned to
// 16 bytes. The loop is a do/while that handles 8 pixels per iteration, so
// width must be > 0.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"  // weights for new B
    "movdqa    %3,%%xmm3                       \n"  // weights for new G
    "movdqa    %4,%%xmm4                       \n"  // weights for new R

    // 8 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    // New B for 8 pixels -> low half of xmm0.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // New G for 8 pixels, interleaved with B -> xmm0 holds B,G pairs.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // New R for 8 pixels -> low half of xmm5.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha for 8 pixels, interleaved with R -> xmm5 holds R,A pairs.
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    // Combine the B,G and R,A pairs into full pixels.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    // sub sets the flags consumed by jg; movdqa and lea leave them unchanged.
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
2765
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002766#endif // defined(__x86_64__) || defined(__i386__)
2767
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002768#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002769} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00002770} // namespace libyuv
2771#endif