blob: 8e584e06c60996dff1748847140c9f7309717a1e [file] [log] [blame]
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00001/*
fbarchard@google.comb0c97972012-08-08 19:04:24 +00002 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// CONST is the qualifier written in front of every constant table below.
fbarchard@google.com714050a2012-02-17 22:59:56 +000023// GCC 4.2 on OSX has link error when passing static or const to inline.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000025#ifdef __APPLE__
// Apple builds: CONST expands to nothing, so the tables are ordinary
// non-const globals with external linkage.
26#define CONST
27#else
// All other builds: the tables are file-local and read-only.
28#define CONST static const
29#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
// Each table holds one signed 8-bit weight per pixel byte; the 4-entry pattern
// is repeated four times so one 16-byte vector covers four 4-byte pixels.
// The weights are listed in the order the bytes appear in memory; byte 3 of
// every pixel has weight 0 and so does not contribute.
// kARGBToY is the pmaddubsw multiplier in ARGBToYRow_SSSE3 (sum is then
// shifted right by 7). kARGBToU / kARGBToV are the pmaddubsw multipliers in
// ARGBToUVRow_SSSE3 (sum is then arithmetically shifted right by 8).
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
// Same weights as the ARGB tables with the four per-pixel entries in reverse
// order: the zero weight is on byte 0 of each pixel instead of byte 3.
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
// Same weights as the ARGB tables with the entries for bytes 0 and 2 of each
// pixel exchanged; byte 3 still has weight 0.
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
// Byte-wise bias of 16, added with paddb to the packed 8-bit results in the
// ARGBToYRow_* functions.
72CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000074};
fbarchard@google.com2430e042011-11-11 21:57:06 +000075
// Byte-wise bias of 128, added with paddb to the packed signed 8-bit results
// in the ARGBToUVRow_* functions.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000076CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000077 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000080
// pshufb control vectors: output byte i is taken from input byte mask[i].
// Both tables below expand four 3-byte pixels (input bytes 0..11) into four
// 4-byte pixels. The fourth byte of each output pixel is filled from input
// bytes 12..15 as a placeholder; RGB24ToARGBRow_SSSE3 / RAWToARGBRow_SSSE3
// then OR 0xff into that byte.
fbarchard@google.comba1f5262012-01-12 19:22:41 +000081// Shuffle table for converting RGB24 to ARGB.
// Keeps the three bytes of each pixel in their original order.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000083 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
// Reverses the three bytes of each pixel.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000087CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000088 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
// pshufb control vectors that permute the bytes inside each of four 4-byte
// pixels: output byte i is taken from input byte mask[i]. No byte crosses a
// pixel boundary.
fbarchard@google.comb6149762011-11-07 21:58:52 +000091// Shuffle table for converting ABGR to ARGB.
// Swaps bytes 0 and 2 of each pixel; bytes 1 and 3 stay in place.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000092CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000093 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
// Reverses the four bytes of each pixel.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000097CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000098 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000101// Shuffle table for converting RGBA to ARGB.
// Rotates each pixel by one byte: input bytes 1,2,3 move to 0,1,2 and input
// byte 0 moves to position 3.
102CONST uvec8 kShuffleMaskRGBAToARGB = {
103 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
104};
105
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000106// Shuffle table for converting ARGB to RGBA.
// Inverse rotation of kShuffleMaskRGBAToARGB: input byte 3 moves to position 0
// and input bytes 0,1,2 move to 1,2,3.
107CONST uvec8 kShuffleMaskARGBToRGBA = {
108 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
109};
110
// pshufb control vectors that squeeze four 4-byte pixels into 12 bytes by
// dropping byte 3 of every pixel. An entry of 128u has its high bit set, which
// makes pshufb write a zero byte at that position.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000111// Shuffle table for converting ARGB to RGB24.
// Keeps bytes 0,1,2 of each pixel in order; the 12 result bytes are packed at
// the bottom and the top 4 bytes are zeroed.
112CONST uvec8 kShuffleMaskARGBToRGB24 = {
113 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
114};
115
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000116// Shuffle table for converting ARGB to RAW.
// Same as above but with bytes 0,1,2 of each pixel emitted in reverse order.
117CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000118 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000119};
120
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000121// Shuffle table for converting ARGBToRGB24 for I420ToRGB24. First 8 + next 4
// Split layout: the first 8 result bytes sit in bytes 0..7, bytes 8..11 are
// zeroed, and the remaining 4 result bytes sit in bytes 12..15.
122CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
123 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
124};
125
126// Shuffle table for converting ARGB to RAW, split layout. First 8 + next 4
// Same split layout as kShuffleMaskARGBToRGB24_0, with bytes 0,1,2 of each
// pixel emitted in reverse order.
127CONST uvec8 kShuffleMaskARGBToRAW_0 = {
128 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
129};
130
// Expands a row of single bytes into 4-byte pixels: every source byte is
// copied into bytes 0..2 of its output pixel and byte 3 is forced to 0xff.
//   src_y    - source bytes; 8 are read per iteration (movq).
//   dst_argb - destination; 32 bytes are written per iteration with movdqa,
//              so it must be 16-byte aligned.
//   pix      - number of pixels. 8 are consumed per iteration and the loop
//              repeats while the remaining count is > 0, so the body always
//              runs at least once and a count that is not a multiple of 8 is
//              effectively rounded up.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000131void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000132 asm volatile (
// xmm5 = 0xff000000 in every 32-bit lane (all ones shifted left by 24).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000133 "pcmpeqb %%xmm5,%%xmm5 \n"
134 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000135 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000136 "1: \n"
137 "movq (%0),%%xmm0 \n"
138 "lea 0x8(%0),%0 \n"
// Two self-interleaves replicate each source byte four times.
139 "punpcklbw %%xmm0,%%xmm0 \n"
140 "movdqa %%xmm0,%%xmm1 \n"
141 "punpcklwd %%xmm0,%%xmm0 \n"
142 "punpckhwd %%xmm1,%%xmm1 \n"
143 "por %%xmm5,%%xmm0 \n"
144 "por %%xmm5,%%xmm1 \n"
145 "movdqa %%xmm0,(%1) \n"
146 "movdqa %%xmm1,0x10(%1) \n"
147 "lea 0x20(%1),%1 \n"
148 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000149 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000150 : "+r"(src_y), // %0
151 "+r"(dst_argb), // %1
152 "+r"(pix) // %2
153 :
154 : "memory", "cc"
155#if defined(__SSE2__)
156 , "xmm0", "xmm1", "xmm5"
157#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000158 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000159}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000160
// Permutes the bytes of every 4-byte pixel in a row using pshufb with
// kShuffleMaskABGRToARGB.
//   src_abgr - source pixels; read with movdqa, so must be 16-byte aligned.
//   dst_argb - destination pixels; written with movdqa, so must be 16-byte
//              aligned.
//   pix      - number of pixels. 4 are processed per iteration and the loop
//              repeats while the remaining count is > 0 (body runs at least
//              once).
161void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000162 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000163 "movdqa %3,%%xmm5 \n"
// Turn dst into an offset from src so only the src pointer is advanced.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000164 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000165 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000166 "1: \n"
167 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000168 "pshufb %%xmm5,%%xmm0 \n"
// The flags set by this sub are consumed by the jg below; the movdqa and lea
// in between do not modify flags.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000169 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000170 "movdqa %%xmm0,(%0,%1,1) \n"
171 "lea 0x10(%0),%0 \n"
172 "jg 1b \n"
173
fbarchard@google.comb6149762011-11-07 21:58:52 +0000174 : "+r"(src_abgr), // %0
175 "+r"(dst_argb), // %1
176 "+r"(pix) // %2
177 : "m"(kShuffleMaskABGRToARGB) // %3
178 : "memory", "cc"
179#if defined(__SSE2__)
180 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000181#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000182 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000183}
184
// Permutes the bytes of every 4-byte pixel in a row using pshufb with
// kShuffleMaskBGRAToARGB.
//   src_bgra - source pixels; read with movdqa, so must be 16-byte aligned.
//   dst_argb - destination pixels; written with movdqa, so must be 16-byte
//              aligned.
//   pix      - number of pixels. 4 are processed per iteration and the loop
//              repeats while the remaining count is > 0 (body runs at least
//              once).
185void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000186 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000187 "movdqa %3,%%xmm5 \n"
// Turn dst into an offset from src so only the src pointer is advanced.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000188 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000189 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000190 "1: \n"
191 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000192 "pshufb %%xmm5,%%xmm0 \n"
// The flags set by this sub are consumed by the jg below; the movdqa and lea
// in between do not modify flags.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000193 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000194 "movdqa %%xmm0,(%0,%1,1) \n"
195 "lea 0x10(%0),%0 \n"
196 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000197 : "+r"(src_bgra), // %0
198 "+r"(dst_argb), // %1
199 "+r"(pix) // %2
200 : "m"(kShuffleMaskBGRAToARGB) // %3
201 : "memory", "cc"
202#if defined(__SSE2__)
203 , "xmm0", "xmm5"
204#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000205 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000206}
207
// Permutes the bytes of every 4-byte pixel in a row using pshufb with
// kShuffleMaskRGBAToARGB.
//   src_rgba - source pixels; read with movdqa, so must be 16-byte aligned.
//   dst_argb - destination pixels; written with movdqa, so must be 16-byte
//              aligned.
//   pix      - number of pixels. 4 are processed per iteration and the loop
//              repeats while the remaining count is > 0 (body runs at least
//              once).
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000208void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
209 asm volatile (
210 "movdqa %3,%%xmm5 \n"
// Turn dst into an offset from src so only the src pointer is advanced.
211 "sub %0,%1 \n"
212 ".p2align 4 \n"
213 "1: \n"
214 "movdqa (%0),%%xmm0 \n"
215 "pshufb %%xmm5,%%xmm0 \n"
// The flags set by this sub are consumed by the jg below; the movdqa and lea
// in between do not modify flags.
216 "sub $0x4,%2 \n"
217 "movdqa %%xmm0,(%0,%1,1) \n"
218 "lea 0x10(%0),%0 \n"
219 "jg 1b \n"
220
221 : "+r"(src_rgba), // %0
222 "+r"(dst_argb), // %1
223 "+r"(pix) // %2
224 : "m"(kShuffleMaskRGBAToARGB) // %3
225 : "memory", "cc"
226#if defined(__SSE2__)
227 , "xmm0", "xmm5"
228#endif
229 );
230}
231
// Permutes the bytes of every 4-byte pixel in a row using pshufb with
// kShuffleMaskARGBToRGBA.
//   src_argb - source pixels; read with movdqa, so must be 16-byte aligned.
//   dst_rgba - destination pixels; written with movdqa, so must be 16-byte
//              aligned.
//   pix      - number of pixels. 4 are processed per iteration and the loop
//              repeats while the remaining count is > 0 (body runs at least
//              once).
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000232void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
233 asm volatile (
234 "movdqa %3,%%xmm5 \n"
// Turn dst into an offset from src so only the src pointer is advanced.
235 "sub %0,%1 \n"
236 ".p2align 4 \n"
237 "1: \n"
238 "movdqa (%0),%%xmm0 \n"
239 "pshufb %%xmm5,%%xmm0 \n"
// The flags set by this sub are consumed by the jg below; the movdqa and lea
// in between do not modify flags.
240 "sub $0x4,%2 \n"
241 "movdqa %%xmm0,(%0,%1,1) \n"
242 "lea 0x10(%0),%0 \n"
243 "jg 1b \n"
244
245 : "+r"(src_argb), // %0
246 "+r"(dst_rgba), // %1
247 "+r"(pix) // %2
248 : "m"(kShuffleMaskARGBToRGBA) // %3
249 : "memory", "cc"
250#if defined(__SSE2__)
251 , "xmm0", "xmm5"
252#endif
253 );
254}
255
// Expands a row of 3-byte pixels into 4-byte pixels. The three source bytes
// are placed according to kShuffleMaskRGB24ToARGB and byte 3 of every output
// pixel is set to 0xff.
//   src_rgb24 - source; 48 bytes are read per iteration with movdqu, so no
//               alignment is required.
//   dst_argb  - destination; 64 bytes are written per iteration with movdqa,
//               so it must be 16-byte aligned.
//   pix       - number of pixels. 16 are processed per iteration and the loop
//               repeats while the remaining count is > 0 (body runs at least
//               once).
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000256void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000257 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000258 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
259 "pslld $0x18,%%xmm5 \n"
260 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000261 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000262 "1: \n"
263 "movdqu (%0),%%xmm0 \n"
264 "movdqu 0x10(%0),%%xmm1 \n"
265 "movdqu 0x20(%0),%%xmm3 \n"
266 "lea 0x30(%0),%0 \n"
// xmm2 = source bytes 24..39 (third group of four pixels in its low 12 bytes).
267 "movdqa %%xmm3,%%xmm2 \n"
268 "palignr $0x8,%%xmm1,%%xmm2 \n"
269 "pshufb %%xmm4,%%xmm2 \n"
270 "por %%xmm5,%%xmm2 \n"
// xmm1 = source bytes 12..27 (second group of four pixels in its low 12 bytes).
271 "palignr $0xc,%%xmm0,%%xmm1 \n"
272 "pshufb %%xmm4,%%xmm0 \n"
273 "movdqa %%xmm2,0x20(%1) \n"
274 "por %%xmm5,%%xmm0 \n"
275 "pshufb %%xmm4,%%xmm1 \n"
276 "movdqa %%xmm0,(%1) \n"
277 "por %%xmm5,%%xmm1 \n"
// Rotate xmm3 by 4 bytes so source bytes 36..47 land in its low 12 bytes.
278 "palignr $0x4,%%xmm3,%%xmm3 \n"
279 "pshufb %%xmm4,%%xmm3 \n"
280 "movdqa %%xmm1,0x10(%1) \n"
281 "por %%xmm5,%%xmm3 \n"
// Flags from this sub feed the jg; the movdqa and lea after it leave flags
// untouched.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000282 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000283 "movdqa %%xmm3,0x30(%1) \n"
284 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000285 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000286 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000287 "+r"(dst_argb), // %1
288 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000289 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000290 : "memory", "cc"
291#if defined(__SSE2__)
292 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
293#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000294 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000295}
296
// Expands a row of 3-byte pixels into 4-byte pixels. The three source bytes
// are placed according to kShuffleMaskRAWToARGB (which reverses them) and
// byte 3 of every output pixel is set to 0xff.
//   src_raw  - source; 48 bytes are read per iteration with movdqu, so no
//              alignment is required.
//   dst_argb - destination; 64 bytes are written per iteration with movdqa,
//              so it must be 16-byte aligned.
//   pix      - number of pixels. 16 are processed per iteration and the loop
//              repeats while the remaining count is > 0 (body runs at least
//              once).
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000297void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000298 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000299 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
300 "pslld $0x18,%%xmm5 \n"
301 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000302 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000303 "1: \n"
304 "movdqu (%0),%%xmm0 \n"
305 "movdqu 0x10(%0),%%xmm1 \n"
306 "movdqu 0x20(%0),%%xmm3 \n"
307 "lea 0x30(%0),%0 \n"
// xmm2 = source bytes 24..39 (third group of four pixels in its low 12 bytes).
308 "movdqa %%xmm3,%%xmm2 \n"
309 "palignr $0x8,%%xmm1,%%xmm2 \n"
310 "pshufb %%xmm4,%%xmm2 \n"
311 "por %%xmm5,%%xmm2 \n"
// xmm1 = source bytes 12..27 (second group of four pixels in its low 12 bytes).
312 "palignr $0xc,%%xmm0,%%xmm1 \n"
313 "pshufb %%xmm4,%%xmm0 \n"
314 "movdqa %%xmm2,0x20(%1) \n"
315 "por %%xmm5,%%xmm0 \n"
316 "pshufb %%xmm4,%%xmm1 \n"
317 "movdqa %%xmm0,(%1) \n"
318 "por %%xmm5,%%xmm1 \n"
// Rotate xmm3 by 4 bytes so source bytes 36..47 land in its low 12 bytes.
319 "palignr $0x4,%%xmm3,%%xmm3 \n"
320 "pshufb %%xmm4,%%xmm3 \n"
321 "movdqa %%xmm1,0x10(%1) \n"
322 "por %%xmm5,%%xmm3 \n"
// Flags from this sub feed the jg; the movdqa and lea after it leave flags
// untouched.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000323 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000324 "movdqa %%xmm3,0x30(%1) \n"
325 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000326 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000327 : "+r"(src_raw), // %0
328 "+r"(dst_argb), // %1
329 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000330 : "m"(kShuffleMaskRAWToARGB) // %3
331 : "memory", "cc"
332#if defined(__SSE2__)
333 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
334#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000335 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000336}
337
// Expands a row of 16-bit 5:6:5 pixels into 4-byte pixels. For each 16-bit
// source word:
//   output byte 0 = bits 0..4   widened to 8 bits (v << 3 | v >> 2)
//   output byte 1 = bits 5..10  widened to 8 bits (v << 2 | v >> 4)
//   output byte 2 = bits 11..15 widened to 8 bits (v << 3 | v >> 2)
//   output byte 3 = 0xff
// The widening is done with pmulhuw against the word constants 0x0108
// (5-bit fields) and 0x2080 (6-bit field).
//   src - source; 16 bytes read per iteration with movdqu (no alignment
//         requirement).
//   dst - destination; 32 bytes written per iteration with movdqa, so it must
//         be 16-byte aligned.
//   pix - number of pixels. 8 are processed per iteration and the loop repeats
//         while the remaining count is > 0 (body runs at least once).
// eax is used as scratch to build the constants.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000338void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000339 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000340 "mov $0x1080108,%%eax \n"
341 "movd %%eax,%%xmm5 \n"
342 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000343 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000344 "movd %%eax,%%xmm6 \n"
345 "pshufd $0x0,%%xmm6,%%xmm6 \n"
// xmm3 = 0xf800 per word (bits 11..15).
346 "pcmpeqb %%xmm3,%%xmm3 \n"
347 "psllw $0xb,%%xmm3 \n"
// xmm4 = 0x07e0 per word (bits 5..10).
348 "pcmpeqb %%xmm4,%%xmm4 \n"
349 "psllw $0xa,%%xmm4 \n"
350 "psrlw $0x5,%%xmm4 \n"
// xmm7 = 0xff00 per word (supplies the 0xff for output byte 3).
351 "pcmpeqb %%xmm7,%%xmm7 \n"
352 "psllw $0x8,%%xmm7 \n"
// dst -= 2 * src, so (%1,%0,2) addresses the destination while only src is
// advanced in the loop.
353 "sub %0,%1 \n"
354 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000355 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000356 "1: \n"
357 "movdqu (%0),%%xmm0 \n"
358 "movdqa %%xmm0,%%xmm1 \n"
359 "movdqa %%xmm0,%%xmm2 \n"
360 "pand %%xmm3,%%xmm1 \n"
361 "psllw $0xb,%%xmm2 \n"
362 "pmulhuw %%xmm5,%%xmm1 \n"
363 "pmulhuw %%xmm5,%%xmm2 \n"
364 "psllw $0x8,%%xmm1 \n"
365 "por %%xmm2,%%xmm1 \n"
366 "pand %%xmm4,%%xmm0 \n"
367 "pmulhuw %%xmm6,%%xmm0 \n"
368 "por %%xmm7,%%xmm0 \n"
// Interleave (byte0,byte2) words in xmm1 with (byte1,0xff) words in xmm0.
369 "movdqa %%xmm1,%%xmm2 \n"
370 "punpcklbw %%xmm0,%%xmm1 \n"
371 "punpckhbw %%xmm0,%%xmm2 \n"
372 "movdqa %%xmm1,(%1,%0,2) \n"
373 "movdqa %%xmm2,0x10(%1,%0,2) \n"
374 "lea 0x10(%0),%0 \n"
375 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000376 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000377 : "+r"(src), // %0
378 "+r"(dst), // %1
379 "+r"(pix) // %2
380 :
381 : "memory", "cc", "eax"
382#if defined(__SSE2__)
383 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
384#endif
385 );
386}
387
// Expands a row of 16-bit 1:5:5:5 pixels into 4-byte pixels. For each 16-bit
// source word:
//   output byte 0 = bits 0..4   widened to 8 bits (v << 3 | v >> 2)
//   output byte 1 = bits 5..9   widened to 8 bits (v << 3 | v >> 2)
//   output byte 2 = bits 10..14 widened to 8 bits (v << 3 | v >> 2)
//   output byte 3 = 0xff when bit 15 is set, otherwise 0x00
// The widening is done with pmulhuw against the word constants 0x0108 and
// 0x4200.
//   src - source; 16 bytes read per iteration with movdqu (no alignment
//         requirement).
//   dst - destination; 32 bytes written per iteration with movdqa, so it must
//         be 16-byte aligned.
//   pix - number of pixels. 8 are processed per iteration and the loop repeats
//         while the remaining count is > 0 (body runs at least once).
// eax is used as scratch to build the constants.
388void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000389 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000390 "mov $0x1080108,%%eax \n"
391 "movd %%eax,%%xmm5 \n"
392 "pshufd $0x0,%%xmm5,%%xmm5 \n"
393 "mov $0x42004200,%%eax \n"
394 "movd %%eax,%%xmm6 \n"
395 "pshufd $0x0,%%xmm6,%%xmm6 \n"
// xmm3 = 0xf800 per word; xmm4 = 0x03e0 per word (bits 5..9).
396 "pcmpeqb %%xmm3,%%xmm3 \n"
397 "psllw $0xb,%%xmm3 \n"
398 "movdqa %%xmm3,%%xmm4 \n"
399 "psrlw $0x6,%%xmm4 \n"
// xmm7 = 0xff00 per word.
400 "pcmpeqb %%xmm7,%%xmm7 \n"
401 "psllw $0x8,%%xmm7 \n"
// dst -= 2 * src, so (%1,%0,2) addresses the destination while only src is
// advanced in the loop.
402 "sub %0,%1 \n"
403 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000404 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000405 "1: \n"
406 "movdqu (%0),%%xmm0 \n"
407 "movdqa %%xmm0,%%xmm1 \n"
408 "movdqa %%xmm0,%%xmm2 \n"
// Shift left by 1 to move bits 10..14 into bits 11..15 before masking.
409 "psllw $0x1,%%xmm1 \n"
410 "psllw $0xb,%%xmm2 \n"
411 "pand %%xmm3,%%xmm1 \n"
412 "pmulhuw %%xmm5,%%xmm2 \n"
413 "pmulhuw %%xmm5,%%xmm1 \n"
414 "psllw $0x8,%%xmm1 \n"
415 "por %%xmm2,%%xmm1 \n"
416 "movdqa %%xmm0,%%xmm2 \n"
417 "pand %%xmm4,%%xmm0 \n"
// Arithmetic shift replicates bit 15 across the high byte of each word.
418 "psraw $0x8,%%xmm2 \n"
419 "pmulhuw %%xmm6,%%xmm0 \n"
420 "pand %%xmm7,%%xmm2 \n"
421 "por %%xmm2,%%xmm0 \n"
422 "movdqa %%xmm1,%%xmm2 \n"
423 "punpcklbw %%xmm0,%%xmm1 \n"
424 "punpckhbw %%xmm0,%%xmm2 \n"
425 "movdqa %%xmm1,(%1,%0,2) \n"
426 "movdqa %%xmm2,0x10(%1,%0,2) \n"
427 "lea 0x10(%0),%0 \n"
428 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000429 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000430 : "+r"(src), // %0
431 "+r"(dst), // %1
432 "+r"(pix) // %2
433 :
434 : "memory", "cc", "eax"
435#if defined(__SSE2__)
436 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
437#endif
438 );
439}
440
// Expands a row of 16-bit 4:4:4:4 pixels into 4-byte pixels. Every 4-bit
// field n becomes the byte (n << 4) | n. Per source byte, the low nibble is
// emitted first and the high nibble second, so a 2-byte pixel yields
// low(byte0), high(byte0), low(byte1), high(byte1).
//   src - source; 16 bytes read per iteration with movdqu (no alignment
//         requirement).
//   dst - destination; 32 bytes written per iteration with movdqa, so it must
//         be 16-byte aligned.
//   pix - number of pixels. 8 are processed per iteration and the loop repeats
//         while the remaining count is > 0 (body runs at least once).
// eax is used as scratch to build the nibble mask.
441void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000442 asm volatile (
// xmm4 = 0x0f in every byte (low-nibble mask); xmm5 = 0xf0 in every byte.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000443 "mov $0xf0f0f0f,%%eax \n"
444 "movd %%eax,%%xmm4 \n"
445 "pshufd $0x0,%%xmm4,%%xmm4 \n"
446 "movdqa %%xmm4,%%xmm5 \n"
447 "pslld $0x4,%%xmm5 \n"
// dst -= 2 * src, so (%1,%0,2) addresses the destination while only src is
// advanced in the loop.
448 "sub %0,%1 \n"
449 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000450 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000451 "1: \n"
452 "movdqu (%0),%%xmm0 \n"
453 "movdqa %%xmm0,%%xmm2 \n"
454 "pand %%xmm4,%%xmm0 \n"
455 "pand %%xmm5,%%xmm2 \n"
456 "movdqa %%xmm0,%%xmm1 \n"
457 "movdqa %%xmm2,%%xmm3 \n"
// Duplicate each nibble into the other half of its byte.
458 "psllw $0x4,%%xmm1 \n"
459 "psrlw $0x4,%%xmm3 \n"
460 "por %%xmm1,%%xmm0 \n"
461 "por %%xmm3,%%xmm2 \n"
462 "movdqa %%xmm0,%%xmm1 \n"
463 "punpcklbw %%xmm2,%%xmm0 \n"
464 "punpckhbw %%xmm2,%%xmm1 \n"
465 "movdqa %%xmm0,(%1,%0,2) \n"
466 "movdqa %%xmm1,0x10(%1,%0,2) \n"
467 "lea 0x10(%0),%0 \n"
468 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000469 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000470 : "+r"(src), // %0
471 "+r"(dst), // %1
472 "+r"(pix) // %2
473 :
474 : "memory", "cc", "eax"
475#if defined(__SSE2__)
476 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
477#endif
478 );
479}
480
// Packs a row of 4-byte pixels into 3-byte pixels by dropping byte 3 of every
// pixel (byte selection comes from kShuffleMaskARGBToRGB24).
//   src - source; 64 bytes read per iteration with movdqa, so it must be
//         16-byte aligned.
//   dst - destination; 48 bytes written per iteration with movdqa, so it must
//         be 16-byte aligned.
//   pix - number of pixels. 16 are processed per iteration and the loop
//         repeats while the remaining count is > 0 (body runs at least once).
481void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000482 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000483 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000484 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000485 "1: \n"
486 "movdqa (%0),%%xmm0 \n"
487 "movdqa 0x10(%0),%%xmm1 \n"
488 "movdqa 0x20(%0),%%xmm2 \n"
489 "movdqa 0x30(%0),%%xmm3 \n"
490 "lea 0x40(%0),%0 \n"
// After the shuffles each register holds 12 packed bytes in its low part and
// 4 zero bytes on top.
491 "pshufb %%xmm6,%%xmm0 \n"
492 "pshufb %%xmm6,%%xmm1 \n"
493 "pshufb %%xmm6,%%xmm2 \n"
494 "pshufb %%xmm6,%%xmm3 \n"
// Stitch the four 12-byte pieces into three full 16-byte registers:
// out0 = xmm0 | xmm1 << 12 bytes, out1 = xmm1 >> 4 | xmm2 << 8,
// out2 = xmm2 >> 8 | xmm3 << 4.
495 "movdqa %%xmm1,%%xmm4 \n"
496 "psrldq $0x4,%%xmm1 \n"
497 "pslldq $0xc,%%xmm4 \n"
498 "movdqa %%xmm2,%%xmm5 \n"
499 "por %%xmm4,%%xmm0 \n"
500 "pslldq $0x8,%%xmm5 \n"
501 "movdqa %%xmm0,(%1) \n"
502 "por %%xmm5,%%xmm1 \n"
503 "psrldq $0x8,%%xmm2 \n"
504 "pslldq $0x4,%%xmm3 \n"
505 "por %%xmm3,%%xmm2 \n"
506 "movdqa %%xmm1,0x10(%1) \n"
507 "movdqa %%xmm2,0x20(%1) \n"
508 "lea 0x30(%1),%1 \n"
509 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000510 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000511 : "+r"(src), // %0
512 "+r"(dst), // %1
513 "+r"(pix) // %2
514 : "m"(kShuffleMaskARGBToRGB24) // %3
515 : "memory", "cc"
516#if defined(__SSE2__)
517 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
518#endif
519 );
520}
521
// Packs a row of 4-byte pixels into 3-byte pixels by dropping byte 3 of every
// pixel and reversing the remaining three (byte selection comes from
// kShuffleMaskARGBToRAW).
//   src - source; 64 bytes read per iteration with movdqa, so it must be
//         16-byte aligned.
//   dst - destination; 48 bytes written per iteration with movdqa, so it must
//         be 16-byte aligned.
//   pix - number of pixels. 16 are processed per iteration and the loop
//         repeats while the remaining count is > 0 (body runs at least once).
522void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000523 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000524 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000525 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000526 "1: \n"
527 "movdqa (%0),%%xmm0 \n"
528 "movdqa 0x10(%0),%%xmm1 \n"
529 "movdqa 0x20(%0),%%xmm2 \n"
530 "movdqa 0x30(%0),%%xmm3 \n"
531 "lea 0x40(%0),%0 \n"
// After the shuffles each register holds 12 packed bytes in its low part and
// 4 zero bytes on top.
532 "pshufb %%xmm6,%%xmm0 \n"
533 "pshufb %%xmm6,%%xmm1 \n"
534 "pshufb %%xmm6,%%xmm2 \n"
535 "pshufb %%xmm6,%%xmm3 \n"
// Stitch the four 12-byte pieces into three full 16-byte registers:
// out0 = xmm0 | xmm1 << 12 bytes, out1 = xmm1 >> 4 | xmm2 << 8,
// out2 = xmm2 >> 8 | xmm3 << 4.
536 "movdqa %%xmm1,%%xmm4 \n"
537 "psrldq $0x4,%%xmm1 \n"
538 "pslldq $0xc,%%xmm4 \n"
539 "movdqa %%xmm2,%%xmm5 \n"
540 "por %%xmm4,%%xmm0 \n"
541 "pslldq $0x8,%%xmm5 \n"
542 "movdqa %%xmm0,(%1) \n"
543 "por %%xmm5,%%xmm1 \n"
544 "psrldq $0x8,%%xmm2 \n"
545 "pslldq $0x4,%%xmm3 \n"
546 "por %%xmm3,%%xmm2 \n"
547 "movdqa %%xmm1,0x10(%1) \n"
548 "movdqa %%xmm2,0x20(%1) \n"
549 "lea 0x30(%1),%1 \n"
550 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000551 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000552 : "+r"(src), // %0
553 "+r"(dst), // %1
554 "+r"(pix) // %2
555 : "m"(kShuffleMaskARGBToRAW) // %3
556 : "memory", "cc"
557#if defined(__SSE2__)
558 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
559#endif
560 );
561}
562
// Packs a row of 4-byte pixels into 16-bit 5:6:5 pixels. For each pixel:
//   bits 0..4   = top 5 bits of byte 0
//   bits 5..10  = top 6 bits of byte 1
//   bits 11..15 = top 5 bits of byte 2
// Byte 3 of the source pixel is discarded.
//   src - source; 16 bytes read per iteration with movdqa, so it must be
//         16-byte aligned.
//   dst - destination; 8 bytes written per iteration with movq (no alignment
//         requirement).
//   pix - number of pixels. 4 are processed per iteration and the loop repeats
//         while the remaining count is > 0 (body runs at least once).
563void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000564 asm volatile (
// xmm3 = 0x0000001f, xmm4 = 0x000007e0, xmm5 = 0xfffff800 per 32-bit lane.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000565 "pcmpeqb %%xmm3,%%xmm3 \n"
566 "psrld $0x1b,%%xmm3 \n"
567 "pcmpeqb %%xmm4,%%xmm4 \n"
568 "psrld $0x1a,%%xmm4 \n"
569 "pslld $0x5,%%xmm4 \n"
570 "pcmpeqb %%xmm5,%%xmm5 \n"
571 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000572 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000573 "1: \n"
574 "movdqa (%0),%%xmm0 \n"
575 "movdqa %%xmm0,%%xmm1 \n"
576 "movdqa %%xmm0,%%xmm2 \n"
// pslld 8 then psrad 16 drops byte 3 and leaves byte 2 in bits 8..15, sign
// extended, so that the signed pack below does not saturate.
577 "pslld $0x8,%%xmm0 \n"
578 "psrld $0x3,%%xmm1 \n"
579 "psrld $0x5,%%xmm2 \n"
580 "psrad $0x10,%%xmm0 \n"
581 "pand %%xmm3,%%xmm1 \n"
582 "pand %%xmm4,%%xmm2 \n"
583 "pand %%xmm5,%%xmm0 \n"
584 "por %%xmm2,%%xmm1 \n"
585 "por %%xmm1,%%xmm0 \n"
586 "packssdw %%xmm0,%%xmm0 \n"
587 "lea 0x10(%0),%0 \n"
588 "movq %%xmm0,(%1) \n"
589 "lea 0x8(%1),%1 \n"
590 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000591 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000592 : "+r"(src), // %0
593 "+r"(dst), // %1
594 "+r"(pix) // %2
595 :
596 : "memory", "cc"
597#if defined(__SSE2__)
598 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
599#endif
600 );
601}
602
// Packs a row of 4-byte pixels into 16-bit 1:5:5:5 pixels. For each pixel:
//   bits 0..4   = top 5 bits of byte 0
//   bits 5..9   = top 5 bits of byte 1
//   bits 10..14 = top 5 bits of byte 2
//   bit 15      = top bit of byte 3
//   src - source; 16 bytes read per iteration with movdqa, so it must be
//         16-byte aligned.
//   dst - destination; 8 bytes written per iteration with movq (no alignment
//         requirement).
//   pix - number of pixels. 4 are processed per iteration and the loop repeats
//         while the remaining count is > 0 (body runs at least once).
603void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000604 asm volatile (
// xmm4 = 0x0000001f, xmm5 = 0x000003e0, xmm6 = 0x00007c00,
// xmm7 = 0xffff8000 per 32-bit lane.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000605 "pcmpeqb %%xmm4,%%xmm4 \n"
606 "psrld $0x1b,%%xmm4 \n"
607 "movdqa %%xmm4,%%xmm5 \n"
608 "pslld $0x5,%%xmm5 \n"
609 "movdqa %%xmm4,%%xmm6 \n"
610 "pslld $0xa,%%xmm6 \n"
611 "pcmpeqb %%xmm7,%%xmm7 \n"
612 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000613 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000614 "1: \n"
615 "movdqa (%0),%%xmm0 \n"
616 "movdqa %%xmm0,%%xmm1 \n"
617 "movdqa %%xmm0,%%xmm2 \n"
618 "movdqa %%xmm0,%%xmm3 \n"
// Arithmetic shift keeps the lane sign-extended from bit 15 so the signed
// pack below does not saturate.
619 "psrad $0x10,%%xmm0 \n"
620 "psrld $0x3,%%xmm1 \n"
621 "psrld $0x6,%%xmm2 \n"
622 "psrld $0x9,%%xmm3 \n"
623 "pand %%xmm7,%%xmm0 \n"
624 "pand %%xmm4,%%xmm1 \n"
625 "pand %%xmm5,%%xmm2 \n"
626 "pand %%xmm6,%%xmm3 \n"
627 "por %%xmm1,%%xmm0 \n"
628 "por %%xmm3,%%xmm2 \n"
629 "por %%xmm2,%%xmm0 \n"
630 "packssdw %%xmm0,%%xmm0 \n"
631 "lea 0x10(%0),%0 \n"
632 "movq %%xmm0,(%1) \n"
633 "lea 0x8(%1),%1 \n"
634 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000635 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000636 : "+r"(src), // %0
637 "+r"(dst), // %1
638 "+r"(pix) // %2
639 :
640 : "memory", "cc"
641#if defined(__SSE2__)
642 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
643#endif
644 );
645}
646
// Packs a row of 4-byte pixels into 16-bit 4:4:4:4 pixels by keeping the top
// nibble of every source byte. Each pair of source bytes (b0, b1) becomes the
// single output byte (b1 & 0xf0) | (b0 >> 4).
//   src - source; 16 bytes read per iteration with movdqa, so it must be
//         16-byte aligned.
//   dst - destination; 8 bytes written per iteration with movq (no alignment
//         requirement).
//   pix - number of pixels. 4 are processed per iteration and the loop repeats
//         while the remaining count is > 0 (body runs at least once).
// NOTE(review): xmm2 is listed as clobbered but is not used by the code.
647void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000648 asm volatile (
// xmm4 = 0xf000 per word; xmm3 = 0x00f0 per word.
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000649 "pcmpeqb %%xmm4,%%xmm4 \n"
650 "psllw $0xc,%%xmm4 \n"
651 "movdqa %%xmm4,%%xmm3 \n"
652 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000653 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000654 "1: \n"
655 "movdqa (%0),%%xmm0 \n"
656 "movdqa %%xmm0,%%xmm1 \n"
657 "pand %%xmm3,%%xmm0 \n"
658 "pand %%xmm4,%%xmm1 \n"
// The masked bits stay inside their own 16-bit word after these 64-bit shifts.
659 "psrlq $0x4,%%xmm0 \n"
660 "psrlq $0x8,%%xmm1 \n"
661 "por %%xmm1,%%xmm0 \n"
662 "packuswb %%xmm0,%%xmm0 \n"
663 "lea 0x10(%0),%0 \n"
664 "movq %%xmm0,(%1) \n"
665 "lea 0x8(%1),%1 \n"
666 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000667 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000668 : "+r"(src), // %0
669 "+r"(dst), // %1
670 "+r"(pix) // %2
671 :
672 : "memory", "cc"
673#if defined(__SSE2__)
674 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
675#endif
676 );
677}
678
// Computes one output byte per 4-byte pixel as a weighted sum of the pixel's
// bytes: y = ((w0*b0 + w1*b1 + w2*b2 + w3*b3) >> 7) + 16, with the weights w
// taken from kARGBToY and the bias from kAddY16.
//   src_argb - source; 64 bytes read per iteration with movdqa, so it must be
//              16-byte aligned.
//   dst_y    - destination; 16 bytes written per iteration with movdqa, so it
//              must be 16-byte aligned.
//   pix      - number of pixels. 16 are processed per iteration and the loop
//              repeats while the remaining count is > 0 (body runs at least
//              once).
fbarchard@google.comb6149762011-11-07 21:58:52 +0000679void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000680 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000681 "movdqa %4,%%xmm5 \n"
682 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000683 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000684 "1: \n"
685 "movdqa (%0),%%xmm0 \n"
686 "movdqa 0x10(%0),%%xmm1 \n"
687 "movdqa 0x20(%0),%%xmm2 \n"
688 "movdqa 0x30(%0),%%xmm3 \n"
// pmaddubsw: unsigned pixel byte * signed weight, adjacent pairs summed into
// 16-bit words (two partial sums per pixel).
689 "pmaddubsw %%xmm4,%%xmm0 \n"
690 "pmaddubsw %%xmm4,%%xmm1 \n"
691 "pmaddubsw %%xmm4,%%xmm2 \n"
692 "pmaddubsw %%xmm4,%%xmm3 \n"
693 "lea 0x40(%0),%0 \n"
// phaddw adds the two partial sums of each pixel.
694 "phaddw %%xmm1,%%xmm0 \n"
695 "phaddw %%xmm3,%%xmm2 \n"
696 "psrlw $0x7,%%xmm0 \n"
697 "psrlw $0x7,%%xmm2 \n"
698 "packuswb %%xmm2,%%xmm0 \n"
699 "paddb %%xmm5,%%xmm0 \n"
// Flags from this sub feed the jg; the movdqa and lea after it leave flags
// untouched.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000700 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000701 "movdqa %%xmm0,(%1) \n"
702 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000703 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000704 : "+r"(src_argb), // %0
705 "+r"(dst_y), // %1
706 "+r"(pix) // %2
707 : "m"(kARGBToY), // %3
708 "m"(kAddY16) // %4
709 : "memory", "cc"
710#if defined(__SSE2__)
711 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
712#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000713 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000714}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000715
// Same computation as ARGBToYRow_SSSE3, but the row data is accessed with
// movdqu so neither src_argb nor dst_y needs to be 16-byte aligned:
// y = ((w0*b0 + w1*b1 + w2*b2 + w3*b3) >> 7) + 16, with the weights w taken
// from kARGBToY and the bias from kAddY16.
//   src_argb - source; 64 bytes read per iteration.
//   dst_y    - destination; 16 bytes written per iteration.
//   pix      - number of pixels. 16 are processed per iteration and the loop
//              repeats while the remaining count is > 0 (body runs at least
//              once).
// The two constant tables are still loaded with movdqa.
716void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000717 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000718 "movdqa %4,%%xmm5 \n"
719 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000720 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000721 "1: \n"
722 "movdqu (%0),%%xmm0 \n"
723 "movdqu 0x10(%0),%%xmm1 \n"
724 "movdqu 0x20(%0),%%xmm2 \n"
725 "movdqu 0x30(%0),%%xmm3 \n"
// pmaddubsw: unsigned pixel byte * signed weight, adjacent pairs summed into
// 16-bit words (two partial sums per pixel).
726 "pmaddubsw %%xmm4,%%xmm0 \n"
727 "pmaddubsw %%xmm4,%%xmm1 \n"
728 "pmaddubsw %%xmm4,%%xmm2 \n"
729 "pmaddubsw %%xmm4,%%xmm3 \n"
730 "lea 0x40(%0),%0 \n"
// phaddw adds the two partial sums of each pixel.
731 "phaddw %%xmm1,%%xmm0 \n"
732 "phaddw %%xmm3,%%xmm2 \n"
733 "psrlw $0x7,%%xmm0 \n"
734 "psrlw $0x7,%%xmm2 \n"
735 "packuswb %%xmm2,%%xmm0 \n"
736 "paddb %%xmm5,%%xmm0 \n"
// Flags from this sub feed the jg; the movdqu and lea after it leave flags
// untouched.
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000737 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000738 "movdqu %%xmm0,(%1) \n"
739 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000740 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000741 : "+r"(src_argb), // %0
742 "+r"(dst_y), // %1
743 "+r"(pix) // %2
744 : "m"(kARGBToY), // %3
745 "m"(kAddY16) // %4
746 : "memory", "cc"
747#if defined(__SSE2__)
748 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
749#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000750 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000751}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000752
fbarchard@google.com714050a2012-02-17 22:59:56 +0000753// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000754// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
755// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
756// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000757// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000758void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
759 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000760 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000761 "movdqa %0,%%xmm4 \n"
762 "movdqa %1,%%xmm3 \n"
763 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000764 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000765 : "m"(kARGBToU), // %0
766 "m"(kARGBToV), // %1
767 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000768 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000769 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000770 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000771 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000772 "1: \n"
773 "movdqa (%0),%%xmm0 \n"
774 "movdqa 0x10(%0),%%xmm1 \n"
775 "movdqa 0x20(%0),%%xmm2 \n"
776 "movdqa 0x30(%0),%%xmm6 \n"
777 "pavgb (%0,%4,1),%%xmm0 \n"
778 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
779 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
780 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
781 "lea 0x40(%0),%0 \n"
782 "movdqa %%xmm0,%%xmm7 \n"
783 "shufps $0x88,%%xmm1,%%xmm0 \n"
784 "shufps $0xdd,%%xmm1,%%xmm7 \n"
785 "pavgb %%xmm7,%%xmm0 \n"
786 "movdqa %%xmm2,%%xmm7 \n"
787 "shufps $0x88,%%xmm6,%%xmm2 \n"
788 "shufps $0xdd,%%xmm6,%%xmm7 \n"
789 "pavgb %%xmm7,%%xmm2 \n"
790 "movdqa %%xmm0,%%xmm1 \n"
791 "movdqa %%xmm2,%%xmm6 \n"
792 "pmaddubsw %%xmm4,%%xmm0 \n"
793 "pmaddubsw %%xmm4,%%xmm2 \n"
794 "pmaddubsw %%xmm3,%%xmm1 \n"
795 "pmaddubsw %%xmm3,%%xmm6 \n"
796 "phaddw %%xmm2,%%xmm0 \n"
797 "phaddw %%xmm6,%%xmm1 \n"
798 "psraw $0x8,%%xmm0 \n"
799 "psraw $0x8,%%xmm1 \n"
800 "packsswb %%xmm1,%%xmm0 \n"
801 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000802 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000803 "movlps %%xmm0,(%1) \n"
804 "movhps %%xmm0,(%1,%2,1) \n"
805 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000806 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000807 : "+r"(src_argb0), // %0
808 "+r"(dst_u), // %1
809 "+r"(dst_v), // %2
810 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000811 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000812 : "memory", "cc"
813#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000814 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000815#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000816 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000817}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000818
819void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
820 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000821 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000822 "movdqa %0,%%xmm4 \n"
823 "movdqa %1,%%xmm3 \n"
824 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000825 :
826 : "m"(kARGBToU), // %0
827 "m"(kARGBToV), // %1
828 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000829 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000830 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000831 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000832 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000833 "1: \n"
834 "movdqu (%0),%%xmm0 \n"
835 "movdqu 0x10(%0),%%xmm1 \n"
836 "movdqu 0x20(%0),%%xmm2 \n"
837 "movdqu 0x30(%0),%%xmm6 \n"
838 "movdqu (%0,%4,1),%%xmm7 \n"
839 "pavgb %%xmm7,%%xmm0 \n"
840 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
841 "pavgb %%xmm7,%%xmm1 \n"
842 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
843 "pavgb %%xmm7,%%xmm2 \n"
844 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
845 "pavgb %%xmm7,%%xmm6 \n"
846 "lea 0x40(%0),%0 \n"
847 "movdqa %%xmm0,%%xmm7 \n"
848 "shufps $0x88,%%xmm1,%%xmm0 \n"
849 "shufps $0xdd,%%xmm1,%%xmm7 \n"
850 "pavgb %%xmm7,%%xmm0 \n"
851 "movdqa %%xmm2,%%xmm7 \n"
852 "shufps $0x88,%%xmm6,%%xmm2 \n"
853 "shufps $0xdd,%%xmm6,%%xmm7 \n"
854 "pavgb %%xmm7,%%xmm2 \n"
855 "movdqa %%xmm0,%%xmm1 \n"
856 "movdqa %%xmm2,%%xmm6 \n"
857 "pmaddubsw %%xmm4,%%xmm0 \n"
858 "pmaddubsw %%xmm4,%%xmm2 \n"
859 "pmaddubsw %%xmm3,%%xmm1 \n"
860 "pmaddubsw %%xmm3,%%xmm6 \n"
861 "phaddw %%xmm2,%%xmm0 \n"
862 "phaddw %%xmm6,%%xmm1 \n"
863 "psraw $0x8,%%xmm0 \n"
864 "psraw $0x8,%%xmm1 \n"
865 "packsswb %%xmm1,%%xmm0 \n"
866 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000867 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000868 "movlps %%xmm0,(%1) \n"
869 "movhps %%xmm0,(%1,%2,1) \n"
870 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000871 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000872 : "+r"(src_argb0), // %0
873 "+r"(dst_u), // %1
874 "+r"(dst_v), // %2
875 "+rm"(width) // %3
876 : "r"(static_cast<intptr_t>(src_stride_argb))
877 : "memory", "cc"
878#if defined(__SSE2__)
879 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
880#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000881 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000882}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000883
fbarchard@google.com714050a2012-02-17 22:59:56 +0000884void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000885 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000886 "movdqa %4,%%xmm5 \n"
887 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000888 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000889 "1: \n"
890 "movdqa (%0),%%xmm0 \n"
891 "movdqa 0x10(%0),%%xmm1 \n"
892 "movdqa 0x20(%0),%%xmm2 \n"
893 "movdqa 0x30(%0),%%xmm3 \n"
894 "pmaddubsw %%xmm4,%%xmm0 \n"
895 "pmaddubsw %%xmm4,%%xmm1 \n"
896 "pmaddubsw %%xmm4,%%xmm2 \n"
897 "pmaddubsw %%xmm4,%%xmm3 \n"
898 "lea 0x40(%0),%0 \n"
899 "phaddw %%xmm1,%%xmm0 \n"
900 "phaddw %%xmm3,%%xmm2 \n"
901 "psrlw $0x7,%%xmm0 \n"
902 "psrlw $0x7,%%xmm2 \n"
903 "packuswb %%xmm2,%%xmm0 \n"
904 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000905 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000906 "movdqa %%xmm0,(%1) \n"
907 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000908 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000909 : "+r"(src_bgra), // %0
910 "+r"(dst_y), // %1
911 "+r"(pix) // %2
912 : "m"(kBGRAToY), // %3
913 "m"(kAddY16) // %4
914 : "memory", "cc"
915#if defined(__SSE2__)
916 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000917#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000918 );
919}
920
921void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000922 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000923 "movdqa %4,%%xmm5 \n"
924 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000925 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000926 "1: \n"
927 "movdqu (%0),%%xmm0 \n"
928 "movdqu 0x10(%0),%%xmm1 \n"
929 "movdqu 0x20(%0),%%xmm2 \n"
930 "movdqu 0x30(%0),%%xmm3 \n"
931 "pmaddubsw %%xmm4,%%xmm0 \n"
932 "pmaddubsw %%xmm4,%%xmm1 \n"
933 "pmaddubsw %%xmm4,%%xmm2 \n"
934 "pmaddubsw %%xmm4,%%xmm3 \n"
935 "lea 0x40(%0),%0 \n"
936 "phaddw %%xmm1,%%xmm0 \n"
937 "phaddw %%xmm3,%%xmm2 \n"
938 "psrlw $0x7,%%xmm0 \n"
939 "psrlw $0x7,%%xmm2 \n"
940 "packuswb %%xmm2,%%xmm0 \n"
941 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000942 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000943 "movdqu %%xmm0,(%1) \n"
944 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000945 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000946 : "+r"(src_bgra), // %0
947 "+r"(dst_y), // %1
948 "+r"(pix) // %2
949 : "m"(kBGRAToY), // %3
950 "m"(kAddY16) // %4
951 : "memory", "cc"
952#if defined(__SSE2__)
953 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
954#endif
955 );
956}
957
958void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
959 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000960 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000961 "movdqa %0,%%xmm4 \n"
962 "movdqa %1,%%xmm3 \n"
963 "movdqa %2,%%xmm5 \n"
964 :
965 : "m"(kBGRAToU), // %0
966 "m"(kBGRAToV), // %1
967 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +0000968 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000969 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000970 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000971 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000972 "1: \n"
973 "movdqa (%0),%%xmm0 \n"
974 "movdqa 0x10(%0),%%xmm1 \n"
975 "movdqa 0x20(%0),%%xmm2 \n"
976 "movdqa 0x30(%0),%%xmm6 \n"
977 "pavgb (%0,%4,1),%%xmm0 \n"
978 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
979 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
980 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
981 "lea 0x40(%0),%0 \n"
982 "movdqa %%xmm0,%%xmm7 \n"
983 "shufps $0x88,%%xmm1,%%xmm0 \n"
984 "shufps $0xdd,%%xmm1,%%xmm7 \n"
985 "pavgb %%xmm7,%%xmm0 \n"
986 "movdqa %%xmm2,%%xmm7 \n"
987 "shufps $0x88,%%xmm6,%%xmm2 \n"
988 "shufps $0xdd,%%xmm6,%%xmm7 \n"
989 "pavgb %%xmm7,%%xmm2 \n"
990 "movdqa %%xmm0,%%xmm1 \n"
991 "movdqa %%xmm2,%%xmm6 \n"
992 "pmaddubsw %%xmm4,%%xmm0 \n"
993 "pmaddubsw %%xmm4,%%xmm2 \n"
994 "pmaddubsw %%xmm3,%%xmm1 \n"
995 "pmaddubsw %%xmm3,%%xmm6 \n"
996 "phaddw %%xmm2,%%xmm0 \n"
997 "phaddw %%xmm6,%%xmm1 \n"
998 "psraw $0x8,%%xmm0 \n"
999 "psraw $0x8,%%xmm1 \n"
1000 "packsswb %%xmm1,%%xmm0 \n"
1001 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001002 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001003 "movlps %%xmm0,(%1) \n"
1004 "movhps %%xmm0,(%1,%2,1) \n"
1005 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001006 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001007 : "+r"(src_bgra0), // %0
1008 "+r"(dst_u), // %1
1009 "+r"(dst_v), // %2
1010 "+rm"(width) // %3
1011 : "r"(static_cast<intptr_t>(src_stride_bgra))
1012 : "memory", "cc"
1013#if defined(__SSE2__)
1014 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1015#endif
1016 );
1017}
1018
1019void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1020 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001021 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001022 "movdqa %0,%%xmm4 \n"
1023 "movdqa %1,%%xmm3 \n"
1024 "movdqa %2,%%xmm5 \n"
1025 :
1026 : "m"(kBGRAToU), // %0
1027 "m"(kBGRAToV), // %1
1028 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001029 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001030 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001031 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001032 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001033 "1: \n"
1034 "movdqu (%0),%%xmm0 \n"
1035 "movdqu 0x10(%0),%%xmm1 \n"
1036 "movdqu 0x20(%0),%%xmm2 \n"
1037 "movdqu 0x30(%0),%%xmm6 \n"
1038 "movdqu (%0,%4,1),%%xmm7 \n"
1039 "pavgb %%xmm7,%%xmm0 \n"
1040 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1041 "pavgb %%xmm7,%%xmm1 \n"
1042 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1043 "pavgb %%xmm7,%%xmm2 \n"
1044 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1045 "pavgb %%xmm7,%%xmm6 \n"
1046 "lea 0x40(%0),%0 \n"
1047 "movdqa %%xmm0,%%xmm7 \n"
1048 "shufps $0x88,%%xmm1,%%xmm0 \n"
1049 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1050 "pavgb %%xmm7,%%xmm0 \n"
1051 "movdqa %%xmm2,%%xmm7 \n"
1052 "shufps $0x88,%%xmm6,%%xmm2 \n"
1053 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1054 "pavgb %%xmm7,%%xmm2 \n"
1055 "movdqa %%xmm0,%%xmm1 \n"
1056 "movdqa %%xmm2,%%xmm6 \n"
1057 "pmaddubsw %%xmm4,%%xmm0 \n"
1058 "pmaddubsw %%xmm4,%%xmm2 \n"
1059 "pmaddubsw %%xmm3,%%xmm1 \n"
1060 "pmaddubsw %%xmm3,%%xmm6 \n"
1061 "phaddw %%xmm2,%%xmm0 \n"
1062 "phaddw %%xmm6,%%xmm1 \n"
1063 "psraw $0x8,%%xmm0 \n"
1064 "psraw $0x8,%%xmm1 \n"
1065 "packsswb %%xmm1,%%xmm0 \n"
1066 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001067 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001068 "movlps %%xmm0,(%1) \n"
1069 "movhps %%xmm0,(%1,%2,1) \n"
1070 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001071 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001072 : "+r"(src_bgra0), // %0
1073 "+r"(dst_u), // %1
1074 "+r"(dst_v), // %2
1075 "+rm"(width) // %3
1076 : "r"(static_cast<intptr_t>(src_stride_bgra))
1077 : "memory", "cc"
1078#if defined(__SSE2__)
1079 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1080#endif
1081 );
1082}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001083
1084void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001085 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001086 "movdqa %4,%%xmm5 \n"
1087 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001088 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001089 "1: \n"
1090 "movdqa (%0),%%xmm0 \n"
1091 "movdqa 0x10(%0),%%xmm1 \n"
1092 "movdqa 0x20(%0),%%xmm2 \n"
1093 "movdqa 0x30(%0),%%xmm3 \n"
1094 "pmaddubsw %%xmm4,%%xmm0 \n"
1095 "pmaddubsw %%xmm4,%%xmm1 \n"
1096 "pmaddubsw %%xmm4,%%xmm2 \n"
1097 "pmaddubsw %%xmm4,%%xmm3 \n"
1098 "lea 0x40(%0),%0 \n"
1099 "phaddw %%xmm1,%%xmm0 \n"
1100 "phaddw %%xmm3,%%xmm2 \n"
1101 "psrlw $0x7,%%xmm0 \n"
1102 "psrlw $0x7,%%xmm2 \n"
1103 "packuswb %%xmm2,%%xmm0 \n"
1104 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001105 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001106 "movdqa %%xmm0,(%1) \n"
1107 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001108 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001109 : "+r"(src_abgr), // %0
1110 "+r"(dst_y), // %1
1111 "+r"(pix) // %2
1112 : "m"(kABGRToY), // %3
1113 "m"(kAddY16) // %4
1114 : "memory", "cc"
1115#if defined(__SSE2__)
1116 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1117#endif
1118 );
1119}
1120
1121void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001122 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001123 "movdqa %4,%%xmm5 \n"
1124 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001125 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001126 "1: \n"
1127 "movdqu (%0),%%xmm0 \n"
1128 "movdqu 0x10(%0),%%xmm1 \n"
1129 "movdqu 0x20(%0),%%xmm2 \n"
1130 "movdqu 0x30(%0),%%xmm3 \n"
1131 "pmaddubsw %%xmm4,%%xmm0 \n"
1132 "pmaddubsw %%xmm4,%%xmm1 \n"
1133 "pmaddubsw %%xmm4,%%xmm2 \n"
1134 "pmaddubsw %%xmm4,%%xmm3 \n"
1135 "lea 0x40(%0),%0 \n"
1136 "phaddw %%xmm1,%%xmm0 \n"
1137 "phaddw %%xmm3,%%xmm2 \n"
1138 "psrlw $0x7,%%xmm0 \n"
1139 "psrlw $0x7,%%xmm2 \n"
1140 "packuswb %%xmm2,%%xmm0 \n"
1141 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001142 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001143 "movdqu %%xmm0,(%1) \n"
1144 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001145 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001146 : "+r"(src_abgr), // %0
1147 "+r"(dst_y), // %1
1148 "+r"(pix) // %2
1149 : "m"(kABGRToY), // %3
1150 "m"(kAddY16) // %4
1151 : "memory", "cc"
1152#if defined(__SSE2__)
1153 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1154#endif
1155 );
1156}
1157
1158void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1159 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001160 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001161 "movdqa %0,%%xmm4 \n"
1162 "movdqa %1,%%xmm3 \n"
1163 "movdqa %2,%%xmm5 \n"
1164 :
1165 : "m"(kABGRToU), // %0
1166 "m"(kABGRToV), // %1
1167 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001168 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001169 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001170 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001171 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001172 "1: \n"
1173 "movdqa (%0),%%xmm0 \n"
1174 "movdqa 0x10(%0),%%xmm1 \n"
1175 "movdqa 0x20(%0),%%xmm2 \n"
1176 "movdqa 0x30(%0),%%xmm6 \n"
1177 "pavgb (%0,%4,1),%%xmm0 \n"
1178 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1179 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1180 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1181 "lea 0x40(%0),%0 \n"
1182 "movdqa %%xmm0,%%xmm7 \n"
1183 "shufps $0x88,%%xmm1,%%xmm0 \n"
1184 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1185 "pavgb %%xmm7,%%xmm0 \n"
1186 "movdqa %%xmm2,%%xmm7 \n"
1187 "shufps $0x88,%%xmm6,%%xmm2 \n"
1188 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1189 "pavgb %%xmm7,%%xmm2 \n"
1190 "movdqa %%xmm0,%%xmm1 \n"
1191 "movdqa %%xmm2,%%xmm6 \n"
1192 "pmaddubsw %%xmm4,%%xmm0 \n"
1193 "pmaddubsw %%xmm4,%%xmm2 \n"
1194 "pmaddubsw %%xmm3,%%xmm1 \n"
1195 "pmaddubsw %%xmm3,%%xmm6 \n"
1196 "phaddw %%xmm2,%%xmm0 \n"
1197 "phaddw %%xmm6,%%xmm1 \n"
1198 "psraw $0x8,%%xmm0 \n"
1199 "psraw $0x8,%%xmm1 \n"
1200 "packsswb %%xmm1,%%xmm0 \n"
1201 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001202 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001203 "movlps %%xmm0,(%1) \n"
1204 "movhps %%xmm0,(%1,%2,1) \n"
1205 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001206 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001207 : "+r"(src_abgr0), // %0
1208 "+r"(dst_u), // %1
1209 "+r"(dst_v), // %2
1210 "+rm"(width) // %3
1211 : "r"(static_cast<intptr_t>(src_stride_abgr))
1212 : "memory", "cc"
1213#if defined(__SSE2__)
1214 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1215#endif
1216 );
1217}
1218
1219void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1220 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001221 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001222 "movdqa %0,%%xmm4 \n"
1223 "movdqa %1,%%xmm3 \n"
1224 "movdqa %2,%%xmm5 \n"
1225 :
1226 : "m"(kABGRToU), // %0
1227 "m"(kABGRToV), // %1
1228 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001229 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001230 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001231 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001232 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001233 "1: \n"
1234 "movdqu (%0),%%xmm0 \n"
1235 "movdqu 0x10(%0),%%xmm1 \n"
1236 "movdqu 0x20(%0),%%xmm2 \n"
1237 "movdqu 0x30(%0),%%xmm6 \n"
1238 "movdqu (%0,%4,1),%%xmm7 \n"
1239 "pavgb %%xmm7,%%xmm0 \n"
1240 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1241 "pavgb %%xmm7,%%xmm1 \n"
1242 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1243 "pavgb %%xmm7,%%xmm2 \n"
1244 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1245 "pavgb %%xmm7,%%xmm6 \n"
1246 "lea 0x40(%0),%0 \n"
1247 "movdqa %%xmm0,%%xmm7 \n"
1248 "shufps $0x88,%%xmm1,%%xmm0 \n"
1249 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1250 "pavgb %%xmm7,%%xmm0 \n"
1251 "movdqa %%xmm2,%%xmm7 \n"
1252 "shufps $0x88,%%xmm6,%%xmm2 \n"
1253 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1254 "pavgb %%xmm7,%%xmm2 \n"
1255 "movdqa %%xmm0,%%xmm1 \n"
1256 "movdqa %%xmm2,%%xmm6 \n"
1257 "pmaddubsw %%xmm4,%%xmm0 \n"
1258 "pmaddubsw %%xmm4,%%xmm2 \n"
1259 "pmaddubsw %%xmm3,%%xmm1 \n"
1260 "pmaddubsw %%xmm3,%%xmm6 \n"
1261 "phaddw %%xmm2,%%xmm0 \n"
1262 "phaddw %%xmm6,%%xmm1 \n"
1263 "psraw $0x8,%%xmm0 \n"
1264 "psraw $0x8,%%xmm1 \n"
1265 "packsswb %%xmm1,%%xmm0 \n"
1266 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001267 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001268 "movlps %%xmm0,(%1) \n"
1269 "movhps %%xmm0,(%1,%2,1) \n"
1270 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001271 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001272 : "+r"(src_abgr0), // %0
1273 "+r"(dst_u), // %1
1274 "+r"(dst_v), // %2
1275 "+rm"(width) // %3
1276 : "r"(static_cast<intptr_t>(src_stride_abgr))
1277 : "memory", "cc"
1278#if defined(__SSE2__)
1279 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1280#endif
1281 );
1282}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001283#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001284
fbarchard@google.come214fe32012-06-04 23:47:11 +00001285#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients, scaled by 64 so they fit in signed 8-bit lanes.
// 2.018 * 64 = 129 does not fit in an int8, so UB is clamped to 127.
#define UB 127 /* min(127, static_cast<int>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V coefficient sums times 128. Parenthesized so the macros
// keep their value when used inside a larger expression.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001300
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001301struct {
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001302 vec8 kUVToB; // 0
1303 vec8 kUVToG; // 16
1304 vec8 kUVToR; // 32
1305 vec16 kUVBiasB; // 48
1306 vec16 kUVBiasG; // 64
1307 vec16 kUVBiasR; // 80
1308 vec16 kYSub16; // 96
1309 vec16 kYToRgb; // 112
1310 vec8 kVUToB; // 128
1311 vec8 kVUToG; // 144
1312 vec8 kVUToR; // 160
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00001313} CONST SIMD_ALIGNED(kYuvConstants) = {
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001314 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1315 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1316 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1317 { BB, BB, BB, BB, BB, BB, BB, BB },
1318 { BG, BG, BG, BG, BG, BG, BG, BG },
1319 { BR, BR, BR, BR, BR, BR, BR, BR },
1320 { 16, 16, 16, 16, 16, 16, 16, 16 },
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001321 { YG, YG, YG, YG, YG, YG, YG, YG },
1322 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1323 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1324 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
fbarchard@google.com1c2d8be2011-11-17 21:57:54 +00001325};
1326
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001327
// The planar readers below address V as (%[u_buf],%[v_buf],1), so the caller
// must first turn %[v_buf] into the offset (v_buf - u_buf). All readers leave
// 8 interleaved UV byte pairs in xmm0 and use xmm1 as scratch.

// Read 8 UV from 444
#define READYUV444 \
    "movq (%[u_buf]),%%xmm0 \n" \
    "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x8(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x4(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Read 2 UV from 411, upsample to 8 UV
// NOTE: movd loads 4 bytes from each plane although only 2 are consumed per
// pass (the pointer advances by 2), so the last pass reads 2 bytes beyond the
// chroma samples it uses.
#define READYUV411 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x2(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
    "movq (%[uv_buf]),%%xmm0 \n" \
    "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001357
// Shared second half of YUVTORGB / YVUTORGB. On entry xmm0/xmm1/xmm2 hold the
// chroma products for the three output channels and xmm4 must be zero.
// Subtracts the per-channel bias, loads 8 Y bytes (advancing %[y_buf]),
// widens them with xmm4, applies kYSub16 / kYToRgb, adds the result to each
// channel, shifts right by 6 and packs every channel to unsigned bytes.
// Uses xmm3 as scratch.
#define YUVTORGB_ADD_Y_AND_PACK \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

// Convert 8 pixels: 8 UV and 8 Y
// Input: xmm0 = 8 UV byte pairs. Output: xmm0/xmm1/xmm2 hold the channels
// computed from kUVToB/kUVToG/kUVToR in their low 8 bytes.
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
    YUVTORGB_ADD_Y_AND_PACK

// Convert 8 pixels: 8 VU and 8 Y
// Same as YUVTORGB, but uses the kVUToB/kVUToG/kVUToR coefficient rows.
#define YVUTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
    YUVTORGB_ADD_Y_AND_PACK
fbarchard@google.come214fe32012-06-04 23:47:11 +00001407
1408void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001409 const uint8* u_buf,
1410 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001411 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001412 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001413 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001414 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001415 "pcmpeqb %%xmm5,%%xmm5 \n"
1416 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001417 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001418 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001419 READYUV444
1420 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001421 "punpcklbw %%xmm1,%%xmm0 \n"
1422 "punpcklbw %%xmm5,%%xmm2 \n"
1423 "movdqa %%xmm0,%%xmm1 \n"
1424 "punpcklwd %%xmm2,%%xmm0 \n"
1425 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001426 "movdqa %%xmm0,(%[argb_buf]) \n"
1427 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1428 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1429 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001430 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001431 : [y_buf]"+r"(y_buf), // %[y_buf]
1432 [u_buf]"+r"(u_buf), // %[u_buf]
1433 [v_buf]"+r"(v_buf), // %[v_buf]
1434 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1435 [width]"+rm"(width) // %[width]
1436 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001437 : "memory", "cc"
1438#if defined(__SSE2__)
1439 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1440#endif
1441 );
1442}
1443
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001444void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1445 const uint8* u_buf,
1446 const uint8* v_buf,
1447 uint8* rgb24_buf,
1448 int width) {
1449// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
1450#ifdef __APPLE__
1451 asm volatile (
1452 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm5 \n"
1453 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm6 \n"
1454 :: [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24),
1455 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0));
1456#endif
1457
1458 asm volatile (
1459#ifndef __APPLE__
1460 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm5 \n"
1461 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm6 \n"
1462#endif
1463 "sub %[u_buf],%[v_buf] \n"
1464 "pxor %%xmm4,%%xmm4 \n"
1465 ".p2align 4 \n"
1466 "1: \n"
1467 READYUV422
1468 YUVTORGB
1469 "punpcklbw %%xmm1,%%xmm0 \n"
1470 "punpcklbw %%xmm2,%%xmm2 \n"
1471 "movdqa %%xmm0,%%xmm1 \n"
1472 "punpcklwd %%xmm2,%%xmm0 \n"
1473 "punpckhwd %%xmm2,%%xmm1 \n"
1474 "pshufb %%xmm5,%%xmm0 \n"
1475 "pshufb %%xmm6,%%xmm1 \n"
1476 "palignr $0xc,%%xmm0,%%xmm1 \n"
1477 "movq %%xmm0,(%[rgb24_buf]) \n"
1478 "movdqu %%xmm1,0x8(%[rgb24_buf]) \n"
1479 "lea 0x18(%[rgb24_buf]),%[rgb24_buf] \n"
1480 "sub $0x8,%[width] \n"
1481 "jg 1b \n"
1482 : [y_buf]"+r"(y_buf), // %[y_buf]
1483 [u_buf]"+r"(u_buf), // %[u_buf]
1484 [v_buf]"+r"(v_buf), // %[v_buf]
1485 [rgb24_buf]"+r"(rgb24_buf), // %[rgb24_buf]
1486 [width]"+rm"(width) // %[width]
1487 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1488#ifndef __APPLE__
1489 , [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24),
1490 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0)
1491#endif
1492 : "memory", "cc"
1493#if defined(__SSE2__)
1494 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1495#endif
1496 );
1497}
1498
1499void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1500 const uint8* u_buf,
1501 const uint8* v_buf,
1502 uint8* raw_buf,
1503 int width) {
1504#ifdef __APPLE__
1505 asm volatile (
1506 "movdqa %[kShuffleMaskARGBToRAW],%%xmm5 \n"
1507 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm6 \n"
1508 :: [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW),
1509 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0));
1510#endif
1511
1512 asm volatile (
1513#ifndef __APPLE__
1514 "movdqa %[kShuffleMaskARGBToRAW],%%xmm5 \n"
1515 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm6 \n"
1516#endif
1517 "sub %[u_buf],%[v_buf] \n"
1518 "pxor %%xmm4,%%xmm4 \n"
1519 ".p2align 4 \n"
1520 "1: \n"
1521 READYUV422
1522 YUVTORGB
1523 "punpcklbw %%xmm1,%%xmm0 \n"
1524 "punpcklbw %%xmm2,%%xmm2 \n"
1525 "movdqa %%xmm0,%%xmm1 \n"
1526 "punpcklwd %%xmm2,%%xmm0 \n"
1527 "punpckhwd %%xmm2,%%xmm1 \n"
1528 "pshufb %%xmm5,%%xmm0 \n"
1529 "pshufb %%xmm6,%%xmm1 \n"
1530 "palignr $0xc,%%xmm0,%%xmm1 \n"
1531 "movq %%xmm0,(%[raw_buf]) \n"
1532 "movdqu %%xmm1,0x8(%[raw_buf]) \n"
1533 "lea 0x18(%[raw_buf]),%[raw_buf] \n"
1534 "sub $0x8,%[width] \n"
1535 "jg 1b \n"
1536 : [y_buf]"+r"(y_buf), // %[y_buf]
1537 [u_buf]"+r"(u_buf), // %[u_buf]
1538 [v_buf]"+r"(v_buf), // %[v_buf]
1539 [raw_buf]"+r"(raw_buf), // %[raw_buf]
1540 [width]"+rm"(width) // %[width]
1541 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1542#ifndef __APPLE__
1543 , [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW),
1544 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0)
1545#endif
1546 : "memory", "cc"
1547#if defined(__SSE2__)
1548 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1549#endif
1550 );
1551}
1552
fbarchard@google.come214fe32012-06-04 23:47:11 +00001553void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001554 const uint8* u_buf,
1555 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001556 uint8* argb_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001557 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001558 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001559 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001560 "pcmpeqb %%xmm5,%%xmm5 \n"
1561 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001562 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001563 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001564 READYUV422
1565 YUVTORGB
fbarchard@google.come214fe32012-06-04 23:47:11 +00001566 "punpcklbw %%xmm1,%%xmm0 \n"
1567 "punpcklbw %%xmm5,%%xmm2 \n"
1568 "movdqa %%xmm0,%%xmm1 \n"
1569 "punpcklwd %%xmm2,%%xmm0 \n"
1570 "punpckhwd %%xmm2,%%xmm1 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001571 "movdqa %%xmm0,(%[argb_buf]) \n"
1572 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1573 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1574 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001575 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001576 : [y_buf]"+r"(y_buf), // %[y_buf]
1577 [u_buf]"+r"(u_buf), // %[u_buf]
1578 [v_buf]"+r"(v_buf), // %[v_buf]
1579 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1580 [width]"+rm"(width) // %[width]
1581 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001582 : "memory", "cc"
1583#if defined(__SSE2__)
1584 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1585#endif
1586 );
1587}
1588
// Per its name, converts one row of I411 planar YUV to 4-byte ARGB pixels.
// Each loop iteration runs READYUV411 + YUVTORGB and then stores 8 pixels
// (32 bytes) with movdqa, so argb_buf must be 16-byte aligned.
// The loop subtracts 8 from width and repeats while the result is > 0, so a
// width that is not a multiple of 8 still writes a full final group of 8.
// NOTE(review): READYUV411 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
1589void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1590 const uint8* u_buf,
1591 const uint8* v_buf,
1592 uint8* argb_buf,
1593 int width) {
1594 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001595 "sub %[u_buf],%[v_buf] \n"
// xmm5 = 0xff in every byte; xmm4 = 0.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001596 "pcmpeqb %%xmm5,%%xmm5 \n"
1597 "pxor %%xmm4,%%xmm4 \n"
1598 ".p2align 4 \n"
1599 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001600 READYUV411
1601 YUVTORGB
// Weave bytes so each output pixel is: xmm0 byte, xmm1 byte, xmm2 byte, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001602 "punpcklbw %%xmm1,%%xmm0 \n"
1603 "punpcklbw %%xmm5,%%xmm2 \n"
1604 "movdqa %%xmm0,%%xmm1 \n"
1605 "punpcklwd %%xmm2,%%xmm0 \n"
1606 "punpckhwd %%xmm2,%%xmm1 \n"
// Aligned stores of pixels 0-3 and 4-7.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001607 "movdqa %%xmm0,(%[argb_buf]) \n"
1608 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1609 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1610 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001611 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001612 : [y_buf]"+r"(y_buf), // %[y_buf]
1613 [u_buf]"+r"(u_buf), // %[u_buf]
1614 [v_buf]"+r"(v_buf), // %[v_buf]
1615 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1616 [width]"+rm"(width) // %[width]
1617 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1618 : "memory", "cc"
1619#if defined(__SSE2__)
1620 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1621#endif
1622 );
1623}
1624
// Per its name, converts one row of NV12 (Y plane plus interleaved UV plane)
// to 4-byte ARGB pixels. Each loop iteration runs READNV12 + YUVTORGB and
// stores 8 pixels (32 bytes) with movdqa, so argb_buf must be 16-byte aligned.
// width is consumed 8 at a time; the loop repeats while the remainder is > 0.
// NOTE(review): READNV12 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
1625void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1626 const uint8* uv_buf,
1627 uint8* argb_buf,
1628 int width) {
1629 asm volatile (
// xmm5 = 0xff in every byte; xmm4 = 0.
1630 "pcmpeqb %%xmm5,%%xmm5 \n"
1631 "pxor %%xmm4,%%xmm4 \n"
1632 ".p2align 4 \n"
1633 "1: \n"
1634 READNV12
1635 YUVTORGB
// Weave bytes so each output pixel is: xmm0 byte, xmm1 byte, xmm2 byte, 0xff.
1636 "punpcklbw %%xmm1,%%xmm0 \n"
1637 "punpcklbw %%xmm5,%%xmm2 \n"
1638 "movdqa %%xmm0,%%xmm1 \n"
1639 "punpcklwd %%xmm2,%%xmm0 \n"
1640 "punpckhwd %%xmm2,%%xmm1 \n"
// Aligned stores of pixels 0-3 and 4-7.
1641 "movdqa %%xmm0,(%[argb_buf]) \n"
1642 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1643 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1644 "sub $0x8,%[width] \n"
1645 "jg 1b \n"
1646 : [y_buf]"+r"(y_buf), // %[y_buf]
1647 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1648 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1649 [width]"+rm"(width) // %[width]
1650 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1651 : "memory", "cc"
1652#if defined(__SSE2__)
1653 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1654#endif
1655 );
1656}
1657
// Per its name, converts one row of NV21 (Y plane plus interleaved VU plane)
// to 4-byte ARGB pixels. Uses the same READNV12 loader as the NV12 variant but
// the YVUTORGB conversion macro instead of YUVTORGB; vu_buf is bound to the
// asm operand named uv_buf so the shared loader macro can reference it.
// Stores 8 pixels (32 bytes) per iteration with movdqa, so argb_buf must be
// 16-byte aligned. width is consumed 8 at a time.
// NOTE(review): READNV12 / YVUTORGB are defined outside this section;
// presumably YVUTORGB swaps the U/V roles - confirm against its definition.
1658void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1659 const uint8* vu_buf,
1660 uint8* argb_buf,
1661 int width) {
1662 asm volatile (
// xmm5 = 0xff in every byte; xmm4 = 0.
1663 "pcmpeqb %%xmm5,%%xmm5 \n"
1664 "pxor %%xmm4,%%xmm4 \n"
1665 ".p2align 4 \n"
1666 "1: \n"
1667 READNV12
1668 YVUTORGB
// Weave bytes so each output pixel is: xmm0 byte, xmm1 byte, xmm2 byte, 0xff.
1669 "punpcklbw %%xmm1,%%xmm0 \n"
1670 "punpcklbw %%xmm5,%%xmm2 \n"
1671 "movdqa %%xmm0,%%xmm1 \n"
1672 "punpcklwd %%xmm2,%%xmm0 \n"
1673 "punpckhwd %%xmm2,%%xmm1 \n"
// Aligned stores of pixels 0-3 and 4-7.
1674 "movdqa %%xmm0,(%[argb_buf]) \n"
1675 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1676 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1677 "sub $0x8,%[width] \n"
1678 "jg 1b \n"
1679 : [y_buf]"+r"(y_buf), // %[y_buf]
1680 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1681 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1682 [width]"+rm"(width) // %[width]
1683 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001684 : "memory", "cc"
1685#if defined(__SSE2__)
1686 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1687#endif
1688 );
1689}
1690
// Per its name, converts one row of I444 planar YUV to 4-byte ARGB pixels
// without requiring an aligned destination: the two 16-byte stores per
// iteration use movdqu. Each iteration runs READYUV444 + YUVTORGB and writes
// 8 pixels (32 bytes); width is consumed 8 at a time while the remainder > 0.
// NOTE(review): READYUV444 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
1691void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1692 const uint8* u_buf,
1693 const uint8* v_buf,
1694 uint8* argb_buf,
1695 int width) {
1696 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001697 "sub %[u_buf],%[v_buf] \n"
// xmm5 = 0xff in every byte; xmm4 = 0.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001698 "pcmpeqb %%xmm5,%%xmm5 \n"
1699 "pxor %%xmm4,%%xmm4 \n"
1700 ".p2align 4 \n"
1701 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001702 READYUV444
1703 YUVTORGB
// Weave bytes so each output pixel is: xmm0 byte, xmm1 byte, xmm2 byte, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001704 "punpcklbw %%xmm1,%%xmm0 \n"
1705 "punpcklbw %%xmm5,%%xmm2 \n"
1706 "movdqa %%xmm0,%%xmm1 \n"
1707 "punpcklwd %%xmm2,%%xmm0 \n"
1708 "punpckhwd %%xmm2,%%xmm1 \n"
// Unaligned stores of pixels 0-3 and 4-7.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001709 "movdqu %%xmm0,(%[argb_buf]) \n"
1710 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1711 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1712 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001713 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001714 : [y_buf]"+r"(y_buf), // %[y_buf]
1715 [u_buf]"+r"(u_buf), // %[u_buf]
1716 [v_buf]"+r"(v_buf), // %[v_buf]
1717 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1718 [width]"+rm"(width) // %[width]
1719 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001720 : "memory", "cc"
1721#if defined(__SSE2__)
1722 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1723#endif
1724 );
1725}
1726
// Per its name, converts one row of I422 planar YUV to 4-byte ARGB pixels
// without requiring an aligned destination: the two 16-byte stores per
// iteration use movdqu. Each iteration runs READYUV422 + YUVTORGB and writes
// 8 pixels (32 bytes); width is consumed 8 at a time while the remainder > 0.
// NOTE(review): READYUV422 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
1727void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1728 const uint8* u_buf,
1729 const uint8* v_buf,
1730 uint8* argb_buf,
1731 int width) {
1732 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001733 "sub %[u_buf],%[v_buf] \n"
// xmm5 = 0xff in every byte; xmm4 = 0.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001734 "pcmpeqb %%xmm5,%%xmm5 \n"
1735 "pxor %%xmm4,%%xmm4 \n"
1736 ".p2align 4 \n"
1737 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001738 READYUV422
1739 YUVTORGB
// Weave bytes so each output pixel is: xmm0 byte, xmm1 byte, xmm2 byte, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001740 "punpcklbw %%xmm1,%%xmm0 \n"
1741 "punpcklbw %%xmm5,%%xmm2 \n"
1742 "movdqa %%xmm0,%%xmm1 \n"
1743 "punpcklwd %%xmm2,%%xmm0 \n"
1744 "punpckhwd %%xmm2,%%xmm1 \n"
// Unaligned stores of pixels 0-3 and 4-7.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001745 "movdqu %%xmm0,(%[argb_buf]) \n"
1746 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1747 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1748 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001749 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001750 : [y_buf]"+r"(y_buf), // %[y_buf]
1751 [u_buf]"+r"(u_buf), // %[u_buf]
1752 [v_buf]"+r"(v_buf), // %[v_buf]
1753 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1754 [width]"+rm"(width) // %[width]
1755 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001756 : "memory", "cc"
1757#if defined(__SSE2__)
1758 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1759#endif
1760 );
1761}
1762
// Per its name, converts one row of I411 planar YUV to 4-byte ARGB pixels
// without requiring an aligned destination: the two 16-byte stores per
// iteration use movdqu. Each iteration runs READYUV411 + YUVTORGB and writes
// 8 pixels (32 bytes); width is consumed 8 at a time while the remainder > 0.
// NOTE(review): READYUV411 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
1763void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1764 const uint8* u_buf,
1765 const uint8* v_buf,
1766 uint8* argb_buf,
1767 int width) {
1768 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001769 "sub %[u_buf],%[v_buf] \n"
// xmm5 = 0xff in every byte; xmm4 = 0.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001770 "pcmpeqb %%xmm5,%%xmm5 \n"
1771 "pxor %%xmm4,%%xmm4 \n"
1772 ".p2align 4 \n"
1773 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001774 READYUV411
1775 YUVTORGB
// Weave bytes so each output pixel is: xmm0 byte, xmm1 byte, xmm2 byte, 0xff.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001776 "punpcklbw %%xmm1,%%xmm0 \n"
1777 "punpcklbw %%xmm5,%%xmm2 \n"
1778 "movdqa %%xmm0,%%xmm1 \n"
1779 "punpcklwd %%xmm2,%%xmm0 \n"
1780 "punpckhwd %%xmm2,%%xmm1 \n"
// Unaligned stores of pixels 0-3 and 4-7.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001781 "movdqu %%xmm0,(%[argb_buf]) \n"
1782 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1783 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1784 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001785 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001786 : [y_buf]"+r"(y_buf), // %[y_buf]
1787 [u_buf]"+r"(u_buf), // %[u_buf]
1788 [v_buf]"+r"(v_buf), // %[v_buf]
1789 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1790 [width]"+rm"(width) // %[width]
1791 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1792 : "memory", "cc"
1793#if defined(__SSE2__)
1794 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1795#endif
1796 );
1797}
1798
// Per its name, converts one row of NV12 (Y plane plus interleaved UV plane)
// to 4-byte ARGB pixels without requiring an aligned destination: the two
// 16-byte stores per iteration use movdqu. Each iteration runs
// READNV12 + YUVTORGB and writes 8 pixels (32 bytes); width is consumed 8 at
// a time while the remainder is > 0.
// NOTE(review): READNV12 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
1799void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1800 const uint8* uv_buf,
1801 uint8* argb_buf,
1802 int width) {
1803 asm volatile (
// xmm5 = 0xff in every byte; xmm4 = 0.
1804 "pcmpeqb %%xmm5,%%xmm5 \n"
1805 "pxor %%xmm4,%%xmm4 \n"
1806 ".p2align 4 \n"
1807 "1: \n"
1808 READNV12
1809 YUVTORGB
// Weave bytes so each output pixel is: xmm0 byte, xmm1 byte, xmm2 byte, 0xff.
1810 "punpcklbw %%xmm1,%%xmm0 \n"
1811 "punpcklbw %%xmm5,%%xmm2 \n"
1812 "movdqa %%xmm0,%%xmm1 \n"
1813 "punpcklwd %%xmm2,%%xmm0 \n"
1814 "punpckhwd %%xmm2,%%xmm1 \n"
// Unaligned stores of pixels 0-3 and 4-7.
1815 "movdqu %%xmm0,(%[argb_buf]) \n"
1816 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1817 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1818 "sub $0x8,%[width] \n"
1819 "jg 1b \n"
1820 : [y_buf]"+r"(y_buf), // %[y_buf]
1821 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1822 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1823 [width]"+rm"(width) // %[width]
1824 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1825 : "memory", "cc"
1826#if defined(__SSE2__)
1827 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1828#endif
1829 );
1830}
1831
// Per its name, converts one row of NV21 (Y plane plus interleaved VU plane)
// to 4-byte ARGB pixels without requiring an aligned destination (movdqu
// stores). Uses the READNV12 loader with the YVUTORGB conversion macro;
// vu_buf is bound to the asm operand named uv_buf so the shared loader macro
// can reference it. Writes 8 pixels (32 bytes) per iteration; width is
// consumed 8 at a time while the remainder is > 0.
// NOTE(review): READNV12 / YVUTORGB are defined outside this section;
// presumably YVUTORGB swaps the U/V roles - confirm against its definition.
1832void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1833 const uint8* vu_buf,
1834 uint8* argb_buf,
1835 int width) {
1836 asm volatile (
// xmm5 = 0xff in every byte; xmm4 = 0.
1837 "pcmpeqb %%xmm5,%%xmm5 \n"
1838 "pxor %%xmm4,%%xmm4 \n"
1839 ".p2align 4 \n"
1840 "1: \n"
1841 READNV12
1842 YVUTORGB
// Weave bytes so each output pixel is: xmm0 byte, xmm1 byte, xmm2 byte, 0xff.
1843 "punpcklbw %%xmm1,%%xmm0 \n"
1844 "punpcklbw %%xmm5,%%xmm2 \n"
1845 "movdqa %%xmm0,%%xmm1 \n"
1846 "punpcklwd %%xmm2,%%xmm0 \n"
1847 "punpckhwd %%xmm2,%%xmm1 \n"
// Unaligned stores of pixels 0-3 and 4-7.
1848 "movdqu %%xmm0,(%[argb_buf]) \n"
1849 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1850 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1851 "sub $0x8,%[width] \n"
1852 "jg 1b \n"
1853 : [y_buf]"+r"(y_buf), // %[y_buf]
1854 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1855 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1856 [width]"+rm"(width) // %[width]
1857 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001858 : "memory", "cc"
1859#if defined(__SSE2__)
1860 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1861#endif
1862 );
1863}
1864
// Per its name, converts one row of I422 planar YUV to 4-byte BGRA pixels.
// Each iteration runs READYUV422 + YUVTORGB, then weaves the result so every
// output pixel is: 0xff, xmm2 byte, xmm1 byte, xmm0 byte (in memory order).
// Stores 8 pixels (32 bytes) per iteration with movdqa, so bgra_buf must be
// 16-byte aligned. width is consumed 8 at a time while the remainder is > 0.
// bgra_buf is bound to the asm operand named argb_buf.
// NOTE(review): READYUV422 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
1865void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1866 const uint8* u_buf,
1867 const uint8* v_buf,
1868 uint8* bgra_buf,
1869 int width) {
1870 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001871 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001872 "pcmpeqb %%xmm5,%%xmm5 \n"
1873 "pxor %%xmm4,%%xmm4 \n"
1874 ".p2align 4 \n"
1875 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001876 READYUV422
1877 YUVTORGB
// xmm5 is overwritten by the weave below, so refill it with 0xff bytes on
// every iteration.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001878 "pcmpeqb %%xmm5,%%xmm5 \n"
1879 "punpcklbw %%xmm0,%%xmm1 \n"
1880 "punpcklbw %%xmm2,%%xmm5 \n"
1881 "movdqa %%xmm5,%%xmm0 \n"
1882 "punpcklwd %%xmm1,%%xmm5 \n"
1883 "punpckhwd %%xmm1,%%xmm0 \n"
// Aligned stores of pixels 0-3 and 4-7.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001884 "movdqa %%xmm5,(%[argb_buf]) \n"
1885 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1886 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1887 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001888 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001889 : [y_buf]"+r"(y_buf), // %[y_buf]
1890 [u_buf]"+r"(u_buf), // %[u_buf]
1891 [v_buf]"+r"(v_buf), // %[v_buf]
1892 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1893 [width]"+rm"(width) // %[width]
1894 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001895 : "memory", "cc"
1896#if defined(__SSE2__)
1897 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1898#endif
1899 );
1900}
1901
// Per its name, converts one row of I422 planar YUV to 4-byte ABGR pixels.
// Each iteration runs READYUV422 + YUVTORGB, then weaves the result so every
// output pixel is: xmm2 byte, xmm1 byte, xmm0 byte, 0xff (in memory order).
// Stores 8 pixels (32 bytes) per iteration with movdqa, so abgr_buf must be
// 16-byte aligned. width is consumed 8 at a time while the remainder is > 0.
// abgr_buf is bound to the asm operand named argb_buf.
// NOTE(review): READYUV422 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001902void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001903 const uint8* u_buf,
1904 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001905 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001906 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001907 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001908 "sub %[u_buf],%[v_buf] \n"
// xmm5 = 0xff in every byte; xmm4 = 0.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001909 "pcmpeqb %%xmm5,%%xmm5 \n"
1910 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001911 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001912 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001913 READYUV422
1914 YUVTORGB
// Weave bytes in reverse channel order relative to the ARGB variants.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001915 "punpcklbw %%xmm1,%%xmm2 \n"
1916 "punpcklbw %%xmm5,%%xmm0 \n"
1917 "movdqa %%xmm2,%%xmm1 \n"
1918 "punpcklwd %%xmm0,%%xmm2 \n"
1919 "punpckhwd %%xmm0,%%xmm1 \n"
// Aligned stores of pixels 0-3 and 4-7.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001920 "movdqa %%xmm2,(%[argb_buf]) \n"
1921 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1922 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1923 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001924 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001925 : [y_buf]"+r"(y_buf), // %[y_buf]
1926 [u_buf]"+r"(u_buf), // %[u_buf]
1927 [v_buf]"+r"(v_buf), // %[v_buf]
1928 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1929 [width]"+rm"(width) // %[width]
1930 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001931 : "memory", "cc"
1932#if defined(__SSE2__)
1933 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1934#endif
1935 );
1936}
1937
// Per its name, converts one row of I422 planar YUV to 4-byte RGBA pixels.
// Each iteration runs READYUV422 + YUVTORGB, then weaves the result so every
// output pixel is: 0xff, xmm0 byte, xmm1 byte, xmm2 byte (in memory order).
// Stores 8 pixels (32 bytes) per iteration with movdqa, so rgba_buf must be
// 16-byte aligned. width is consumed 8 at a time while the remainder is > 0.
// rgba_buf is bound to the asm operand named argb_buf.
// NOTE(review): READYUV422 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
fbarchard@google.come91bdac2012-10-09 21:09:33 +00001938void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1939 const uint8* u_buf,
1940 const uint8* v_buf,
1941 uint8* rgba_buf,
1942 int width) {
1943 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
1944 "sub %[u_buf],%[v_buf] \n"
1945 "pcmpeqb %%xmm5,%%xmm5 \n"
1946 "pxor %%xmm4,%%xmm4 \n"
1947 ".p2align 4 \n"
1948 "1: \n"
1949 READYUV422
1950 YUVTORGB
// xmm5 is overwritten by the weave below, so refill it with 0xff bytes on
// every iteration.
1951 "pcmpeqb %%xmm5,%%xmm5 \n"
1952 "punpcklbw %%xmm2,%%xmm1 \n"
1953 "punpcklbw %%xmm0,%%xmm5 \n"
1954 "movdqa %%xmm5,%%xmm0 \n"
1955 "punpcklwd %%xmm1,%%xmm5 \n"
1956 "punpckhwd %%xmm1,%%xmm0 \n"
// Aligned stores of pixels 0-3 and 4-7.
1957 "movdqa %%xmm5,(%[argb_buf]) \n"
1958 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1959 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1960 "sub $0x8,%[width] \n"
1961 "jg 1b \n"
1962 : [y_buf]"+r"(y_buf), // %[y_buf]
1963 [u_buf]"+r"(u_buf), // %[u_buf]
1964 [v_buf]"+r"(v_buf), // %[v_buf]
1965 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
1966 [width]"+rm"(width) // %[width]
1967 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1968 : "memory", "cc"
1969#if defined(__SSE2__)
1970 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1971#endif
1972 );
1973}
1974
// Per its name, converts one row of I422 planar YUV to 4-byte BGRA pixels
// without requiring an aligned destination (movdqu stores).
// Each iteration runs READYUV422 + YUVTORGB, then weaves the result so every
// output pixel is: 0xff, xmm2 byte, xmm1 byte, xmm0 byte (in memory order).
// Writes 8 pixels (32 bytes) per iteration; width is consumed 8 at a time
// while the remainder is > 0. bgra_buf is bound to the operand named argb_buf.
// NOTE(review): READYUV422 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001975void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001976 const uint8* u_buf,
1977 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001978 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00001979 int width) {
1980 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001981 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001982 "pcmpeqb %%xmm5,%%xmm5 \n"
1983 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001984 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001985 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001986 READYUV422
1987 YUVTORGB
// xmm5 is overwritten by the weave below, so refill it with 0xff bytes on
// every iteration.
fbarchard@google.com952a5072012-03-30 18:10:50 +00001988 "pcmpeqb %%xmm5,%%xmm5 \n"
1989 "punpcklbw %%xmm0,%%xmm1 \n"
1990 "punpcklbw %%xmm2,%%xmm5 \n"
1991 "movdqa %%xmm5,%%xmm0 \n"
1992 "punpcklwd %%xmm1,%%xmm5 \n"
1993 "punpckhwd %%xmm1,%%xmm0 \n"
// Unaligned stores of pixels 0-3 and 4-7.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001994 "movdqu %%xmm5,(%[argb_buf]) \n"
1995 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
1996 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1997 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00001998 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001999 : [y_buf]"+r"(y_buf), // %[y_buf]
2000 [u_buf]"+r"(u_buf), // %[u_buf]
2001 [v_buf]"+r"(v_buf), // %[v_buf]
2002 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2003 [width]"+rm"(width) // %[width]
2004 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002005 : "memory", "cc"
2006#if defined(__SSE2__)
2007 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2008#endif
2009 );
2010}
2011
// Per its name, converts one row of I422 planar YUV to 4-byte ABGR pixels
// without requiring an aligned destination (movdqu stores).
// Each iteration runs READYUV422 + YUVTORGB, then weaves the result so every
// output pixel is: xmm2 byte, xmm1 byte, xmm0 byte, 0xff (in memory order).
// Writes 8 pixels (32 bytes) per iteration; width is consumed 8 at a time
// while the remainder is > 0. abgr_buf is bound to the operand named argb_buf.
// NOTE(review): READYUV422 / YUVTORGB are defined outside this section;
// presumably they leave B, G, R bytes in xmm0, xmm1, xmm2 - confirm.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002012void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002013 const uint8* u_buf,
2014 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002015 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002016 int width) {
2017 asm volatile (
// v_buf becomes the distance from u_buf to v_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002018 "sub %[u_buf],%[v_buf] \n"
// xmm5 = 0xff in every byte; xmm4 = 0.
fbarchard@google.com952a5072012-03-30 18:10:50 +00002019 "pcmpeqb %%xmm5,%%xmm5 \n"
2020 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002021 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002022 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002023 READYUV422
2024 YUVTORGB
// Weave bytes in reverse channel order relative to the ARGB variants.
fbarchard@google.com952a5072012-03-30 18:10:50 +00002025 "punpcklbw %%xmm1,%%xmm2 \n"
2026 "punpcklbw %%xmm5,%%xmm0 \n"
2027 "movdqa %%xmm2,%%xmm1 \n"
2028 "punpcklwd %%xmm0,%%xmm2 \n"
2029 "punpckhwd %%xmm0,%%xmm1 \n"
// Unaligned stores of pixels 0-3 and 4-7.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002030 "movdqu %%xmm2,(%[argb_buf]) \n"
2031 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2032 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2033 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002034 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002035 : [y_buf]"+r"(y_buf), // %[y_buf]
2036 [u_buf]"+r"(u_buf), // %[u_buf]
2037 [v_buf]"+r"(v_buf), // %[v_buf]
2038 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2039 [width]"+rm"(width) // %[width]
2040 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002041 : "memory", "cc"
2042#if defined(__SSE2__)
2043 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2044#endif
2045 );
2046}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002047
2048void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2049 const uint8* u_buf,
2050 const uint8* v_buf,
2051 uint8* rgba_buf,
2052 int width) {
2053 asm volatile (
2054 "sub %[u_buf],%[v_buf] \n"
2055 "pcmpeqb %%xmm5,%%xmm5 \n"
2056 "pxor %%xmm4,%%xmm4 \n"
2057 ".p2align 4 \n"
2058 "1: \n"
2059 READYUV422
2060 YUVTORGB
2061 "pcmpeqb %%xmm5,%%xmm5 \n"
2062 "punpcklbw %%xmm2,%%xmm1 \n"
2063 "punpcklbw %%xmm0,%%xmm5 \n"
2064 "movdqa %%xmm5,%%xmm0 \n"
2065 "punpcklwd %%xmm1,%%xmm5 \n"
2066 "punpckhwd %%xmm1,%%xmm0 \n"
2067 "movdqa %%xmm5,(%[argb_buf]) \n"
2068 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2069 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2070 "sub $0x8,%[width] \n"
2071 "jg 1b \n"
2072 : [y_buf]"+r"(y_buf), // %[y_buf]
2073 [u_buf]"+r"(u_buf), // %[u_buf]
2074 [v_buf]"+r"(v_buf), // %[v_buf]
2075 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2076 [width]"+rm"(width) // %[width]
2077 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2078 : "memory", "cc"
2079#if defined(__SSE2__)
2080 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2081#endif
2082 );
2083}
2084
fbarchard@google.come214fe32012-06-04 23:47:11 +00002085#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002086
2087#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of 8-bit Y samples into 4-byte pixels whose first three bytes
// all carry the same scaled grey value and whose fourth byte is 0xff.
// Processes 8 samples per iteration: reads 8 bytes from y_buf and writes
// 32 bytes to rgb_buf with movdqa, so rgb_buf must be 16-byte aligned.
// width is consumed 8 at a time while the remainder is > 0.
// eax is used as scratch for building constants and is listed as clobbered.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002088void YToARGBRow_SSE2(const uint8* y_buf,
2089 uint8* rgb_buf,
2090 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002091 asm volatile (
// xmm4 = 0xff000000 in every dword (mask for the fourth byte of each pixel).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002092 "pcmpeqb %%xmm4,%%xmm4 \n"
2093 "pslld $0x18,%%xmm4 \n"
// xmm3 = 0x1000 in every word: the bias 16 in 8.8 fixed point.
2094 "mov $0x10001000,%%eax \n"
2095 "movd %%eax,%%xmm3 \n"
2096 "pshufd $0x0,%%xmm3,%%xmm3 \n"
// xmm2 = 0x012a (298) in every word: the scale factor, ~1.164 * 256.
2097 "mov $0x012a012a,%%eax \n"
2098 "movd %%eax,%%xmm2 \n"
2099 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002100 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002101 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002102 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
// punpcklbw of a register with itself duplicates each byte into a word
// (y * 0x0101); psubusw saturates at 0 and pmulhuw keeps the high 16 bits.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002103 "movq (%0),%%xmm0 \n"
2104 "lea 0x8(%0),%0 \n"
2105 "punpcklbw %%xmm0,%%xmm0 \n"
2106 "psubusw %%xmm3,%%xmm0 \n"
2107 "pmulhuw %%xmm2,%%xmm0 \n"
2108 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002109
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002110 // Step 2: Weave into ARGB
// Replicate each grey byte across a dword, then OR in the 0xff top byte.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002111 "punpcklbw %%xmm0,%%xmm0 \n"
2112 "movdqa %%xmm0,%%xmm1 \n"
2113 "punpcklwd %%xmm0,%%xmm0 \n"
2114 "punpckhwd %%xmm1,%%xmm1 \n"
2115 "por %%xmm4,%%xmm0 \n"
2116 "por %%xmm4,%%xmm1 \n"
2117 "movdqa %%xmm0,(%1) \n"
2118 "movdqa %%xmm1,16(%1) \n"
2119 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002120
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002121 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002122 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00002123 : "+r"(y_buf), // %0
2124 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00002125 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002126 :
2127 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002128#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002129 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002130#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002131 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002132}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002133#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002134
fbarchard@google.com42831e02012-01-21 02:54:17 +00002135#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002136// Shuffle table for reversing the bytes.
// Used as a pshufb control mask: output byte i is taken from input byte
// 15 - i, so a 16-byte vector comes out in reverse byte order.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002137CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002138 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2139};
2140
// Writes the bytes of src to dst in reverse order, 16 bytes per iteration.
// Reading starts at src + width - 16 and moves backwards as the counter
// shrinks, while dst advances forwards; each 16-byte group is byte-reversed
// with pshufb using kShuffleMirror.
// Loads and stores use movdqa, so every accessed address (src + width - 16,
// stepping down by 16, and dst stepping up by 16) must be 16-byte aligned.
// The counter drops by 16 per iteration and the loop repeats while it is > 0.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002141void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002142 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002143 asm volatile (
// xmm5 = byte-reversal shuffle mask; bias src by -16 so (src, width) indexes
// the last 16-byte group.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002144 "movdqa %3,%%xmm5 \n"
2145 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002146 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002147 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002148 "movdqa (%0,%2),%%xmm0 \n"
2149 "pshufb %%xmm5,%%xmm0 \n"
// The flags set by this sub are the ones tested by jg; the movdqa and lea
// in between do not modify flags.
2150 "sub $0x10,%2 \n"
2151 "movdqa %%xmm0,(%1) \n"
2152 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002153 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002154 : "+r"(src), // %0
2155 "+r"(dst), // %1
2156 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002157 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002158 : "memory", "cc"
2159#if defined(__SSE2__)
2160 , "xmm0", "xmm5"
2161#endif
2162 );
2163}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002164#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002165
fbarchard@google.com42831e02012-01-21 02:54:17 +00002166#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of src to dst in reverse order, 16 bytes per iteration,
// using only SSE2 instructions (no pshufb). Reading starts at
// src + width - 16 and moves backwards while dst advances forwards.
// Loads and stores use movdqu, so no alignment is required.
// The counter drops by 16 per iteration and the loop repeats while it is > 0.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002167void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002168 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002169 asm volatile (
// Bias src by -16 so (src, width) indexes the last 16-byte group.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002170 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002171 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002172 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002173 "movdqu (%0,%2),%%xmm0 \n"
// Swap the two bytes inside every 16-bit word.
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002174 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002175 "psllw $0x8,%%xmm0 \n"
2176 "psrlw $0x8,%%xmm1 \n"
2177 "por %%xmm1,%%xmm0 \n"
// Reverse the four words in each 64-bit half, then swap the two halves.
2178 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2179 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2180 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2181 "sub $0x10,%2 \n"
2182 "movdqu %%xmm0,(%1) \n"
2183 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002184 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002185 : "+r"(src), // %0
2186 "+r"(dst), // %1
2187 "+r"(temp_width) // %2
2188 :
2189 : "memory", "cc"
2190#if defined(__SSE2__)
2191 , "xmm0", "xmm1"
2192#endif
2193 );
2194}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002195#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002196
fbarchard@google.com16a96642012-03-02 22:38:09 +00002197#ifdef HAS_MIRRORROW_UV_SSSE3
2198// Shuffle table for reversing the bytes of UV channels.
// Used as a pshufb control mask: the low 8 output bytes are the even input
// bytes in reverse order, the high 8 output bytes are the odd input bytes in
// reverse order (de-interleave and mirror in one step).
2199CONST uvec8 kShuffleMirrorUV = {
2200 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2201};
// Splits an interleaved two-channel row into two planes while mirroring it.
// Per iteration: loads 16 bytes (8 byte pairs) from the current end of src,
// shuffles them with kShuffleMirrorUV, and writes the 8 reversed even bytes
// to dst_u and the 8 reversed odd bytes to dst_v.
// width counts byte pairs: reading starts at src + 2 * width - 16 and steps
// backwards by 16; the counter drops by 8 per iteration while it is > 0.
// The load uses movdqa, so each source address must be 16-byte aligned.
2202void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2203 int width) {
2204 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002205 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002206 "movdqa %4,%%xmm1 \n"
// src = src + 2 * width - 16 (last 16-byte group of the row).
2207 "lea -16(%0,%3,2),%0 \n"
// dst_v becomes the distance from dst_u to dst_v, so (%1,%2) addresses dst_v.
2208 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002209 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002210 "1: \n"
2211 "movdqa (%0),%%xmm0 \n"
2212 "lea -16(%0),%0 \n"
2213 "pshufb %%xmm1,%%xmm0 \n"
// The flags set by this sub are the ones tested by jg below.
2214 "sub $8,%3 \n"
// Low 8 bytes go to the first plane, high 8 bytes to the second.
2215 "movlpd %%xmm0,(%1) \n"
2216 "movhpd %%xmm0,(%1,%2) \n"
2217 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002218 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002219 : "+r"(src), // %0
2220 "+r"(dst_u), // %1
2221 "+r"(dst_v), // %2
2222 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002223 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002224 : "memory", "cc"
2225#if defined(__SSE2__)
2226 , "xmm0", "xmm1"
2227#endif
2228 );
2229}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002230#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002231
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002232#ifdef HAS_ARGBMIRRORROW_SSSE3
2233// Shuffle table for reversing the order of the four 4-byte pixels in a
// 16-byte vector. Byte order inside each pixel is preserved (12,13,14,15 come
// first, then 8..11, 4..7, 0..3), so this is not a plain byte reversal.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002234CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002235 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2236};
2237
// Mirrors a row of 4-byte pixels: pixel order is reversed, the bytes inside
// each pixel keep their order. Processes 4 pixels (16 bytes) per iteration.
// width counts pixels: reading starts at src + 4 * width - 16 and moves
// backwards as the counter drops by 4; dst advances forwards by 16 bytes.
// Loads and stores use movdqa, so every accessed address must be 16-byte
// aligned. The loop repeats while the counter is > 0.
2238void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2239 intptr_t temp_width = static_cast<intptr_t>(width);
2240 asm volatile (
// xmm5 = pixel-reversal shuffle mask; bias src by -16 so (src, width * 4)
// indexes the last group of 4 pixels.
2241 "movdqa %3,%%xmm5 \n"
2242 "lea -0x10(%0),%0 \n"
2243 ".p2align 4 \n"
2244 "1: \n"
2245 "movdqa (%0,%2,4),%%xmm0 \n"
2246 "pshufb %%xmm5,%%xmm0 \n"
// The flags set by this sub are the ones tested by jg below.
2247 "sub $0x4,%2 \n"
2248 "movdqa %%xmm0,(%1) \n"
2249 "lea 0x10(%1),%1 \n"
2250 "jg 1b \n"
2251 : "+r"(src), // %0
2252 "+r"(dst), // %1
2253 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002254 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002255 : "memory", "cc"
2256#if defined(__SSE2__)
2257 , "xmm0", "xmm5"
2258#endif
2259 );
2260}
2261#endif // HAS_ARGBMIRRORROW_SSSE3
2262
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002263#ifdef HAS_SPLITUV_SSE2
// De-interleaves a two-channel byte stream: even-indexed bytes of src_uv go
// to dst_u and odd-indexed bytes go to dst_v.
// Each iteration reads 32 source bytes and writes 16 bytes to each output;
// pix drops by 16 per iteration and the loop repeats while it is > 0.
// All loads and stores use movdqa, so src_uv, dst_u and dst_v must be
// 16-byte aligned.
2264void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002265 asm volatile (
// xmm5 = 0x00ff in every word (mask that keeps the even-indexed bytes).
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002266 "pcmpeqb %%xmm5,%%xmm5 \n"
2267 "psrlw $0x8,%%xmm5 \n"
// dst_v becomes the distance from dst_u to dst_v, so (%1,%2) addresses dst_v.
2268 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002269 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002270 "1: \n"
2271 "movdqa (%0),%%xmm0 \n"
2272 "movdqa 0x10(%0),%%xmm1 \n"
2273 "lea 0x20(%0),%0 \n"
2274 "movdqa %%xmm0,%%xmm2 \n"
2275 "movdqa %%xmm1,%%xmm3 \n"
// Even-indexed bytes: mask each word to its low byte, then pack to bytes.
2276 "pand %%xmm5,%%xmm0 \n"
2277 "pand %%xmm5,%%xmm1 \n"
2278 "packuswb %%xmm1,%%xmm0 \n"
// Odd-indexed bytes: shift each word's high byte down, then pack to bytes.
2279 "psrlw $0x8,%%xmm2 \n"
2280 "psrlw $0x8,%%xmm3 \n"
2281 "packuswb %%xmm3,%%xmm2 \n"
2282 "movdqa %%xmm0,(%1) \n"
2283 "movdqa %%xmm2,(%1,%2) \n"
2284 "lea 0x10(%1),%1 \n"
2285 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002286 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002287 : "+r"(src_uv), // %0
2288 "+r"(dst_u), // %1
2289 "+r"(dst_v), // %2
2290 "+r"(pix) // %3
2291 :
2292 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002293#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002294 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002295#endif
2296 );
2297}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002298#endif // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002299
fbarchard@google.com19932f82012-02-16 22:19:14 +00002300#ifdef HAS_COPYROW_SSE2
// Copies count bytes from src to dst, 32 bytes per iteration.
// Loads and stores use movdqa, so src and dst must be 16-byte aligned.
// count drops by 32 per iteration and the loop repeats while it is > 0, so a
// count that is not a multiple of 32 still copies a full final 32-byte group.
2301void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002302 asm volatile (
// dst becomes the distance from src to dst, so (%0,%1) addresses dst and only
// one pointer needs advancing in the loop.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002303 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002304 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002305 "1: \n"
2306 "movdqa (%0),%%xmm0 \n"
2307 "movdqa 0x10(%0),%%xmm1 \n"
2308 "movdqa %%xmm0,(%0,%1) \n"
2309 "movdqa %%xmm1,0x10(%0,%1) \n"
2310 "lea 0x20(%0),%0 \n"
2311 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002312 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002313 : "+r"(src), // %0
2314 "+r"(dst), // %1
2315 "+r"(count) // %2
2316 :
2317 : "memory", "cc"
2318#if defined(__SSE2__)
2319 , "xmm0", "xmm1"
2320#endif
2321 );
2322}
2323#endif // HAS_COPYROW_SSE2
2324
2325#ifdef HAS_COPYROW_X86
2326void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2327 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002328 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002329 "shr $0x2,%2 \n"
2330 "rep movsl \n"
2331 : "+S"(src), // %0
2332 "+D"(dst), // %1
2333 "+c"(width_tmp) // %2
2334 :
2335 : "memory", "cc"
2336 );
2337}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002338#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002339
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002340#ifdef HAS_SETROW_X86
2341void SetRow8_X86(uint8* dst, uint32 v32, int width) {
2342 size_t width_tmp = static_cast<size_t>(width);
2343 asm volatile (
2344 "shr $0x2,%1 \n"
2345 "rep stosl \n"
2346 : "+D"(dst), // %0
2347 "+c"(width_tmp) // %1
2348 : "a"(v32) // %2
2349 : "memory", "cc");
2350}
2351
2352void SetRows32_X86(uint8* dst, uint32 v32, int width,
2353 int dst_stride, int height) {
2354 for (int y = 0; y < height; ++y) {
2355 size_t width_tmp = static_cast<size_t>(width);
2356 uint32* d = reinterpret_cast<uint32*>(dst);
2357 asm volatile (
2358 "rep stosl \n"
2359 : "+D"(d), // %0
2360 "+c"(width_tmp) // %1
2361 : "a"(v32) // %2
2362 : "memory", "cc");
2363 dst += dst_stride;
2364 }
2365}
2366#endif // HAS_SETROW_X86
2367
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002368#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the even-indexed bytes of src_yuy2 into dst_y (in YUY2 byte order
// Y0 U0 Y1 V0 these are the luma samples).
// Each iteration reads 32 source bytes and writes 16 bytes; pix drops by 16
// per iteration and the loop repeats while it is > 0.
// Loads and stores use movdqa, so src_yuy2 and dst_y must be 16-byte aligned.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002369void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002370 asm volatile (
// xmm5 = 0x00ff in every word (mask that keeps the even-indexed bytes).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002371 "pcmpeqb %%xmm5,%%xmm5 \n"
2372 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002373 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002374 "1: \n"
2375 "movdqa (%0),%%xmm0 \n"
2376 "movdqa 0x10(%0),%%xmm1 \n"
2377 "lea 0x20(%0),%0 \n"
// Mask each word to its low byte and pack the 16 results into one vector.
2378 "pand %%xmm5,%%xmm0 \n"
2379 "pand %%xmm5,%%xmm1 \n"
2380 "packuswb %%xmm1,%%xmm0 \n"
2381 "movdqa %%xmm0,(%1) \n"
2382 "lea 0x10(%1),%1 \n"
2383 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002384 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002385 : "+r"(src_yuy2), // %0
2386 "+r"(dst_y), // %1
2387 "+r"(pix) // %2
2388 :
2389 : "memory", "cc"
2390#if defined(__SSE2__)
2391 , "xmm0", "xmm1", "xmm5"
2392#endif
2393 );
2394}
2395
2396void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002397 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002398 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002399 "pcmpeqb %%xmm5,%%xmm5 \n"
2400 "psrlw $0x8,%%xmm5 \n"
2401 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002402 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002403 "1: \n"
2404 "movdqa (%0),%%xmm0 \n"
2405 "movdqa 0x10(%0),%%xmm1 \n"
2406 "movdqa (%0,%4,1),%%xmm2 \n"
2407 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2408 "lea 0x20(%0),%0 \n"
2409 "pavgb %%xmm2,%%xmm0 \n"
2410 "pavgb %%xmm3,%%xmm1 \n"
2411 "psrlw $0x8,%%xmm0 \n"
2412 "psrlw $0x8,%%xmm1 \n"
2413 "packuswb %%xmm1,%%xmm0 \n"
2414 "movdqa %%xmm0,%%xmm1 \n"
2415 "pand %%xmm5,%%xmm0 \n"
2416 "packuswb %%xmm0,%%xmm0 \n"
2417 "psrlw $0x8,%%xmm1 \n"
2418 "packuswb %%xmm1,%%xmm1 \n"
2419 "movq %%xmm0,(%1) \n"
2420 "movq %%xmm1,(%1,%2) \n"
2421 "lea 0x8(%1),%1 \n"
2422 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002423 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002424 : "+r"(src_yuy2), // %0
2425 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002426 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002427 "+r"(pix) // %3
2428 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2429 : "memory", "cc"
2430#if defined(__SSE2__)
2431 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2432#endif
2433 );
2434}
2435
fbarchard@google.comc704f782012-08-30 19:53:48 +00002436void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2437 uint8* dst_u, uint8* dst_v, int pix) {
2438 asm volatile (
2439 "pcmpeqb %%xmm5,%%xmm5 \n"
2440 "psrlw $0x8,%%xmm5 \n"
2441 "sub %1,%2 \n"
2442 ".p2align 4 \n"
2443 "1: \n"
2444 "movdqa (%0),%%xmm0 \n"
2445 "movdqa 0x10(%0),%%xmm1 \n"
2446 "lea 0x20(%0),%0 \n"
2447 "psrlw $0x8,%%xmm0 \n"
2448 "psrlw $0x8,%%xmm1 \n"
2449 "packuswb %%xmm1,%%xmm0 \n"
2450 "movdqa %%xmm0,%%xmm1 \n"
2451 "pand %%xmm5,%%xmm0 \n"
2452 "packuswb %%xmm0,%%xmm0 \n"
2453 "psrlw $0x8,%%xmm1 \n"
2454 "packuswb %%xmm1,%%xmm1 \n"
2455 "movq %%xmm0,(%1) \n"
2456 "movq %%xmm1,(%1,%2) \n"
2457 "lea 0x8(%1),%1 \n"
2458 "sub $0x10,%3 \n"
2459 "jg 1b \n"
2460 : "+r"(src_yuy2), // %0
2461 "+r"(dst_u), // %1
2462 "+r"(dst_v), // %2
2463 "+r"(pix) // %3
2464 :
2465 : "memory", "cc"
2466#if defined(__SSE2__)
2467 , "xmm0", "xmm1", "xmm5"
2468#endif
2469 );
2470}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002471
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002472void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2473 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002474 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002475 "pcmpeqb %%xmm5,%%xmm5 \n"
2476 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002477 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002478 "1: \n"
2479 "movdqu (%0),%%xmm0 \n"
2480 "movdqu 0x10(%0),%%xmm1 \n"
2481 "lea 0x20(%0),%0 \n"
2482 "pand %%xmm5,%%xmm0 \n"
2483 "pand %%xmm5,%%xmm1 \n"
2484 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002485 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002486 "movdqu %%xmm0,(%1) \n"
2487 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002488 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002489 : "+r"(src_yuy2), // %0
2490 "+r"(dst_y), // %1
2491 "+r"(pix) // %2
2492 :
2493 : "memory", "cc"
2494#if defined(__SSE2__)
2495 , "xmm0", "xmm1", "xmm5"
2496#endif
2497 );
2498}
2499
2500void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2501 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002502 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002503 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002504 "pcmpeqb %%xmm5,%%xmm5 \n"
2505 "psrlw $0x8,%%xmm5 \n"
2506 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002507 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002508 "1: \n"
2509 "movdqu (%0),%%xmm0 \n"
2510 "movdqu 0x10(%0),%%xmm1 \n"
2511 "movdqu (%0,%4,1),%%xmm2 \n"
2512 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2513 "lea 0x20(%0),%0 \n"
2514 "pavgb %%xmm2,%%xmm0 \n"
2515 "pavgb %%xmm3,%%xmm1 \n"
2516 "psrlw $0x8,%%xmm0 \n"
2517 "psrlw $0x8,%%xmm1 \n"
2518 "packuswb %%xmm1,%%xmm0 \n"
2519 "movdqa %%xmm0,%%xmm1 \n"
2520 "pand %%xmm5,%%xmm0 \n"
2521 "packuswb %%xmm0,%%xmm0 \n"
2522 "psrlw $0x8,%%xmm1 \n"
2523 "packuswb %%xmm1,%%xmm1 \n"
2524 "movq %%xmm0,(%1) \n"
2525 "movq %%xmm1,(%1,%2) \n"
2526 "lea 0x8(%1),%1 \n"
2527 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002528 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002529 : "+r"(src_yuy2), // %0
2530 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002531 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002532 "+r"(pix) // %3
2533 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2534 : "memory", "cc"
2535#if defined(__SSE2__)
2536 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2537#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002538 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002539}
2540
fbarchard@google.comc704f782012-08-30 19:53:48 +00002541void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2542 uint8* dst_u, uint8* dst_v, int pix) {
2543 asm volatile (
2544 "pcmpeqb %%xmm5,%%xmm5 \n"
2545 "psrlw $0x8,%%xmm5 \n"
2546 "sub %1,%2 \n"
2547 ".p2align 4 \n"
2548 "1: \n"
2549 "movdqu (%0),%%xmm0 \n"
2550 "movdqu 0x10(%0),%%xmm1 \n"
2551 "lea 0x20(%0),%0 \n"
2552 "psrlw $0x8,%%xmm0 \n"
2553 "psrlw $0x8,%%xmm1 \n"
2554 "packuswb %%xmm1,%%xmm0 \n"
2555 "movdqa %%xmm0,%%xmm1 \n"
2556 "pand %%xmm5,%%xmm0 \n"
2557 "packuswb %%xmm0,%%xmm0 \n"
2558 "psrlw $0x8,%%xmm1 \n"
2559 "packuswb %%xmm1,%%xmm1 \n"
2560 "movq %%xmm0,(%1) \n"
2561 "movq %%xmm1,(%1,%2) \n"
2562 "lea 0x8(%1),%1 \n"
2563 "sub $0x10,%3 \n"
2564 "jg 1b \n"
2565 : "+r"(src_yuy2), // %0
2566 "+r"(dst_u), // %1
2567 "+r"(dst_v), // %2
2568 "+r"(pix) // %3
2569 :
2570 : "memory", "cc"
2571#if defined(__SSE2__)
2572 , "xmm0", "xmm1", "xmm5"
2573#endif
2574 );
2575}
2576
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002577void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002578 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002579 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002580 "1: \n"
2581 "movdqa (%0),%%xmm0 \n"
2582 "movdqa 0x10(%0),%%xmm1 \n"
2583 "lea 0x20(%0),%0 \n"
2584 "psrlw $0x8,%%xmm0 \n"
2585 "psrlw $0x8,%%xmm1 \n"
2586 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002587 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002588 "movdqa %%xmm0,(%1) \n"
2589 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002590 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002591 : "+r"(src_uyvy), // %0
2592 "+r"(dst_y), // %1
2593 "+r"(pix) // %2
2594 :
2595 : "memory", "cc"
2596#if defined(__SSE2__)
2597 , "xmm0", "xmm1"
2598#endif
2599 );
2600}
2601
2602void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002603 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002604 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002605 "pcmpeqb %%xmm5,%%xmm5 \n"
2606 "psrlw $0x8,%%xmm5 \n"
2607 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002608 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002609 "1: \n"
2610 "movdqa (%0),%%xmm0 \n"
2611 "movdqa 0x10(%0),%%xmm1 \n"
2612 "movdqa (%0,%4,1),%%xmm2 \n"
2613 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2614 "lea 0x20(%0),%0 \n"
2615 "pavgb %%xmm2,%%xmm0 \n"
2616 "pavgb %%xmm3,%%xmm1 \n"
2617 "pand %%xmm5,%%xmm0 \n"
2618 "pand %%xmm5,%%xmm1 \n"
2619 "packuswb %%xmm1,%%xmm0 \n"
2620 "movdqa %%xmm0,%%xmm1 \n"
2621 "pand %%xmm5,%%xmm0 \n"
2622 "packuswb %%xmm0,%%xmm0 \n"
2623 "psrlw $0x8,%%xmm1 \n"
2624 "packuswb %%xmm1,%%xmm1 \n"
2625 "movq %%xmm0,(%1) \n"
2626 "movq %%xmm1,(%1,%2) \n"
2627 "lea 0x8(%1),%1 \n"
2628 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002629 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002630 : "+r"(src_uyvy), // %0
2631 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002632 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002633 "+r"(pix) // %3
2634 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2635 : "memory", "cc"
2636#if defined(__SSE2__)
2637 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2638#endif
2639 );
2640}
2641
fbarchard@google.comc704f782012-08-30 19:53:48 +00002642void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2643 uint8* dst_u, uint8* dst_v, int pix) {
2644 asm volatile (
2645 "pcmpeqb %%xmm5,%%xmm5 \n"
2646 "psrlw $0x8,%%xmm5 \n"
2647 "sub %1,%2 \n"
2648 ".p2align 4 \n"
2649 "1: \n"
2650 "movdqa (%0),%%xmm0 \n"
2651 "movdqa 0x10(%0),%%xmm1 \n"
2652 "lea 0x20(%0),%0 \n"
2653 "pand %%xmm5,%%xmm0 \n"
2654 "pand %%xmm5,%%xmm1 \n"
2655 "packuswb %%xmm1,%%xmm0 \n"
2656 "movdqa %%xmm0,%%xmm1 \n"
2657 "pand %%xmm5,%%xmm0 \n"
2658 "packuswb %%xmm0,%%xmm0 \n"
2659 "psrlw $0x8,%%xmm1 \n"
2660 "packuswb %%xmm1,%%xmm1 \n"
2661 "movq %%xmm0,(%1) \n"
2662 "movq %%xmm1,(%1,%2) \n"
2663 "lea 0x8(%1),%1 \n"
2664 "sub $0x10,%3 \n"
2665 "jg 1b \n"
2666 : "+r"(src_uyvy), // %0
2667 "+r"(dst_u), // %1
2668 "+r"(dst_v), // %2
2669 "+r"(pix) // %3
2670 :
2671 : "memory", "cc"
2672#if defined(__SSE2__)
2673 , "xmm0", "xmm1", "xmm5"
2674#endif
2675 );
2676}
2677
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002678void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2679 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002680 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002681 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002682 "1: \n"
2683 "movdqu (%0),%%xmm0 \n"
2684 "movdqu 0x10(%0),%%xmm1 \n"
2685 "lea 0x20(%0),%0 \n"
2686 "psrlw $0x8,%%xmm0 \n"
2687 "psrlw $0x8,%%xmm1 \n"
2688 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002689 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002690 "movdqu %%xmm0,(%1) \n"
2691 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002692 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002693 : "+r"(src_uyvy), // %0
2694 "+r"(dst_y), // %1
2695 "+r"(pix) // %2
2696 :
2697 : "memory", "cc"
2698#if defined(__SSE2__)
2699 , "xmm0", "xmm1"
2700#endif
2701 );
2702}
2703
2704void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002705 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002706 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002707 "pcmpeqb %%xmm5,%%xmm5 \n"
2708 "psrlw $0x8,%%xmm5 \n"
2709 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002710 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002711 "1: \n"
2712 "movdqu (%0),%%xmm0 \n"
2713 "movdqu 0x10(%0),%%xmm1 \n"
2714 "movdqu (%0,%4,1),%%xmm2 \n"
2715 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2716 "lea 0x20(%0),%0 \n"
2717 "pavgb %%xmm2,%%xmm0 \n"
2718 "pavgb %%xmm3,%%xmm1 \n"
2719 "pand %%xmm5,%%xmm0 \n"
2720 "pand %%xmm5,%%xmm1 \n"
2721 "packuswb %%xmm1,%%xmm0 \n"
2722 "movdqa %%xmm0,%%xmm1 \n"
2723 "pand %%xmm5,%%xmm0 \n"
2724 "packuswb %%xmm0,%%xmm0 \n"
2725 "psrlw $0x8,%%xmm1 \n"
2726 "packuswb %%xmm1,%%xmm1 \n"
2727 "movq %%xmm0,(%1) \n"
2728 "movq %%xmm1,(%1,%2) \n"
2729 "lea 0x8(%1),%1 \n"
2730 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002731 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002732 : "+r"(src_uyvy), // %0
2733 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002734 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002735 "+r"(pix) // %3
2736 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2737 : "memory", "cc"
2738#if defined(__SSE2__)
2739 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2740#endif
2741 );
2742}
fbarchard@google.comc704f782012-08-30 19:53:48 +00002743
2744void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
2745 uint8* dst_u, uint8* dst_v, int pix) {
2746 asm volatile (
2747 "pcmpeqb %%xmm5,%%xmm5 \n"
2748 "psrlw $0x8,%%xmm5 \n"
2749 "sub %1,%2 \n"
2750 ".p2align 4 \n"
2751 "1: \n"
2752 "movdqu (%0),%%xmm0 \n"
2753 "movdqu 0x10(%0),%%xmm1 \n"
2754 "lea 0x20(%0),%0 \n"
2755 "pand %%xmm5,%%xmm0 \n"
2756 "pand %%xmm5,%%xmm1 \n"
2757 "packuswb %%xmm1,%%xmm0 \n"
2758 "movdqa %%xmm0,%%xmm1 \n"
2759 "pand %%xmm5,%%xmm0 \n"
2760 "packuswb %%xmm0,%%xmm0 \n"
2761 "psrlw $0x8,%%xmm1 \n"
2762 "packuswb %%xmm1,%%xmm1 \n"
2763 "movq %%xmm0,(%1) \n"
2764 "movq %%xmm1,(%1,%2) \n"
2765 "lea 0x8(%1),%1 \n"
2766 "sub $0x10,%3 \n"
2767 "jg 1b \n"
2768 : "+r"(src_uyvy), // %0
2769 "+r"(dst_u), // %1
2770 "+r"(dst_v), // %2
2771 "+r"(pix) // %3
2772 :
2773 : "memory", "cc"
2774#if defined(__SSE2__)
2775 , "xmm0", "xmm1", "xmm5"
2776#endif
2777 );
2778}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002779#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002780
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00002781#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00002782// Blend 8 pixels at a time.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002783void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
2784 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00002785 asm volatile (
2786 "pcmpeqb %%xmm7,%%xmm7 \n"
2787 "psrlw $0xf,%%xmm7 \n"
2788 "pcmpeqb %%xmm6,%%xmm6 \n"
2789 "psrlw $0x8,%%xmm6 \n"
2790 "pcmpeqb %%xmm5,%%xmm5 \n"
2791 "psllw $0x8,%%xmm5 \n"
2792 "pcmpeqb %%xmm4,%%xmm4 \n"
2793 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002794 "sub $0x1,%3 \n"
2795 "je 91f \n"
2796 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002797
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002798 // 1 pixel loop until destination pointer is aligned.
2799 "10: \n"
2800 "test $0xf,%2 \n"
2801 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002802 "movd (%0),%%xmm3 \n"
2803 "lea 0x4(%0),%0 \n"
2804 "movdqa %%xmm3,%%xmm0 \n"
2805 "pxor %%xmm4,%%xmm3 \n"
2806 "movd (%1),%%xmm2 \n"
2807 "psrlw $0x8,%%xmm3 \n"
2808 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2809 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2810 "pand %%xmm6,%%xmm2 \n"
2811 "paddw %%xmm7,%%xmm3 \n"
2812 "pmullw %%xmm3,%%xmm2 \n"
2813 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002814 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00002815 "psrlw $0x8,%%xmm1 \n"
2816 "por %%xmm4,%%xmm0 \n"
2817 "pmullw %%xmm3,%%xmm1 \n"
2818 "psrlw $0x8,%%xmm2 \n"
2819 "paddusb %%xmm2,%%xmm0 \n"
2820 "pand %%xmm5,%%xmm1 \n"
2821 "paddusb %%xmm1,%%xmm0 \n"
2822 "sub $0x1,%3 \n"
2823 "movd %%xmm0,(%2) \n"
2824 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002825 "jge 10b \n"
2826
2827 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00002828 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002829 "jl 49f \n"
2830
fbarchard@google.com794fe122012-06-15 01:05:01 +00002831 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002832 ".p2align 2 \n"
2833 "41: \n"
2834 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002835 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002836 "movdqa %%xmm3,%%xmm0 \n"
2837 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002838 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002839 "psrlw $0x8,%%xmm3 \n"
2840 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2841 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002842 "pand %%xmm6,%%xmm2 \n"
2843 "paddw %%xmm7,%%xmm3 \n"
2844 "pmullw %%xmm3,%%xmm2 \n"
2845 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002846 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002847 "psrlw $0x8,%%xmm1 \n"
2848 "por %%xmm4,%%xmm0 \n"
2849 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002850 "psrlw $0x8,%%xmm2 \n"
2851 "paddusb %%xmm2,%%xmm0 \n"
2852 "pand %%xmm5,%%xmm1 \n"
2853 "paddusb %%xmm1,%%xmm0 \n"
2854 "sub $0x4,%3 \n"
2855 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00002856 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002857 "jge 41b \n"
2858
2859 "49: \n"
2860 "add $0x3,%3 \n"
2861 "jl 99f \n"
2862
fbarchard@google.com794fe122012-06-15 01:05:01 +00002863 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002864 "91: \n"
2865 "movd (%0),%%xmm3 \n"
2866 "lea 0x4(%0),%0 \n"
2867 "movdqa %%xmm3,%%xmm0 \n"
2868 "pxor %%xmm4,%%xmm3 \n"
2869 "movd (%1),%%xmm2 \n"
2870 "psrlw $0x8,%%xmm3 \n"
2871 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2872 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2873 "pand %%xmm6,%%xmm2 \n"
2874 "paddw %%xmm7,%%xmm3 \n"
2875 "pmullw %%xmm3,%%xmm2 \n"
2876 "movd (%1),%%xmm1 \n"
2877 "lea 0x4(%1),%1 \n"
2878 "psrlw $0x8,%%xmm1 \n"
2879 "por %%xmm4,%%xmm0 \n"
2880 "pmullw %%xmm3,%%xmm1 \n"
2881 "psrlw $0x8,%%xmm2 \n"
2882 "paddusb %%xmm2,%%xmm0 \n"
2883 "pand %%xmm5,%%xmm1 \n"
2884 "paddusb %%xmm1,%%xmm0 \n"
2885 "sub $0x1,%3 \n"
2886 "movd %%xmm0,(%2) \n"
2887 "lea 0x4(%2),%2 \n"
2888 "jge 91b \n"
2889 "99: \n"
2890 : "+r"(src_argb0), // %0
2891 "+r"(src_argb1), // %1
2892 "+r"(dst_argb), // %2
2893 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00002894 :
2895 : "memory", "cc"
2896#if defined(__SSE2__)
2897 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2898#endif
2899 );
2900}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002901#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00002902
fbarchard@google.com96af8702012-04-06 18:22:27 +00002903#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002904// Shuffle table for isolating alpha.
fbarchard@google.com96af8702012-04-06 18:22:27 +00002905CONST uvec8 kShuffleAlpha = {
2906 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2907 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2908};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002909
2910// Blend 8 pixels at a time
2911// Shuffle table for reversing the bytes.
2912
2913// Same as SSE2, but replaces
2914// psrlw xmm3, 8 // alpha
2915// pshufhw xmm3, xmm3,0F5h // 8 alpha words
2916// pshuflw xmm3, xmm3,0F5h
2917// with..
2918// pshufb xmm3, kShuffleAlpha // alpha
2919
2920void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2921 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00002922 asm volatile (
2923 "pcmpeqb %%xmm7,%%xmm7 \n"
2924 "psrlw $0xf,%%xmm7 \n"
2925 "pcmpeqb %%xmm6,%%xmm6 \n"
2926 "psrlw $0x8,%%xmm6 \n"
2927 "pcmpeqb %%xmm5,%%xmm5 \n"
2928 "psllw $0x8,%%xmm5 \n"
2929 "pcmpeqb %%xmm4,%%xmm4 \n"
2930 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002931 "sub $0x1,%3 \n"
2932 "je 91f \n"
2933 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00002934
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002935 // 1 pixel loop until destination pointer is aligned.
2936 "10: \n"
2937 "test $0xf,%2 \n"
2938 "je 19f \n"
2939 "movd (%0),%%xmm3 \n"
2940 "lea 0x4(%0),%0 \n"
2941 "movdqa %%xmm3,%%xmm0 \n"
2942 "pxor %%xmm4,%%xmm3 \n"
2943 "movd (%1),%%xmm2 \n"
2944 "pshufb %4,%%xmm3 \n"
2945 "pand %%xmm6,%%xmm2 \n"
2946 "paddw %%xmm7,%%xmm3 \n"
2947 "pmullw %%xmm3,%%xmm2 \n"
2948 "movd (%1),%%xmm1 \n"
2949 "lea 0x4(%1),%1 \n"
2950 "psrlw $0x8,%%xmm1 \n"
2951 "por %%xmm4,%%xmm0 \n"
2952 "pmullw %%xmm3,%%xmm1 \n"
2953 "psrlw $0x8,%%xmm2 \n"
2954 "paddusb %%xmm2,%%xmm0 \n"
2955 "pand %%xmm5,%%xmm1 \n"
2956 "paddusb %%xmm1,%%xmm0 \n"
2957 "sub $0x1,%3 \n"
2958 "movd %%xmm0,(%2) \n"
2959 "lea 0x4(%2),%2 \n"
2960 "jge 10b \n"
2961
2962 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00002963 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002964 "jl 49f \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002965 "test $0xf,%0 \n"
2966 "jne 41f \n"
2967 "test $0xf,%1 \n"
2968 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002969
fbarchard@google.com794fe122012-06-15 01:05:01 +00002970 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002971 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002972 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00002973 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002974 "lea 0x10(%0),%0 \n"
2975 "movdqa %%xmm3,%%xmm0 \n"
2976 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00002977 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002978 "pshufb %4,%%xmm3 \n"
2979 "pand %%xmm6,%%xmm2 \n"
2980 "paddw %%xmm7,%%xmm3 \n"
2981 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00002982 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00002983 "lea 0x10(%1),%1 \n"
2984 "psrlw $0x8,%%xmm1 \n"
2985 "por %%xmm4,%%xmm0 \n"
2986 "pmullw %%xmm3,%%xmm1 \n"
2987 "psrlw $0x8,%%xmm2 \n"
2988 "paddusb %%xmm2,%%xmm0 \n"
2989 "pand %%xmm5,%%xmm1 \n"
2990 "paddusb %%xmm1,%%xmm0 \n"
2991 "sub $0x4,%3 \n"
2992 "movdqa %%xmm0,(%2) \n"
2993 "lea 0x10(%2),%2 \n"
2994 "jge 40b \n"
2995 "jmp 49f \n"
2996
2997 // 4 pixel unaligned loop.
2998 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00002999 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003000 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003001 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003002 "movdqa %%xmm3,%%xmm0 \n"
3003 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003004 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003005 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003006 "pand %%xmm6,%%xmm2 \n"
3007 "paddw %%xmm7,%%xmm3 \n"
3008 "pmullw %%xmm3,%%xmm2 \n"
3009 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003010 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003011 "psrlw $0x8,%%xmm1 \n"
3012 "por %%xmm4,%%xmm0 \n"
3013 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003014 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003015 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003016 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003017 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003018 "sub $0x4,%3 \n"
3019 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003020 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003021 "jge 41b \n"
3022
3023 "49: \n"
3024 "add $0x3,%3 \n"
3025 "jl 99f \n"
3026
fbarchard@google.com794fe122012-06-15 01:05:01 +00003027 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003028 "91: \n"
3029 "movd (%0),%%xmm3 \n"
3030 "lea 0x4(%0),%0 \n"
3031 "movdqa %%xmm3,%%xmm0 \n"
3032 "pxor %%xmm4,%%xmm3 \n"
3033 "movd (%1),%%xmm2 \n"
3034 "pshufb %4,%%xmm3 \n"
3035 "pand %%xmm6,%%xmm2 \n"
3036 "paddw %%xmm7,%%xmm3 \n"
3037 "pmullw %%xmm3,%%xmm2 \n"
3038 "movd (%1),%%xmm1 \n"
3039 "lea 0x4(%1),%1 \n"
3040 "psrlw $0x8,%%xmm1 \n"
3041 "por %%xmm4,%%xmm0 \n"
3042 "pmullw %%xmm3,%%xmm1 \n"
3043 "psrlw $0x8,%%xmm2 \n"
3044 "paddusb %%xmm2,%%xmm0 \n"
3045 "pand %%xmm5,%%xmm1 \n"
3046 "paddusb %%xmm1,%%xmm0 \n"
3047 "sub $0x1,%3 \n"
3048 "movd %%xmm0,(%2) \n"
3049 "lea 0x4(%2),%2 \n"
3050 "jge 91b \n"
3051 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003052 : "+r"(src_argb0), // %0
3053 "+r"(src_argb1), // %1
3054 "+r"(dst_argb), // %2
3055 "+r"(width) // %3
3056 : "m"(kShuffleAlpha) // %4
3057 : "memory", "cc"
3058#if defined(__SSE2__)
3059 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3060#endif
3061 );
3062}
3063#endif // HAS_ARGBBLENDROW_SSSE3
3064
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003065#ifdef HAS_ARGBATTENUATE_SSE2
3066// Attenuate 4 pixels at a time.
3067// aligned to 16 bytes
3068void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3069 asm volatile (
3070 "sub %0,%1 \n"
3071 "pcmpeqb %%xmm4,%%xmm4 \n"
3072 "pslld $0x18,%%xmm4 \n"
3073 "pcmpeqb %%xmm5,%%xmm5 \n"
3074 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003075
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003076 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003077 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003078 "1: \n"
3079 "movdqa (%0),%%xmm0 \n"
3080 "punpcklbw %%xmm0,%%xmm0 \n"
3081 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3082 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3083 "pmulhuw %%xmm2,%%xmm0 \n"
3084 "movdqa (%0),%%xmm1 \n"
3085 "punpckhbw %%xmm1,%%xmm1 \n"
3086 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3087 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3088 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003089 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003090 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003091 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003092 "psrlw $0x8,%%xmm1 \n"
3093 "packuswb %%xmm1,%%xmm0 \n"
3094 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003095 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003096 "sub $0x4,%2 \n"
3097 "movdqa %%xmm0,(%0,%1,1) \n"
3098 "lea 0x10(%0),%0 \n"
3099 "jg 1b \n"
3100 : "+r"(src_argb), // %0
3101 "+r"(dst_argb), // %1
3102 "+r"(width) // %2
3103 :
3104 : "memory", "cc"
3105#if defined(__SSE2__)
3106 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3107#endif
3108 );
3109}
3110#endif // HAS_ARGBATTENUATE_SSE2
3111
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003112#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003113// Shuffle table duplicating alpha
3114CONST uvec8 kShuffleAlpha0 = {
3115 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3116};
3117CONST uvec8 kShuffleAlpha1 = {
3118 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3119 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3120};
3121// Attenuate 4 pixels at a time.
3122// aligned to 16 bytes
3123void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3124 asm volatile (
3125 "sub %0,%1 \n"
3126 "pcmpeqb %%xmm3,%%xmm3 \n"
3127 "pslld $0x18,%%xmm3 \n"
3128 "movdqa %3,%%xmm4 \n"
3129 "movdqa %4,%%xmm5 \n"
3130
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003131 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003132 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003133 "1: \n"
3134 "movdqa (%0),%%xmm0 \n"
3135 "pshufb %%xmm4,%%xmm0 \n"
3136 "movdqa (%0),%%xmm1 \n"
3137 "punpcklbw %%xmm1,%%xmm1 \n"
3138 "pmulhuw %%xmm1,%%xmm0 \n"
3139 "movdqa (%0),%%xmm1 \n"
3140 "pshufb %%xmm5,%%xmm1 \n"
3141 "movdqa (%0),%%xmm2 \n"
3142 "punpckhbw %%xmm2,%%xmm2 \n"
3143 "pmulhuw %%xmm2,%%xmm1 \n"
3144 "movdqa (%0),%%xmm2 \n"
3145 "pand %%xmm3,%%xmm2 \n"
3146 "psrlw $0x8,%%xmm0 \n"
3147 "psrlw $0x8,%%xmm1 \n"
3148 "packuswb %%xmm1,%%xmm0 \n"
3149 "por %%xmm2,%%xmm0 \n"
3150 "sub $0x4,%2 \n"
3151 "movdqa %%xmm0,(%0,%1,1) \n"
3152 "lea 0x10(%0),%0 \n"
3153 "jg 1b \n"
3154 : "+r"(src_argb), // %0
3155 "+r"(dst_argb), // %1
3156 "+r"(width) // %2
3157 : "m"(kShuffleAlpha0), // %3
3158 "m"(kShuffleAlpha1) // %4
3159 : "memory", "cc"
3160#if defined(__SSE2__)
3161 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3162#endif
3163 );
3164}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003165#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003166
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003167#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003168// Unattenuate 4 pixels at a time.
3169// aligned to 16 bytes
3170void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3171 int width) {
3172 uintptr_t alpha = 0;
3173 asm volatile (
3174 "sub %0,%1 \n"
3175 "pcmpeqb %%xmm4,%%xmm4 \n"
3176 "pslld $0x18,%%xmm4 \n"
3177
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003178 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003179 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003180 "1: \n"
3181 "movdqa (%0),%%xmm0 \n"
3182 "movzb 0x3(%0),%3 \n"
3183 "punpcklbw %%xmm0,%%xmm0 \n"
3184 "movd 0x0(%4,%3,4),%%xmm2 \n"
3185 "movzb 0x7(%0),%3 \n"
3186 "movd 0x0(%4,%3,4),%%xmm3 \n"
3187 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3188 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3189 "movlhps %%xmm3,%%xmm2 \n"
3190 "pmulhuw %%xmm2,%%xmm0 \n"
3191 "movdqa (%0),%%xmm1 \n"
3192 "movzb 0xb(%0),%3 \n"
3193 "punpckhbw %%xmm1,%%xmm1 \n"
3194 "movd 0x0(%4,%3,4),%%xmm2 \n"
3195 "movzb 0xf(%0),%3 \n"
3196 "movd 0x0(%4,%3,4),%%xmm3 \n"
3197 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3198 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3199 "movlhps %%xmm3,%%xmm2 \n"
3200 "pmulhuw %%xmm2,%%xmm1 \n"
3201 "movdqa (%0),%%xmm2 \n"
3202 "pand %%xmm4,%%xmm2 \n"
3203 "packuswb %%xmm1,%%xmm0 \n"
3204 "por %%xmm2,%%xmm0 \n"
3205 "sub $0x4,%2 \n"
3206 "movdqa %%xmm0,(%0,%1,1) \n"
3207 "lea 0x10(%0),%0 \n"
3208 "jg 1b \n"
3209 : "+r"(src_argb), // %0
3210 "+r"(dst_argb), // %1
3211 "+r"(width), // %2
3212 "+r"(alpha) // %3
3213 : "r"(fixed_invtbl8) // %4
3214 : "memory", "cc"
3215#if defined(__SSE2__)
3216 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3217#endif
3218 );
3219}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003220#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003221
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale.  0.11 * B + 0.59 * G + 0.30 * R
// The weights are scaled by 128 (the row function shifts the sum right by 7).
// Every fourth byte is 0 so alpha does not contribute to the gray value.
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Each output pixel is (gray, gray, gray, alpha) where alpha is copied from
// the source pixel.
//   src_argb - source row; read with movdqa, so it must be 16 byte aligned.
//   dst_argb - destination row; written with movdqa, so it must be 16 byte
//              aligned.
//   width    - pixel count.  The loop body runs before the count is tested
//              and consumes 8 pixels per pass, so at least 8 pixels are
//              processed and the count is effectively rounded up to a
//              multiple of 8.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    // Turn dst into an offset from src so a single pointer increment
    // advances both rows.
    "sub       %0,%1                           \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // Weighted sum of B, G, R for 8 pixels, then >> 7 and pack to bytes.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Extract the 8 alpha bytes (top byte of each 32 bit pixel).
    "movdqa    (%0),%%xmm2                     \n"
    "movdqa    0x10(%0),%%xmm3                 \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Interleave into (gray, gray) and (gray, alpha) pairs, then into pixels.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    // The count is decremented here; the moves and lea below do not modify
    // flags, so jg still sees the result of this sub.
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "movdqa    %%xmm1,0x10(%0,%1,1)            \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToGray)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone.  One table per output channel,
// each holding the weights for the source bytes of a pixel with a 0 weight
// in the alpha position.
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
// Alpha is copied through unchanged; color results are clamped to 255 by the
// unsigned saturating pack.
//   dst_argb - row to read and overwrite; accessed with movdqa, so it must be
//              16 byte aligned.
//   width    - pixel count.  The loop body runs before the count is tested
//              and consumes 8 pixels per pass.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // New B for 8 pixels.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // New G for 8 pixels, interleaved with B.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // New R for 8 pixels.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha for 8 pixels, interleaved with R.
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    // Merge the BG and RA pairs back into 8 pixels and store over the input.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix, in place.
// Same as Sepia except matrix is provided.
//   dst_argb    - row to read and overwrite; accessed with movdqa, so it must
//                 be 16 byte aligned.
//   matrix_argb - 12 bytes are read: three groups of 4 signed weights, one
//                 group per output color channel (lowest output byte first).
//                 Each weighted sum is shifted right by 7, so a weight of 128
//                 would represent 1.0 (not representable in int8).
//   width       - pixel count.  The loop body runs before the count is tested
//                 and consumes 8 pixels per pass.
// Sums use signed saturating adds and an arithmetic shift, so negative
// weights are supported; results are clamped to 0..255 by packuswb.
// Alpha is copied through unchanged.
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  asm volatile (
    // Load each 4 byte weight group and replicate it across the register.
    "movd      (%2),%%xmm2                     \n"
    "movd      0x4(%2),%%xmm3                  \n"
    "movd      0x8(%2),%%xmm4                  \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    // First and second output channels.
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm6,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm0                     \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // Third output channel.
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha.
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    // Merge the channel pairs back into 8 pixels and store over the input.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+r"(width)       // %1
  : "r"(matrix_argb)  // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes), in place.
// Aligned to 16 bytes.
// For each color byte:
//   dst = ((dst * scale) >> 16) * interval_size + interval_offset
// computed in 16 bit lanes and packed back with unsigned saturation.
// The color lanes use the low 16 bits of each int parameter; the alpha lane
// uses the upper 16 bits (pshuflw $0x40), and the original alpha byte is
// OR-ed back into the result.
//   dst_argb        - row to read and overwrite; accessed with movdqa.
//   scale           - 16.16 style multiplier applied via pmulhuw.
//   interval_size   - multiplier applied after scaling.
//   interval_offset - value added after the multiply.
//   width           - pixel count.  The loop body runs before the count is
//                     tested and consumes 4 pixels per pass.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    // Broadcast each parameter to the word lanes of all 4 channels x 2 pixels.
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    // xmm5 = 0 for byte->word unpacking; xmm6 = 0xff000000 alpha mask.
    "pxor      %%xmm5,%%xmm5                   \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    "movdqa    (%0),%%xmm7                     \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "sub       $0x4,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//   row             - source pixels, 4 bytes per pixel.
//   cumsum          - output, 4 int32 per pixel (one per channel):
//                     running sum of this row up to and including the pixel,
//                     plus the matching entry of previous_cumsum.
//   previous_cumsum - cumulative sums of the row above, same layout.
//   width           - pixel count; 0 or negative writes nothing.
// The 4 pixel loop is used only when cumsum is 16 byte aligned; otherwise all
// pixels go through the unaligned 1 pixel loop.
// NOTE: only cumsum's alignment is tested, yet the 4 pixel loop also reads
// previous_cumsum with movdqa, so previous_cumsum must share cumsum's
// alignment whenever cumsum is 16 byte aligned.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    // previous_cumsum becomes an offset from cumsum so one pointer walks both.
    "sub       %1,%2                           \n"
    // xmm0 = running per channel sum for this row; xmm1 = 0 for unpacking.
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    // Widen 4 pixels (16 bytes) to four registers of 4 int32 each.
    "movdqu    (%0),%%xmm2                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // Accumulate each pixel into the running sum and add the row above.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqa    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm2,(%1)                     \n"
    "movdqa    %%xmm3,0x10(%1)                 \n"
    "movdqa    %%xmm4,0x20(%1)                 \n"
    "movdqa    %%xmm5,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // The count is biased by -4 here; +3 leaves (remaining - 1) for the
    // jge-terminated 1 pixel loop.
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movd      (%0),%%xmm2                     \n"
    "lea       0x4(%0),%0                      \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts cumulative sums into averaged ARGB pixels.
// For each output pixel x (4 int32 channels per cumulative sum entry):
//   sum = topleft[x] - topleft[x + width] - botleft[x] + botleft[x + width]
//   dst = sum * (1 / area), converted to int and packed to 4 bytes with
//   saturation.
//   topleft - cumulative sums along the top edge of the box.
//   botleft - cumulative sums along the bottom edge of the box.
//   width   - offset between the left and right box edges, in int32 elements
//             (it is scaled by 4 bytes in the address computation).
//   area    - divisor.  The reciprocal comes from rcpss, which is an
//             approximation, so results may differ slightly from an exact
//             divide.
//   dst     - output ARGB pixels; the 4 pixel loop stores with movdqu.
//   count   - number of output pixels; 0 or negative writes nothing.
// topleft is read with movdqa and the other three terms are SSE memory
// operands, so all four addresses must be 16 byte aligned.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count) {
  asm volatile (
    // xmm4 = approximate 1.0f / area in all four lanes.
    "movd      %5,%%xmm4                       \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "rcpss     %%xmm4,%%xmm4                   \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "psubd     0x10(%0,%4,4),%%xmm1            \n"
    "psubd     0x20(%0,%4,4),%%xmm2            \n"
    "psubd     0x30(%0,%4,4),%%xmm3            \n"
    "lea       0x40(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "psubd     0x10(%1),%%xmm1                 \n"
    "psubd     0x20(%1),%%xmm2                 \n"
    "psubd     0x30(%1),%%xmm3                 \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "paddd     0x10(%1,%4,4),%%xmm1            \n"
    "paddd     0x20(%1,%4,4),%%xmm2            \n"
    "paddd     0x30(%1,%4,4),%%xmm3            \n"
    "lea       0x40(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // The count is biased by -4 here; +3 leaves (remaining - 1) for the
    // jge-terminated 1 pixel loop.
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "lea       0x10(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "lea       0x10(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)      // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
// Each source byte is multiplied by the matching byte of value.  Both are
// first widened by byte duplication (x * 0x0101), the high 16 bits of the
// product are taken (pmulhuw) and shifted right by 8, so a value byte of
// 0xff leaves the channel unchanged and 0x00 clears it.
//   src_argb - source row; read with movdqa.
//   dst_argb - destination row; written with movdqa.
//   width    - pixel count.  The loop body runs before the count is tested
//              and consumes 4 pixels per pass.
//   value    - 4 per channel scale bytes, in the same byte order as a pixel.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd      %3,%%xmm2                       \n"
    // Turn dst into an offset from src so a single pointer increment
    // advances both rows.
    "sub       %0,%1                           \n"
    // Widen value bytes to words and replicate for 2 pixels per register.
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADE_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd %%xmm0,%1
//
//   src_argb        - base of the source image.
//   src_argb_stride - source stride in bytes.
//   dst_argb        - destination row; one 4 byte pixel is written per step.
//   uv_dudv         - 4 floats: starting u, v followed by per pixel du, dv.
//   width           - number of destination pixels; 0 or negative writes
//                     nothing.
// For each destination pixel the coordinate is truncated toward zero
// (cvttps2dq) and the pixel at src_argb + u * 4 + v * src_argb_stride is
// copied.  Limits visible in the code:
//   - u, v are saturated to int16 (packssdw) and the stride is held in a
//     16 bit lane for pmaddwd, so the stride must fit in a signed 16 bit
//     value.
//   - in the 64 bit build the byte offset taken from the low 32 bits is
//     masked with 0x0fffffff, limiting it to 28 bits.

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  // Pointer sized copies so the asm can use them as scratch/index registers.
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    // xmm2 = (u, v); xmm7 = (du, dv).
    "movq      (%3),%%xmm2                     \n"
    "movq      0x8(%3),%%xmm7                  \n"
    // xmm5 = word pair (4, stride): pmaddwd with (u, v) gives u * 4 + v * stride.
    "shl       $0x10,%1                        \n"
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    // Set up xmm2 = uv for pixels 0,1; xmm3 = uv for pixels 2,3;
    // xmm4 = 4 * dudv (the per iteration step).
    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm4                   \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "40:                                         \n"
    // Four byte offsets in xmm0.
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "cvttps2dq %%xmm3,%%xmm1                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
#if defined(__x86_64__)
    // One 64 bit move fetches two offsets; split into %1 (low) and %5 (high).
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
    "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
#endif
    "movd      (%0,%1,1),%%xmm1                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm1                   \n"
    "addps     %%xmm4,%%xmm2                   \n"
    "movq      %%xmm1,(%2)                     \n"
#if defined(__x86_64__)
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
#endif
    "movd      (%0,%1,1),%%xmm0                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm0                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "sub       $0x4,%4                         \n"
    "movq      %%xmm0,0x08(%2)                 \n"
    "lea       0x10(%2),%2                     \n"
    "jge       40b                             \n"

  "49:                                         \n"
    // The count is biased by -4 here; +3 leaves (remaining - 1) for the
    // jge-terminated 1 pixel loop.
    "add       $0x3,%4                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  4                               \n"
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"
    "movd      %%xmm0,%1                       \n"
#if defined(__x86_64__)
    "and       $0x0fffffff,%1                  \n"
#endif
    "movd      (%0,%1,1),%%xmm0                \n"
    "sub       $0x1,%4                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),              // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),              // %2
    "+r"(uv_dudv),               // %3
    "+rm"(width),                // %4
    "+r"(temp)                   // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2

fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00003786// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
3787void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
3788 ptrdiff_t src_stride, int dst_width,
3789 int source_y_fraction) {
3790 asm volatile (
3791 "sub %1,%0 \n"
3792 "shr %3 \n"
3793 "cmp $0x0,%3 \n"
3794 "je 2f \n"
3795 "cmp $0x40,%3 \n"
3796 "je 3f \n"
3797 "movd %3,%%xmm0 \n"
3798 "neg %3 \n"
3799 "add $0x80,%3 \n"
3800 "movd %3,%%xmm5 \n"
3801 "punpcklbw %%xmm0,%%xmm5 \n"
3802 "punpcklwd %%xmm5,%%xmm5 \n"
3803 "pshufd $0x0,%%xmm5,%%xmm5 \n"
3804 ".p2align 4 \n"
3805 "1: \n"
3806 "movdqa (%1),%%xmm0 \n"
3807 "movdqa (%1,%4,1),%%xmm2 \n"
3808 "movdqa %%xmm0,%%xmm1 \n"
3809 "punpcklbw %%xmm2,%%xmm0 \n"
3810 "punpckhbw %%xmm2,%%xmm1 \n"
3811 "pmaddubsw %%xmm5,%%xmm0 \n"
3812 "pmaddubsw %%xmm5,%%xmm1 \n"
3813 "psrlw $0x7,%%xmm0 \n"
3814 "psrlw $0x7,%%xmm1 \n"
3815 "packuswb %%xmm1,%%xmm0 \n"
3816 "sub $0x4,%2 \n"
3817 "movdqa %%xmm0,(%1,%0,1) \n"
3818 "lea 0x10(%1),%1 \n"
3819 "jg 1b \n"
3820 "jmp 4f \n"
3821 ".p2align 4 \n"
3822 "2: \n"
3823 "movdqa (%1),%%xmm0 \n"
3824 "sub $0x4,%2 \n"
3825 "movdqa %%xmm0,(%1,%0,1) \n"
3826 "lea 0x10(%1),%1 \n"
3827 "jg 2b \n"
3828 "jmp 4f \n"
3829 ".p2align 4 \n"
3830 "3: \n"
3831 "movdqa (%1),%%xmm0 \n"
3832 "pavgb (%1,%4,1),%%xmm0 \n"
3833 "sub $0x4,%2 \n"
3834 "movdqa %%xmm0,(%1,%0,1) \n"
3835 "lea 0x10(%1),%1 \n"
3836 "jg 3b \n"
3837 "4: \n"
3838 ".p2align 4 \n"
3839 : "+r"(dst_ptr), // %0
3840 "+r"(src_ptr), // %1
3841 "+r"(dst_width), // %2
3842 "+r"(source_y_fraction) // %3
3843 : "r"(static_cast<intptr_t>(src_stride)) // %4
3844 : "memory", "cc"
3845#if defined(__SSE2__)
3846 , "xmm0", "xmm1", "xmm2", "xmm5"
3847#endif
3848 );
3849}
3850
fbarchard@google.come91bdac2012-10-09 21:09:33 +00003851void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
3852 uint8* dst_uv, int pix) {
3853 asm volatile (
3854 "sub %0,%1 \n"
3855 ".p2align 4 \n"
3856 "1: \n"
3857 "movdqa (%0),%%xmm0 \n"
3858 "pavgb (%0,%3),%%xmm0 \n"
3859 "sub $0x10,%2 \n"
3860 "movdqa %%xmm0,(%0,%1) \n"
3861 "lea 0x10(%0),%0 \n"
3862 "jg 1b \n"
3863 : "+r"(src_uv), // %0
3864 "+r"(dst_uv), // %1
3865 "+r"(pix) // %2
3866 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
3867 : "memory", "cc"
3868#if defined(__SSE2__)
3869 , "xmm0"
3870#endif
3871 );
3872}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00003873
3874void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
3875 uint32 selector, int pix) {
3876 asm volatile (
3877 "movd %3,%%xmm5 \n"
3878 "pshufd $0x0,%%xmm5,%%xmm5 \n"
3879 ".p2align 4 \n"
3880 "1: \n"
3881 "movdqa (%0),%%xmm0 \n"
3882 "lea 0x10(%0),%0 \n"
3883 "pshufb %%xmm5,%%xmm0 \n"
3884 "sub $0x4,%2 \n"
3885 "movd %%xmm0,(%1) \n"
3886 "lea 0x4(%1),%1 \n"
3887 "jg 1b \n"
3888 : "+r"(src_argb), // %0
3889 "+r"(dst_bayer), // %1
3890 "+r"(pix) // %2
3891 : "g"(selector) // %3
3892 : "memory", "cc"
3893#if defined(__SSE2__)
3894 , "xmm0", "xmm5"
3895#endif
3896 );
3897}
fbarchard@google.com2d11d432012-02-16 02:50:39 +00003898#endif // defined(__x86_64__) || defined(__i386__)
3899
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003900#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00003901} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00003902} // namespace libyuv
3903#endif