blob: b92a9f5c13bdc8cc7ffaae7bdb8be015ec47a796 [file] [log] [blame]
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
// This module is for GCC x86 and x64
fbarchard@google.com83a63e62013-02-27 00:20:29 +000021#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// GCC 4.2 on OSX has a link error when passing static or const data to
// inline assembly, so on Apple targets CONST expands to nothing and the
// tables below become plain (non-static, non-const) globals.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +000038// JPeg full range.
39CONST vec8 kARGBToYJ = {
fbarchard@google.com050b39a2013-04-01 20:07:14 +000040 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +000041};
42
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000043CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000044 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
45};
46
fbarchard@google.com050b39a2013-04-01 20:07:14 +000047CONST vec8 kARGBToUJ = {
48 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
49};
50
fbarchard@google.com714050a2012-02-17 22:59:56 +000051CONST vec8 kARGBToV = {
52 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
53};
54
fbarchard@google.com050b39a2013-04-01 20:07:14 +000055CONST vec8 kARGBToVJ = {
56 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
57};
58
fbarchard@google.com714050a2012-02-17 22:59:56 +000059// Constants for BGRA
60CONST vec8 kBGRAToY = {
61 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
62};
63
64CONST vec8 kBGRAToU = {
65 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
66};
67
68CONST vec8 kBGRAToV = {
69 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
70};
71
72// Constants for ABGR
73CONST vec8 kABGRToY = {
74 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
75};
76
77CONST vec8 kABGRToU = {
78 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
79};
80
81CONST vec8 kABGRToV = {
82 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
83};
84
fbarchard@google.com4de0c432012-10-11 01:25:46 +000085// Constants for RGBA.
86CONST vec8 kRGBAToY = {
87 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
88};
89
90CONST vec8 kRGBAToU = {
91 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
92};
93
94CONST vec8 kRGBAToV = {
95 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
96};
97
fbarchard@google.com714050a2012-02-17 22:59:56 +000098CONST uvec8 kAddY16 = {
99 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +0000100};
fbarchard@google.com2430e042011-11-11 21:57:06 +0000101
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000102CONST vec16 kAddYJ64 = {
103 64, 64, 64, 64, 64, 64, 64, 64
104};
105
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000106CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000107 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
108 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
109};
fbarchard@google.com228bdc22011-11-15 21:58:26 +0000110
fbarchard@google.com050b39a2013-04-01 20:07:14 +0000111CONST uvec16 kAddUVJ128 = {
112 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
113};
114
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000115// Shuffle table for converting RGB24 to ARGB.
116CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000117 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
118};
119
120// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000121CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000122 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
123};
124
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000125// Shuffle table for converting ARGB to RGB24.
126CONST uvec8 kShuffleMaskARGBToRGB24 = {
127 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
128};
129
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000130// Shuffle table for converting ARGB to RAW.
131CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000132 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000133};
134
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000135// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000136CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
137 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
138};
139
140// Shuffle table for converting ARGB to RAW.
141CONST uvec8 kShuffleMaskARGBToRAW_0 = {
142 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
143};
144
fbarchard@google.comb6149762011-11-07 21:58:52 +0000145void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000146 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000147 "pcmpeqb %%xmm5,%%xmm5 \n"
148 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000149 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000150 "1: \n"
151 "movq (%0),%%xmm0 \n"
152 "lea 0x8(%0),%0 \n"
153 "punpcklbw %%xmm0,%%xmm0 \n"
154 "movdqa %%xmm0,%%xmm1 \n"
155 "punpcklwd %%xmm0,%%xmm0 \n"
156 "punpckhwd %%xmm1,%%xmm1 \n"
157 "por %%xmm5,%%xmm0 \n"
158 "por %%xmm5,%%xmm1 \n"
159 "movdqa %%xmm0,(%1) \n"
160 "movdqa %%xmm1,0x10(%1) \n"
161 "lea 0x20(%1),%1 \n"
162 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000163 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000164 : "+r"(src_y), // %0
165 "+r"(dst_argb), // %1
166 "+r"(pix) // %2
167 :
168 : "memory", "cc"
169#if defined(__SSE2__)
170 , "xmm0", "xmm1", "xmm5"
171#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000172 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000173}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000174
fbarchard@google.com00b69a22012-11-02 06:03:28 +0000175void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
176 int pix) {
177 asm volatile (
178 "pcmpeqb %%xmm5,%%xmm5 \n"
179 "pslld $0x18,%%xmm5 \n"
180 ".p2align 4 \n"
181 "1: \n"
182 "movq (%0),%%xmm0 \n"
183 "lea 0x8(%0),%0 \n"
184 "punpcklbw %%xmm0,%%xmm0 \n"
185 "movdqa %%xmm0,%%xmm1 \n"
186 "punpcklwd %%xmm0,%%xmm0 \n"
187 "punpckhwd %%xmm1,%%xmm1 \n"
188 "por %%xmm5,%%xmm0 \n"
189 "por %%xmm5,%%xmm1 \n"
190 "movdqu %%xmm0,(%1) \n"
191 "movdqu %%xmm1,0x10(%1) \n"
192 "lea 0x20(%1),%1 \n"
193 "sub $0x8,%2 \n"
194 "jg 1b \n"
195 : "+r"(src_y), // %0
196 "+r"(dst_argb), // %1
197 "+r"(pix) // %2
198 :
199 : "memory", "cc"
200#if defined(__SSE2__)
201 , "xmm0", "xmm1", "xmm5"
202#endif
203 );
204}
205
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000206void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000207 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000208 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
209 "pslld $0x18,%%xmm5 \n"
210 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000211 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000212 "1: \n"
213 "movdqu (%0),%%xmm0 \n"
214 "movdqu 0x10(%0),%%xmm1 \n"
215 "movdqu 0x20(%0),%%xmm3 \n"
216 "lea 0x30(%0),%0 \n"
217 "movdqa %%xmm3,%%xmm2 \n"
218 "palignr $0x8,%%xmm1,%%xmm2 \n"
219 "pshufb %%xmm4,%%xmm2 \n"
220 "por %%xmm5,%%xmm2 \n"
221 "palignr $0xc,%%xmm0,%%xmm1 \n"
222 "pshufb %%xmm4,%%xmm0 \n"
223 "movdqa %%xmm2,0x20(%1) \n"
224 "por %%xmm5,%%xmm0 \n"
225 "pshufb %%xmm4,%%xmm1 \n"
226 "movdqa %%xmm0,(%1) \n"
227 "por %%xmm5,%%xmm1 \n"
228 "palignr $0x4,%%xmm3,%%xmm3 \n"
229 "pshufb %%xmm4,%%xmm3 \n"
230 "movdqa %%xmm1,0x10(%1) \n"
231 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000232 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000233 "movdqa %%xmm3,0x30(%1) \n"
234 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000235 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000236 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000237 "+r"(dst_argb), // %1
238 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000239 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000240 : "memory", "cc"
241#if defined(__SSE2__)
242 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
243#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000244 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000245}
246
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000247void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000248 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000249 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
250 "pslld $0x18,%%xmm5 \n"
251 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000252 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000253 "1: \n"
254 "movdqu (%0),%%xmm0 \n"
255 "movdqu 0x10(%0),%%xmm1 \n"
256 "movdqu 0x20(%0),%%xmm3 \n"
257 "lea 0x30(%0),%0 \n"
258 "movdqa %%xmm3,%%xmm2 \n"
259 "palignr $0x8,%%xmm1,%%xmm2 \n"
260 "pshufb %%xmm4,%%xmm2 \n"
261 "por %%xmm5,%%xmm2 \n"
262 "palignr $0xc,%%xmm0,%%xmm1 \n"
263 "pshufb %%xmm4,%%xmm0 \n"
264 "movdqa %%xmm2,0x20(%1) \n"
265 "por %%xmm5,%%xmm0 \n"
266 "pshufb %%xmm4,%%xmm1 \n"
267 "movdqa %%xmm0,(%1) \n"
268 "por %%xmm5,%%xmm1 \n"
269 "palignr $0x4,%%xmm3,%%xmm3 \n"
270 "pshufb %%xmm4,%%xmm3 \n"
271 "movdqa %%xmm1,0x10(%1) \n"
272 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000273 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000274 "movdqa %%xmm3,0x30(%1) \n"
275 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000276 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000277 : "+r"(src_raw), // %0
278 "+r"(dst_argb), // %1
279 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000280 : "m"(kShuffleMaskRAWToARGB) // %3
281 : "memory", "cc"
282#if defined(__SSE2__)
283 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
284#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000285 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000286}
287
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000288void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000289 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000290 "mov $0x1080108,%%eax \n"
291 "movd %%eax,%%xmm5 \n"
292 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000293 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000294 "movd %%eax,%%xmm6 \n"
295 "pshufd $0x0,%%xmm6,%%xmm6 \n"
296 "pcmpeqb %%xmm3,%%xmm3 \n"
297 "psllw $0xb,%%xmm3 \n"
298 "pcmpeqb %%xmm4,%%xmm4 \n"
299 "psllw $0xa,%%xmm4 \n"
300 "psrlw $0x5,%%xmm4 \n"
301 "pcmpeqb %%xmm7,%%xmm7 \n"
302 "psllw $0x8,%%xmm7 \n"
303 "sub %0,%1 \n"
304 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000305 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000306 "1: \n"
307 "movdqu (%0),%%xmm0 \n"
308 "movdqa %%xmm0,%%xmm1 \n"
309 "movdqa %%xmm0,%%xmm2 \n"
310 "pand %%xmm3,%%xmm1 \n"
311 "psllw $0xb,%%xmm2 \n"
312 "pmulhuw %%xmm5,%%xmm1 \n"
313 "pmulhuw %%xmm5,%%xmm2 \n"
314 "psllw $0x8,%%xmm1 \n"
315 "por %%xmm2,%%xmm1 \n"
316 "pand %%xmm4,%%xmm0 \n"
317 "pmulhuw %%xmm6,%%xmm0 \n"
318 "por %%xmm7,%%xmm0 \n"
319 "movdqa %%xmm1,%%xmm2 \n"
320 "punpcklbw %%xmm0,%%xmm1 \n"
321 "punpckhbw %%xmm0,%%xmm2 \n"
322 "movdqa %%xmm1,(%1,%0,2) \n"
323 "movdqa %%xmm2,0x10(%1,%0,2) \n"
324 "lea 0x10(%0),%0 \n"
325 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000326 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000327 : "+r"(src), // %0
328 "+r"(dst), // %1
329 "+r"(pix) // %2
330 :
331 : "memory", "cc", "eax"
332#if defined(__SSE2__)
333 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
334#endif
335 );
336}
337
338void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000339 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000340 "mov $0x1080108,%%eax \n"
341 "movd %%eax,%%xmm5 \n"
342 "pshufd $0x0,%%xmm5,%%xmm5 \n"
343 "mov $0x42004200,%%eax \n"
344 "movd %%eax,%%xmm6 \n"
345 "pshufd $0x0,%%xmm6,%%xmm6 \n"
346 "pcmpeqb %%xmm3,%%xmm3 \n"
347 "psllw $0xb,%%xmm3 \n"
348 "movdqa %%xmm3,%%xmm4 \n"
349 "psrlw $0x6,%%xmm4 \n"
350 "pcmpeqb %%xmm7,%%xmm7 \n"
351 "psllw $0x8,%%xmm7 \n"
352 "sub %0,%1 \n"
353 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000354 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000355 "1: \n"
356 "movdqu (%0),%%xmm0 \n"
357 "movdqa %%xmm0,%%xmm1 \n"
358 "movdqa %%xmm0,%%xmm2 \n"
359 "psllw $0x1,%%xmm1 \n"
360 "psllw $0xb,%%xmm2 \n"
361 "pand %%xmm3,%%xmm1 \n"
362 "pmulhuw %%xmm5,%%xmm2 \n"
363 "pmulhuw %%xmm5,%%xmm1 \n"
364 "psllw $0x8,%%xmm1 \n"
365 "por %%xmm2,%%xmm1 \n"
366 "movdqa %%xmm0,%%xmm2 \n"
367 "pand %%xmm4,%%xmm0 \n"
368 "psraw $0x8,%%xmm2 \n"
369 "pmulhuw %%xmm6,%%xmm0 \n"
370 "pand %%xmm7,%%xmm2 \n"
371 "por %%xmm2,%%xmm0 \n"
372 "movdqa %%xmm1,%%xmm2 \n"
373 "punpcklbw %%xmm0,%%xmm1 \n"
374 "punpckhbw %%xmm0,%%xmm2 \n"
375 "movdqa %%xmm1,(%1,%0,2) \n"
376 "movdqa %%xmm2,0x10(%1,%0,2) \n"
377 "lea 0x10(%0),%0 \n"
378 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000379 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000380 : "+r"(src), // %0
381 "+r"(dst), // %1
382 "+r"(pix) // %2
383 :
384 : "memory", "cc", "eax"
385#if defined(__SSE2__)
386 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
387#endif
388 );
389}
390
391void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000392 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000393 "mov $0xf0f0f0f,%%eax \n"
394 "movd %%eax,%%xmm4 \n"
395 "pshufd $0x0,%%xmm4,%%xmm4 \n"
396 "movdqa %%xmm4,%%xmm5 \n"
397 "pslld $0x4,%%xmm5 \n"
398 "sub %0,%1 \n"
399 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000400 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000401 "1: \n"
402 "movdqu (%0),%%xmm0 \n"
403 "movdqa %%xmm0,%%xmm2 \n"
404 "pand %%xmm4,%%xmm0 \n"
405 "pand %%xmm5,%%xmm2 \n"
406 "movdqa %%xmm0,%%xmm1 \n"
407 "movdqa %%xmm2,%%xmm3 \n"
408 "psllw $0x4,%%xmm1 \n"
409 "psrlw $0x4,%%xmm3 \n"
410 "por %%xmm1,%%xmm0 \n"
411 "por %%xmm3,%%xmm2 \n"
412 "movdqa %%xmm0,%%xmm1 \n"
413 "punpcklbw %%xmm2,%%xmm0 \n"
414 "punpckhbw %%xmm2,%%xmm1 \n"
415 "movdqa %%xmm0,(%1,%0,2) \n"
416 "movdqa %%xmm1,0x10(%1,%0,2) \n"
417 "lea 0x10(%0),%0 \n"
418 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000419 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000420 : "+r"(src), // %0
421 "+r"(dst), // %1
422 "+r"(pix) // %2
423 :
424 : "memory", "cc", "eax"
425#if defined(__SSE2__)
426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
427#endif
428 );
429}
430
431void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000432 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000433 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000434 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000435 "1: \n"
436 "movdqa (%0),%%xmm0 \n"
437 "movdqa 0x10(%0),%%xmm1 \n"
438 "movdqa 0x20(%0),%%xmm2 \n"
439 "movdqa 0x30(%0),%%xmm3 \n"
440 "lea 0x40(%0),%0 \n"
441 "pshufb %%xmm6,%%xmm0 \n"
442 "pshufb %%xmm6,%%xmm1 \n"
443 "pshufb %%xmm6,%%xmm2 \n"
444 "pshufb %%xmm6,%%xmm3 \n"
445 "movdqa %%xmm1,%%xmm4 \n"
446 "psrldq $0x4,%%xmm1 \n"
447 "pslldq $0xc,%%xmm4 \n"
448 "movdqa %%xmm2,%%xmm5 \n"
449 "por %%xmm4,%%xmm0 \n"
450 "pslldq $0x8,%%xmm5 \n"
451 "movdqa %%xmm0,(%1) \n"
452 "por %%xmm5,%%xmm1 \n"
453 "psrldq $0x8,%%xmm2 \n"
454 "pslldq $0x4,%%xmm3 \n"
455 "por %%xmm3,%%xmm2 \n"
456 "movdqa %%xmm1,0x10(%1) \n"
457 "movdqa %%xmm2,0x20(%1) \n"
458 "lea 0x30(%1),%1 \n"
459 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000460 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000461 : "+r"(src), // %0
462 "+r"(dst), // %1
463 "+r"(pix) // %2
464 : "m"(kShuffleMaskARGBToRGB24) // %3
465 : "memory", "cc"
466#if defined(__SSE2__)
467 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
468#endif
469 );
470}
471
472void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000473 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000474 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000475 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000476 "1: \n"
477 "movdqa (%0),%%xmm0 \n"
478 "movdqa 0x10(%0),%%xmm1 \n"
479 "movdqa 0x20(%0),%%xmm2 \n"
480 "movdqa 0x30(%0),%%xmm3 \n"
481 "lea 0x40(%0),%0 \n"
482 "pshufb %%xmm6,%%xmm0 \n"
483 "pshufb %%xmm6,%%xmm1 \n"
484 "pshufb %%xmm6,%%xmm2 \n"
485 "pshufb %%xmm6,%%xmm3 \n"
486 "movdqa %%xmm1,%%xmm4 \n"
487 "psrldq $0x4,%%xmm1 \n"
488 "pslldq $0xc,%%xmm4 \n"
489 "movdqa %%xmm2,%%xmm5 \n"
490 "por %%xmm4,%%xmm0 \n"
491 "pslldq $0x8,%%xmm5 \n"
492 "movdqa %%xmm0,(%1) \n"
493 "por %%xmm5,%%xmm1 \n"
494 "psrldq $0x8,%%xmm2 \n"
495 "pslldq $0x4,%%xmm3 \n"
496 "por %%xmm3,%%xmm2 \n"
497 "movdqa %%xmm1,0x10(%1) \n"
498 "movdqa %%xmm2,0x20(%1) \n"
499 "lea 0x30(%1),%1 \n"
500 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000501 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000502 : "+r"(src), // %0
503 "+r"(dst), // %1
504 "+r"(pix) // %2
505 : "m"(kShuffleMaskARGBToRAW) // %3
506 : "memory", "cc"
507#if defined(__SSE2__)
508 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
509#endif
510 );
511}
512
fbarchard@google.comdbcabea2012-10-29 21:20:25 +0000513void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000514 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000515 "pcmpeqb %%xmm3,%%xmm3 \n"
516 "psrld $0x1b,%%xmm3 \n"
517 "pcmpeqb %%xmm4,%%xmm4 \n"
518 "psrld $0x1a,%%xmm4 \n"
519 "pslld $0x5,%%xmm4 \n"
520 "pcmpeqb %%xmm5,%%xmm5 \n"
521 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000522 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000523 "1: \n"
524 "movdqa (%0),%%xmm0 \n"
525 "movdqa %%xmm0,%%xmm1 \n"
526 "movdqa %%xmm0,%%xmm2 \n"
527 "pslld $0x8,%%xmm0 \n"
528 "psrld $0x3,%%xmm1 \n"
529 "psrld $0x5,%%xmm2 \n"
530 "psrad $0x10,%%xmm0 \n"
531 "pand %%xmm3,%%xmm1 \n"
532 "pand %%xmm4,%%xmm2 \n"
533 "pand %%xmm5,%%xmm0 \n"
534 "por %%xmm2,%%xmm1 \n"
535 "por %%xmm1,%%xmm0 \n"
536 "packssdw %%xmm0,%%xmm0 \n"
537 "lea 0x10(%0),%0 \n"
538 "movq %%xmm0,(%1) \n"
539 "lea 0x8(%1),%1 \n"
540 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000541 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000542 : "+r"(src), // %0
543 "+r"(dst), // %1
544 "+r"(pix) // %2
545 :
546 : "memory", "cc"
547#if defined(__SSE2__)
548 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
549#endif
550 );
551}
552
553void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000554 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000555 "pcmpeqb %%xmm4,%%xmm4 \n"
556 "psrld $0x1b,%%xmm4 \n"
557 "movdqa %%xmm4,%%xmm5 \n"
558 "pslld $0x5,%%xmm5 \n"
559 "movdqa %%xmm4,%%xmm6 \n"
560 "pslld $0xa,%%xmm6 \n"
561 "pcmpeqb %%xmm7,%%xmm7 \n"
562 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000563 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000564 "1: \n"
565 "movdqa (%0),%%xmm0 \n"
566 "movdqa %%xmm0,%%xmm1 \n"
567 "movdqa %%xmm0,%%xmm2 \n"
568 "movdqa %%xmm0,%%xmm3 \n"
569 "psrad $0x10,%%xmm0 \n"
570 "psrld $0x3,%%xmm1 \n"
571 "psrld $0x6,%%xmm2 \n"
572 "psrld $0x9,%%xmm3 \n"
573 "pand %%xmm7,%%xmm0 \n"
574 "pand %%xmm4,%%xmm1 \n"
575 "pand %%xmm5,%%xmm2 \n"
576 "pand %%xmm6,%%xmm3 \n"
577 "por %%xmm1,%%xmm0 \n"
578 "por %%xmm3,%%xmm2 \n"
579 "por %%xmm2,%%xmm0 \n"
580 "packssdw %%xmm0,%%xmm0 \n"
581 "lea 0x10(%0),%0 \n"
582 "movq %%xmm0,(%1) \n"
583 "lea 0x8(%1),%1 \n"
584 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000585 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000586 : "+r"(src), // %0
587 "+r"(dst), // %1
588 "+r"(pix) // %2
589 :
590 : "memory", "cc"
591#if defined(__SSE2__)
592 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
593#endif
594 );
595}
596
597void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000598 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000599 "pcmpeqb %%xmm4,%%xmm4 \n"
600 "psllw $0xc,%%xmm4 \n"
601 "movdqa %%xmm4,%%xmm3 \n"
602 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000603 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000604 "1: \n"
605 "movdqa (%0),%%xmm0 \n"
606 "movdqa %%xmm0,%%xmm1 \n"
607 "pand %%xmm3,%%xmm0 \n"
608 "pand %%xmm4,%%xmm1 \n"
609 "psrlq $0x4,%%xmm0 \n"
610 "psrlq $0x8,%%xmm1 \n"
611 "por %%xmm1,%%xmm0 \n"
612 "packuswb %%xmm0,%%xmm0 \n"
613 "lea 0x10(%0),%0 \n"
614 "movq %%xmm0,(%1) \n"
615 "lea 0x8(%1),%1 \n"
616 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000617 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000618 : "+r"(src), // %0
619 "+r"(dst), // %1
620 "+r"(pix) // %2
621 :
622 : "memory", "cc"
623#if defined(__SSE2__)
624 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
625#endif
626 );
627}
628
fbarchard@google.comb6149762011-11-07 21:58:52 +0000629void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000630 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000631 "movdqa %4,%%xmm5 \n"
632 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000633 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000634 "1: \n"
635 "movdqa (%0),%%xmm0 \n"
636 "movdqa 0x10(%0),%%xmm1 \n"
637 "movdqa 0x20(%0),%%xmm2 \n"
638 "movdqa 0x30(%0),%%xmm3 \n"
639 "pmaddubsw %%xmm4,%%xmm0 \n"
640 "pmaddubsw %%xmm4,%%xmm1 \n"
641 "pmaddubsw %%xmm4,%%xmm2 \n"
642 "pmaddubsw %%xmm4,%%xmm3 \n"
643 "lea 0x40(%0),%0 \n"
644 "phaddw %%xmm1,%%xmm0 \n"
645 "phaddw %%xmm3,%%xmm2 \n"
646 "psrlw $0x7,%%xmm0 \n"
647 "psrlw $0x7,%%xmm2 \n"
648 "packuswb %%xmm2,%%xmm0 \n"
649 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000650 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000651 "movdqa %%xmm0,(%1) \n"
652 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000653 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000654 : "+r"(src_argb), // %0
655 "+r"(dst_y), // %1
656 "+r"(pix) // %2
657 : "m"(kARGBToY), // %3
658 "m"(kAddY16) // %4
659 : "memory", "cc"
660#if defined(__SSE2__)
661 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
662#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000663 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000664}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000665
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000666void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
667 asm volatile (
668 "movdqa %3,%%xmm4 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000669 "movdqa %4,%%xmm5 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000670 ".p2align 4 \n"
671 "1: \n"
672 "movdqa (%0),%%xmm0 \n"
673 "movdqa 0x10(%0),%%xmm1 \n"
674 "movdqa 0x20(%0),%%xmm2 \n"
675 "movdqa 0x30(%0),%%xmm3 \n"
676 "pmaddubsw %%xmm4,%%xmm0 \n"
677 "pmaddubsw %%xmm4,%%xmm1 \n"
678 "pmaddubsw %%xmm4,%%xmm2 \n"
679 "pmaddubsw %%xmm4,%%xmm3 \n"
680 "lea 0x40(%0),%0 \n"
681 "phaddw %%xmm1,%%xmm0 \n"
682 "phaddw %%xmm3,%%xmm2 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000683 "paddw %%xmm5,%%xmm0 \n"
684 "paddw %%xmm5,%%xmm2 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000685 "psrlw $0x7,%%xmm0 \n"
686 "psrlw $0x7,%%xmm2 \n"
687 "packuswb %%xmm2,%%xmm0 \n"
688 "sub $0x10,%2 \n"
689 "movdqa %%xmm0,(%1) \n"
690 "lea 0x10(%1),%1 \n"
691 "jg 1b \n"
692 : "+r"(src_argb), // %0
693 "+r"(dst_y), // %1
694 "+r"(pix) // %2
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000695 : "m"(kARGBToYJ), // %3
696 "m"(kAddYJ64) // %4
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000697 : "memory", "cc"
698#if defined(__SSE2__)
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000699 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000700#endif
701 );
702}
703
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000704void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000705 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000706 "movdqa %4,%%xmm5 \n"
707 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000708 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000709 "1: \n"
710 "movdqu (%0),%%xmm0 \n"
711 "movdqu 0x10(%0),%%xmm1 \n"
712 "movdqu 0x20(%0),%%xmm2 \n"
713 "movdqu 0x30(%0),%%xmm3 \n"
714 "pmaddubsw %%xmm4,%%xmm0 \n"
715 "pmaddubsw %%xmm4,%%xmm1 \n"
716 "pmaddubsw %%xmm4,%%xmm2 \n"
717 "pmaddubsw %%xmm4,%%xmm3 \n"
718 "lea 0x40(%0),%0 \n"
719 "phaddw %%xmm1,%%xmm0 \n"
720 "phaddw %%xmm3,%%xmm2 \n"
721 "psrlw $0x7,%%xmm0 \n"
722 "psrlw $0x7,%%xmm2 \n"
723 "packuswb %%xmm2,%%xmm0 \n"
724 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000725 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000726 "movdqu %%xmm0,(%1) \n"
727 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000728 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000729 : "+r"(src_argb), // %0
730 "+r"(dst_y), // %1
731 "+r"(pix) // %2
732 : "m"(kARGBToY), // %3
733 "m"(kAddY16) // %4
734 : "memory", "cc"
735#if defined(__SSE2__)
736 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
737#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000738 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000739}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000740
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000741void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
742 asm volatile (
743 "movdqa %3,%%xmm4 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000744 "movdqa %4,%%xmm5 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000745 ".p2align 4 \n"
746 "1: \n"
747 "movdqu (%0),%%xmm0 \n"
748 "movdqu 0x10(%0),%%xmm1 \n"
749 "movdqu 0x20(%0),%%xmm2 \n"
750 "movdqu 0x30(%0),%%xmm3 \n"
751 "pmaddubsw %%xmm4,%%xmm0 \n"
752 "pmaddubsw %%xmm4,%%xmm1 \n"
753 "pmaddubsw %%xmm4,%%xmm2 \n"
754 "pmaddubsw %%xmm4,%%xmm3 \n"
755 "lea 0x40(%0),%0 \n"
756 "phaddw %%xmm1,%%xmm0 \n"
757 "phaddw %%xmm3,%%xmm2 \n"
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000758 "paddw %%xmm5,%%xmm0 \n"
759 "paddw %%xmm5,%%xmm2 \n"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000760 "psrlw $0x7,%%xmm0 \n"
761 "psrlw $0x7,%%xmm2 \n"
762 "packuswb %%xmm2,%%xmm0 \n"
763 "sub $0x10,%2 \n"
764 "movdqu %%xmm0,(%1) \n"
765 "lea 0x10(%1),%1 \n"
766 "jg 1b \n"
767 : "+r"(src_argb), // %0
768 "+r"(dst_y), // %1
769 "+r"(pix) // %2
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000770 : "m"(kARGBToYJ), // %3
771 "m"(kAddYJ64) // %4
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000772 : "memory", "cc"
773#if defined(__SSE2__)
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000774 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comcfaa66c2013-03-26 09:14:46 +0000775#endif
776 );
777}
fbarchard@google.com4e0d7cc2013-03-27 07:35:03 +0000778
fbarchard@google.com714050a2012-02-17 22:59:56 +0000779// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000780// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
781// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
782// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000783// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000784void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
785 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000786 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000787 "movdqa %0,%%xmm4 \n"
788 "movdqa %1,%%xmm3 \n"
789 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000790 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000791 : "m"(kARGBToU), // %0
792 "m"(kARGBToV), // %1
793 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000794 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000795 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000796 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000797 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000798 "1: \n"
799 "movdqa (%0),%%xmm0 \n"
800 "movdqa 0x10(%0),%%xmm1 \n"
801 "movdqa 0x20(%0),%%xmm2 \n"
802 "movdqa 0x30(%0),%%xmm6 \n"
803 "pavgb (%0,%4,1),%%xmm0 \n"
804 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
805 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
806 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
807 "lea 0x40(%0),%0 \n"
808 "movdqa %%xmm0,%%xmm7 \n"
809 "shufps $0x88,%%xmm1,%%xmm0 \n"
810 "shufps $0xdd,%%xmm1,%%xmm7 \n"
811 "pavgb %%xmm7,%%xmm0 \n"
812 "movdqa %%xmm2,%%xmm7 \n"
813 "shufps $0x88,%%xmm6,%%xmm2 \n"
814 "shufps $0xdd,%%xmm6,%%xmm7 \n"
815 "pavgb %%xmm7,%%xmm2 \n"
816 "movdqa %%xmm0,%%xmm1 \n"
817 "movdqa %%xmm2,%%xmm6 \n"
818 "pmaddubsw %%xmm4,%%xmm0 \n"
819 "pmaddubsw %%xmm4,%%xmm2 \n"
820 "pmaddubsw %%xmm3,%%xmm1 \n"
821 "pmaddubsw %%xmm3,%%xmm6 \n"
822 "phaddw %%xmm2,%%xmm0 \n"
823 "phaddw %%xmm6,%%xmm1 \n"
824 "psraw $0x8,%%xmm0 \n"
825 "psraw $0x8,%%xmm1 \n"
826 "packsswb %%xmm1,%%xmm0 \n"
827 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000828 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000829 "movlps %%xmm0,(%1) \n"
830 "movhps %%xmm0,(%1,%2,1) \n"
831 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000832 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000833 : "+r"(src_argb0), // %0
834 "+r"(dst_u), // %1
835 "+r"(dst_v), // %2
836 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000837 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000838 : "memory", "cc"
839#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000840 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000841#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000842 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000843}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000844
fbarchard@google.com050b39a2013-04-01 20:07:14 +0000845// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
846void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
847 uint8* dst_u, uint8* dst_v, int width) {
848 asm volatile (
849 "movdqa %0,%%xmm4 \n"
850 "movdqa %1,%%xmm3 \n"
851 "movdqa %2,%%xmm5 \n"
852 :
853 : "m"(kARGBToUJ), // %0
854 "m"(kARGBToVJ), // %1
855 "m"(kAddUVJ128) // %2
856 );
857 asm volatile (
858 "sub %1,%2 \n"
859 ".p2align 4 \n"
860 "1: \n"
861 "movdqa (%0),%%xmm0 \n"
862 "movdqa 0x10(%0),%%xmm1 \n"
863 "movdqa 0x20(%0),%%xmm2 \n"
864 "movdqa 0x30(%0),%%xmm6 \n"
865 "pavgb (%0,%4,1),%%xmm0 \n"
866 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
867 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
868 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
869 "lea 0x40(%0),%0 \n"
870 "movdqa %%xmm0,%%xmm7 \n"
871 "shufps $0x88,%%xmm1,%%xmm0 \n"
872 "shufps $0xdd,%%xmm1,%%xmm7 \n"
873 "pavgb %%xmm7,%%xmm0 \n"
874 "movdqa %%xmm2,%%xmm7 \n"
875 "shufps $0x88,%%xmm6,%%xmm2 \n"
876 "shufps $0xdd,%%xmm6,%%xmm7 \n"
877 "pavgb %%xmm7,%%xmm2 \n"
878 "movdqa %%xmm0,%%xmm1 \n"
879 "movdqa %%xmm2,%%xmm6 \n"
880 "pmaddubsw %%xmm4,%%xmm0 \n"
881 "pmaddubsw %%xmm4,%%xmm2 \n"
882 "pmaddubsw %%xmm3,%%xmm1 \n"
883 "pmaddubsw %%xmm3,%%xmm6 \n"
884 "phaddw %%xmm2,%%xmm0 \n"
885 "phaddw %%xmm6,%%xmm1 \n"
886 "paddw %%xmm5,%%xmm0 \n"
887 "paddw %%xmm5,%%xmm1 \n"
888 "psraw $0x8,%%xmm0 \n"
889 "psraw $0x8,%%xmm1 \n"
890 "packsswb %%xmm1,%%xmm0 \n"
891 "sub $0x10,%3 \n"
892 "movlps %%xmm0,(%1) \n"
893 "movhps %%xmm0,(%1,%2,1) \n"
894 "lea 0x8(%1),%1 \n"
895 "jg 1b \n"
896 : "+r"(src_argb0), // %0
897 "+r"(dst_u), // %1
898 "+r"(dst_v), // %2
899 "+rm"(width) // %3
900 : "r"(static_cast<intptr_t>(src_stride_argb))
901 : "memory", "cc"
902#if defined(__SSE2__)
903 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
904#endif
905 );
906}
907
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000908void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
909 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000910 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000911 "movdqa %0,%%xmm4 \n"
912 "movdqa %1,%%xmm3 \n"
913 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000914 :
915 : "m"(kARGBToU), // %0
916 "m"(kARGBToV), // %1
917 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000918 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000919 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000920 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000921 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000922 "1: \n"
923 "movdqu (%0),%%xmm0 \n"
924 "movdqu 0x10(%0),%%xmm1 \n"
925 "movdqu 0x20(%0),%%xmm2 \n"
926 "movdqu 0x30(%0),%%xmm6 \n"
927 "movdqu (%0,%4,1),%%xmm7 \n"
928 "pavgb %%xmm7,%%xmm0 \n"
929 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
930 "pavgb %%xmm7,%%xmm1 \n"
931 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
932 "pavgb %%xmm7,%%xmm2 \n"
933 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
934 "pavgb %%xmm7,%%xmm6 \n"
935 "lea 0x40(%0),%0 \n"
936 "movdqa %%xmm0,%%xmm7 \n"
937 "shufps $0x88,%%xmm1,%%xmm0 \n"
938 "shufps $0xdd,%%xmm1,%%xmm7 \n"
939 "pavgb %%xmm7,%%xmm0 \n"
940 "movdqa %%xmm2,%%xmm7 \n"
941 "shufps $0x88,%%xmm6,%%xmm2 \n"
942 "shufps $0xdd,%%xmm6,%%xmm7 \n"
943 "pavgb %%xmm7,%%xmm2 \n"
944 "movdqa %%xmm0,%%xmm1 \n"
945 "movdqa %%xmm2,%%xmm6 \n"
946 "pmaddubsw %%xmm4,%%xmm0 \n"
947 "pmaddubsw %%xmm4,%%xmm2 \n"
948 "pmaddubsw %%xmm3,%%xmm1 \n"
949 "pmaddubsw %%xmm3,%%xmm6 \n"
950 "phaddw %%xmm2,%%xmm0 \n"
951 "phaddw %%xmm6,%%xmm1 \n"
952 "psraw $0x8,%%xmm0 \n"
953 "psraw $0x8,%%xmm1 \n"
954 "packsswb %%xmm1,%%xmm0 \n"
955 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000956 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000957 "movlps %%xmm0,(%1) \n"
958 "movhps %%xmm0,(%1,%2,1) \n"
959 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000960 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000961 : "+r"(src_argb0), // %0
962 "+r"(dst_u), // %1
963 "+r"(dst_v), // %2
964 "+rm"(width) // %3
965 : "r"(static_cast<intptr_t>(src_stride_argb))
966 : "memory", "cc"
967#if defined(__SSE2__)
968 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
969#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000970 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000971}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000972
fbarchard@google.com050b39a2013-04-01 20:07:14 +0000973void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
974 uint8* dst_u, uint8* dst_v, int width) {
975 asm volatile (
976 "movdqa %0,%%xmm4 \n"
977 "movdqa %1,%%xmm3 \n"
978 "movdqa %2,%%xmm5 \n"
979 :
980 : "m"(kARGBToUJ), // %0
981 "m"(kARGBToVJ), // %1
982 "m"(kAddUVJ128) // %2
983 );
984 asm volatile (
985 "sub %1,%2 \n"
986 ".p2align 4 \n"
987 "1: \n"
988 "movdqu (%0),%%xmm0 \n"
989 "movdqu 0x10(%0),%%xmm1 \n"
990 "movdqu 0x20(%0),%%xmm2 \n"
991 "movdqu 0x30(%0),%%xmm6 \n"
992 "movdqu (%0,%4,1),%%xmm7 \n"
993 "pavgb %%xmm7,%%xmm0 \n"
994 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
995 "pavgb %%xmm7,%%xmm1 \n"
996 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
997 "pavgb %%xmm7,%%xmm2 \n"
998 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
999 "pavgb %%xmm7,%%xmm6 \n"
1000 "lea 0x40(%0),%0 \n"
1001 "movdqa %%xmm0,%%xmm7 \n"
1002 "shufps $0x88,%%xmm1,%%xmm0 \n"
1003 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1004 "pavgb %%xmm7,%%xmm0 \n"
1005 "movdqa %%xmm2,%%xmm7 \n"
1006 "shufps $0x88,%%xmm6,%%xmm2 \n"
1007 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1008 "pavgb %%xmm7,%%xmm2 \n"
1009 "movdqa %%xmm0,%%xmm1 \n"
1010 "movdqa %%xmm2,%%xmm6 \n"
1011 "pmaddubsw %%xmm4,%%xmm0 \n"
1012 "pmaddubsw %%xmm4,%%xmm2 \n"
1013 "pmaddubsw %%xmm3,%%xmm1 \n"
1014 "pmaddubsw %%xmm3,%%xmm6 \n"
1015 "phaddw %%xmm2,%%xmm0 \n"
1016 "phaddw %%xmm6,%%xmm1 \n"
1017 "paddw %%xmm5,%%xmm0 \n"
1018 "paddw %%xmm5,%%xmm1 \n"
1019 "psraw $0x8,%%xmm0 \n"
1020 "psraw $0x8,%%xmm1 \n"
1021 "packsswb %%xmm1,%%xmm0 \n"
1022 "sub $0x10,%3 \n"
1023 "movlps %%xmm0,(%1) \n"
1024 "movhps %%xmm0,(%1,%2,1) \n"
1025 "lea 0x8(%1),%1 \n"
1026 "jg 1b \n"
1027 : "+r"(src_argb0), // %0
1028 "+r"(dst_u), // %1
1029 "+r"(dst_v), // %2
1030 "+rm"(width) // %3
1031 : "r"(static_cast<intptr_t>(src_stride_argb))
1032 : "memory", "cc"
1033#if defined(__SSE2__)
1034 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1035#endif
1036 );
1037}
1038
fbarchard@google.com762c0502013-02-04 18:47:21 +00001039void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1040 int width) {
1041 asm volatile (
1042 "movdqa %0,%%xmm4 \n"
1043 "movdqa %1,%%xmm3 \n"
1044 "movdqa %2,%%xmm5 \n"
1045 :
1046 : "m"(kARGBToU), // %0
1047 "m"(kARGBToV), // %1
1048 "m"(kAddUV128) // %2
1049 );
1050 asm volatile (
1051 "sub %1,%2 \n"
1052 ".p2align 4 \n"
1053 "1: \n"
1054 "movdqa (%0),%%xmm0 \n"
1055 "movdqa 0x10(%0),%%xmm1 \n"
1056 "movdqa 0x20(%0),%%xmm2 \n"
1057 "movdqa 0x30(%0),%%xmm6 \n"
1058 "pmaddubsw %%xmm4,%%xmm0 \n"
1059 "pmaddubsw %%xmm4,%%xmm1 \n"
1060 "pmaddubsw %%xmm4,%%xmm2 \n"
1061 "pmaddubsw %%xmm4,%%xmm6 \n"
1062 "phaddw %%xmm1,%%xmm0 \n"
1063 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +00001064 "psraw $0x8,%%xmm0 \n"
1065 "psraw $0x8,%%xmm2 \n"
1066 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +00001067 "paddb %%xmm5,%%xmm0 \n"
1068 "sub $0x10,%3 \n"
1069 "movdqa %%xmm0,(%1) \n"
1070 "movdqa (%0),%%xmm0 \n"
1071 "movdqa 0x10(%0),%%xmm1 \n"
1072 "movdqa 0x20(%0),%%xmm2 \n"
1073 "movdqa 0x30(%0),%%xmm6 \n"
1074 "pmaddubsw %%xmm3,%%xmm0 \n"
1075 "pmaddubsw %%xmm3,%%xmm1 \n"
1076 "pmaddubsw %%xmm3,%%xmm2 \n"
1077 "pmaddubsw %%xmm3,%%xmm6 \n"
1078 "phaddw %%xmm1,%%xmm0 \n"
1079 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +00001080 "psraw $0x8,%%xmm0 \n"
1081 "psraw $0x8,%%xmm2 \n"
1082 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +00001083 "paddb %%xmm5,%%xmm0 \n"
1084 "lea 0x40(%0),%0 \n"
1085 "movdqa %%xmm0,(%1,%2,1) \n"
1086 "lea 0x10(%1),%1 \n"
1087 "jg 1b \n"
1088 : "+r"(src_argb), // %0
1089 "+r"(dst_u), // %1
1090 "+r"(dst_v), // %2
1091 "+rm"(width) // %3
1092 :
1093 : "memory", "cc"
1094#if defined(__SSE2__)
1095 , "xmm0", "xmm1", "xmm2", "xmm6"
1096#endif
1097 );
1098}
1099
1100void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
1101 uint8* dst_v, int width) {
1102 asm volatile (
1103 "movdqa %0,%%xmm4 \n"
1104 "movdqa %1,%%xmm3 \n"
1105 "movdqa %2,%%xmm5 \n"
1106 :
1107 : "m"(kARGBToU), // %0
1108 "m"(kARGBToV), // %1
1109 "m"(kAddUV128) // %2
1110 );
1111 asm volatile (
1112 "sub %1,%2 \n"
1113 ".p2align 4 \n"
1114 "1: \n"
1115 "movdqu (%0),%%xmm0 \n"
1116 "movdqu 0x10(%0),%%xmm1 \n"
1117 "movdqu 0x20(%0),%%xmm2 \n"
1118 "movdqu 0x30(%0),%%xmm6 \n"
1119 "pmaddubsw %%xmm4,%%xmm0 \n"
1120 "pmaddubsw %%xmm4,%%xmm1 \n"
1121 "pmaddubsw %%xmm4,%%xmm2 \n"
1122 "pmaddubsw %%xmm4,%%xmm6 \n"
1123 "phaddw %%xmm1,%%xmm0 \n"
1124 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +00001125 "psraw $0x8,%%xmm0 \n"
1126 "psraw $0x8,%%xmm2 \n"
1127 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +00001128 "paddb %%xmm5,%%xmm0 \n"
1129 "sub $0x10,%3 \n"
1130 "movdqu %%xmm0,(%1) \n"
1131 "movdqu (%0),%%xmm0 \n"
1132 "movdqu 0x10(%0),%%xmm1 \n"
1133 "movdqu 0x20(%0),%%xmm2 \n"
1134 "movdqu 0x30(%0),%%xmm6 \n"
1135 "pmaddubsw %%xmm3,%%xmm0 \n"
1136 "pmaddubsw %%xmm3,%%xmm1 \n"
1137 "pmaddubsw %%xmm3,%%xmm2 \n"
1138 "pmaddubsw %%xmm3,%%xmm6 \n"
1139 "phaddw %%xmm1,%%xmm0 \n"
1140 "phaddw %%xmm6,%%xmm2 \n"
fbarchard@google.comd8b73ca2013-02-15 07:49:15 +00001141 "psraw $0x8,%%xmm0 \n"
1142 "psraw $0x8,%%xmm2 \n"
1143 "packsswb %%xmm2,%%xmm0 \n"
fbarchard@google.com762c0502013-02-04 18:47:21 +00001144 "paddb %%xmm5,%%xmm0 \n"
1145 "lea 0x40(%0),%0 \n"
1146 "movdqu %%xmm0,(%1,%2,1) \n"
1147 "lea 0x10(%1),%1 \n"
1148 "jg 1b \n"
1149 : "+r"(src_argb), // %0
1150 "+r"(dst_u), // %1
1151 "+r"(dst_v), // %2
1152 "+rm"(width) // %3
1153 :
1154 : "memory", "cc"
1155#if defined(__SSE2__)
1156 , "xmm0", "xmm1", "xmm2", "xmm6"
1157#endif
1158 );
1159}
1160
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00001161void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1162 uint8* dst_u, uint8* dst_v, int width) {
1163 asm volatile (
1164 "movdqa %0,%%xmm4 \n"
1165 "movdqa %1,%%xmm3 \n"
1166 "movdqa %2,%%xmm5 \n"
1167 :
1168 : "m"(kARGBToU), // %0
1169 "m"(kARGBToV), // %1
1170 "m"(kAddUV128) // %2
1171 );
1172 asm volatile (
1173 "sub %1,%2 \n"
1174 ".p2align 4 \n"
1175 "1: \n"
1176 "movdqa (%0),%%xmm0 \n"
1177 "movdqa 0x10(%0),%%xmm1 \n"
1178 "movdqa 0x20(%0),%%xmm2 \n"
1179 "movdqa 0x30(%0),%%xmm6 \n"
1180 "lea 0x40(%0),%0 \n"
1181 "movdqa %%xmm0,%%xmm7 \n"
1182 "shufps $0x88,%%xmm1,%%xmm0 \n"
1183 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1184 "pavgb %%xmm7,%%xmm0 \n"
1185 "movdqa %%xmm2,%%xmm7 \n"
1186 "shufps $0x88,%%xmm6,%%xmm2 \n"
1187 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1188 "pavgb %%xmm7,%%xmm2 \n"
1189 "movdqa %%xmm0,%%xmm1 \n"
1190 "movdqa %%xmm2,%%xmm6 \n"
1191 "pmaddubsw %%xmm4,%%xmm0 \n"
1192 "pmaddubsw %%xmm4,%%xmm2 \n"
1193 "pmaddubsw %%xmm3,%%xmm1 \n"
1194 "pmaddubsw %%xmm3,%%xmm6 \n"
1195 "phaddw %%xmm2,%%xmm0 \n"
1196 "phaddw %%xmm6,%%xmm1 \n"
1197 "psraw $0x8,%%xmm0 \n"
1198 "psraw $0x8,%%xmm1 \n"
1199 "packsswb %%xmm1,%%xmm0 \n"
1200 "paddb %%xmm5,%%xmm0 \n"
1201 "sub $0x10,%3 \n"
1202 "movlps %%xmm0,(%1) \n"
1203 "movhps %%xmm0,(%1,%2,1) \n"
1204 "lea 0x8(%1),%1 \n"
1205 "jg 1b \n"
1206 : "+r"(src_argb0), // %0
1207 "+r"(dst_u), // %1
1208 "+r"(dst_v), // %2
1209 "+rm"(width) // %3
1210 :
1211 : "memory", "cc"
1212#if defined(__SSE2__)
1213 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1214#endif
1215 );
1216}
1217
1218void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
1219 uint8* dst_u, uint8* dst_v, int width) {
1220 asm volatile (
1221 "movdqa %0,%%xmm4 \n"
1222 "movdqa %1,%%xmm3 \n"
1223 "movdqa %2,%%xmm5 \n"
1224 :
1225 : "m"(kARGBToU), // %0
1226 "m"(kARGBToV), // %1
1227 "m"(kAddUV128) // %2
1228 );
1229 asm volatile (
1230 "sub %1,%2 \n"
1231 ".p2align 4 \n"
1232 "1: \n"
1233 "movdqu (%0),%%xmm0 \n"
1234 "movdqu 0x10(%0),%%xmm1 \n"
1235 "movdqu 0x20(%0),%%xmm2 \n"
1236 "movdqu 0x30(%0),%%xmm6 \n"
1237 "lea 0x40(%0),%0 \n"
1238 "movdqa %%xmm0,%%xmm7 \n"
1239 "shufps $0x88,%%xmm1,%%xmm0 \n"
1240 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1241 "pavgb %%xmm7,%%xmm0 \n"
1242 "movdqa %%xmm2,%%xmm7 \n"
1243 "shufps $0x88,%%xmm6,%%xmm2 \n"
1244 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1245 "pavgb %%xmm7,%%xmm2 \n"
1246 "movdqa %%xmm0,%%xmm1 \n"
1247 "movdqa %%xmm2,%%xmm6 \n"
1248 "pmaddubsw %%xmm4,%%xmm0 \n"
1249 "pmaddubsw %%xmm4,%%xmm2 \n"
1250 "pmaddubsw %%xmm3,%%xmm1 \n"
1251 "pmaddubsw %%xmm3,%%xmm6 \n"
1252 "phaddw %%xmm2,%%xmm0 \n"
1253 "phaddw %%xmm6,%%xmm1 \n"
1254 "psraw $0x8,%%xmm0 \n"
1255 "psraw $0x8,%%xmm1 \n"
1256 "packsswb %%xmm1,%%xmm0 \n"
1257 "paddb %%xmm5,%%xmm0 \n"
1258 "sub $0x10,%3 \n"
1259 "movlps %%xmm0,(%1) \n"
1260 "movhps %%xmm0,(%1,%2,1) \n"
1261 "lea 0x8(%1),%1 \n"
1262 "jg 1b \n"
1263 : "+r"(src_argb0), // %0
1264 "+r"(dst_u), // %1
1265 "+r"(dst_v), // %2
1266 "+rm"(width) // %3
1267 :
1268 : "memory", "cc"
1269#if defined(__SSE2__)
1270 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1271#endif
1272 );
1273}
1274
fbarchard@google.com714050a2012-02-17 22:59:56 +00001275void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001276 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001277 "movdqa %4,%%xmm5 \n"
1278 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001279 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001280 "1: \n"
1281 "movdqa (%0),%%xmm0 \n"
1282 "movdqa 0x10(%0),%%xmm1 \n"
1283 "movdqa 0x20(%0),%%xmm2 \n"
1284 "movdqa 0x30(%0),%%xmm3 \n"
1285 "pmaddubsw %%xmm4,%%xmm0 \n"
1286 "pmaddubsw %%xmm4,%%xmm1 \n"
1287 "pmaddubsw %%xmm4,%%xmm2 \n"
1288 "pmaddubsw %%xmm4,%%xmm3 \n"
1289 "lea 0x40(%0),%0 \n"
1290 "phaddw %%xmm1,%%xmm0 \n"
1291 "phaddw %%xmm3,%%xmm2 \n"
1292 "psrlw $0x7,%%xmm0 \n"
1293 "psrlw $0x7,%%xmm2 \n"
1294 "packuswb %%xmm2,%%xmm0 \n"
1295 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001296 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001297 "movdqa %%xmm0,(%1) \n"
1298 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001299 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001300 : "+r"(src_bgra), // %0
1301 "+r"(dst_y), // %1
1302 "+r"(pix) // %2
1303 : "m"(kBGRAToY), // %3
1304 "m"(kAddY16) // %4
1305 : "memory", "cc"
1306#if defined(__SSE2__)
1307 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +00001308#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +00001309 );
1310}
1311
1312void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001313 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001314 "movdqa %4,%%xmm5 \n"
1315 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001316 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001317 "1: \n"
1318 "movdqu (%0),%%xmm0 \n"
1319 "movdqu 0x10(%0),%%xmm1 \n"
1320 "movdqu 0x20(%0),%%xmm2 \n"
1321 "movdqu 0x30(%0),%%xmm3 \n"
1322 "pmaddubsw %%xmm4,%%xmm0 \n"
1323 "pmaddubsw %%xmm4,%%xmm1 \n"
1324 "pmaddubsw %%xmm4,%%xmm2 \n"
1325 "pmaddubsw %%xmm4,%%xmm3 \n"
1326 "lea 0x40(%0),%0 \n"
1327 "phaddw %%xmm1,%%xmm0 \n"
1328 "phaddw %%xmm3,%%xmm2 \n"
1329 "psrlw $0x7,%%xmm0 \n"
1330 "psrlw $0x7,%%xmm2 \n"
1331 "packuswb %%xmm2,%%xmm0 \n"
1332 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001333 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001334 "movdqu %%xmm0,(%1) \n"
1335 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001336 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001337 : "+r"(src_bgra), // %0
1338 "+r"(dst_y), // %1
1339 "+r"(pix) // %2
1340 : "m"(kBGRAToY), // %3
1341 "m"(kAddY16) // %4
1342 : "memory", "cc"
1343#if defined(__SSE2__)
1344 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1345#endif
1346 );
1347}
1348
1349void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1350 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001351 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001352 "movdqa %0,%%xmm4 \n"
1353 "movdqa %1,%%xmm3 \n"
1354 "movdqa %2,%%xmm5 \n"
1355 :
1356 : "m"(kBGRAToU), // %0
1357 "m"(kBGRAToV), // %1
1358 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001359 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001360 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001361 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001362 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001363 "1: \n"
1364 "movdqa (%0),%%xmm0 \n"
1365 "movdqa 0x10(%0),%%xmm1 \n"
1366 "movdqa 0x20(%0),%%xmm2 \n"
1367 "movdqa 0x30(%0),%%xmm6 \n"
1368 "pavgb (%0,%4,1),%%xmm0 \n"
1369 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1370 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1371 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1372 "lea 0x40(%0),%0 \n"
1373 "movdqa %%xmm0,%%xmm7 \n"
1374 "shufps $0x88,%%xmm1,%%xmm0 \n"
1375 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1376 "pavgb %%xmm7,%%xmm0 \n"
1377 "movdqa %%xmm2,%%xmm7 \n"
1378 "shufps $0x88,%%xmm6,%%xmm2 \n"
1379 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1380 "pavgb %%xmm7,%%xmm2 \n"
1381 "movdqa %%xmm0,%%xmm1 \n"
1382 "movdqa %%xmm2,%%xmm6 \n"
1383 "pmaddubsw %%xmm4,%%xmm0 \n"
1384 "pmaddubsw %%xmm4,%%xmm2 \n"
1385 "pmaddubsw %%xmm3,%%xmm1 \n"
1386 "pmaddubsw %%xmm3,%%xmm6 \n"
1387 "phaddw %%xmm2,%%xmm0 \n"
1388 "phaddw %%xmm6,%%xmm1 \n"
1389 "psraw $0x8,%%xmm0 \n"
1390 "psraw $0x8,%%xmm1 \n"
1391 "packsswb %%xmm1,%%xmm0 \n"
1392 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001393 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001394 "movlps %%xmm0,(%1) \n"
1395 "movhps %%xmm0,(%1,%2,1) \n"
1396 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001397 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001398 : "+r"(src_bgra0), // %0
1399 "+r"(dst_u), // %1
1400 "+r"(dst_v), // %2
1401 "+rm"(width) // %3
1402 : "r"(static_cast<intptr_t>(src_stride_bgra))
1403 : "memory", "cc"
1404#if defined(__SSE2__)
1405 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1406#endif
1407 );
1408}
1409
1410void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1411 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001412 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001413 "movdqa %0,%%xmm4 \n"
1414 "movdqa %1,%%xmm3 \n"
1415 "movdqa %2,%%xmm5 \n"
1416 :
1417 : "m"(kBGRAToU), // %0
1418 "m"(kBGRAToV), // %1
1419 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001420 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001421 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001422 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001423 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001424 "1: \n"
1425 "movdqu (%0),%%xmm0 \n"
1426 "movdqu 0x10(%0),%%xmm1 \n"
1427 "movdqu 0x20(%0),%%xmm2 \n"
1428 "movdqu 0x30(%0),%%xmm6 \n"
1429 "movdqu (%0,%4,1),%%xmm7 \n"
1430 "pavgb %%xmm7,%%xmm0 \n"
1431 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1432 "pavgb %%xmm7,%%xmm1 \n"
1433 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1434 "pavgb %%xmm7,%%xmm2 \n"
1435 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1436 "pavgb %%xmm7,%%xmm6 \n"
1437 "lea 0x40(%0),%0 \n"
1438 "movdqa %%xmm0,%%xmm7 \n"
1439 "shufps $0x88,%%xmm1,%%xmm0 \n"
1440 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1441 "pavgb %%xmm7,%%xmm0 \n"
1442 "movdqa %%xmm2,%%xmm7 \n"
1443 "shufps $0x88,%%xmm6,%%xmm2 \n"
1444 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1445 "pavgb %%xmm7,%%xmm2 \n"
1446 "movdqa %%xmm0,%%xmm1 \n"
1447 "movdqa %%xmm2,%%xmm6 \n"
1448 "pmaddubsw %%xmm4,%%xmm0 \n"
1449 "pmaddubsw %%xmm4,%%xmm2 \n"
1450 "pmaddubsw %%xmm3,%%xmm1 \n"
1451 "pmaddubsw %%xmm3,%%xmm6 \n"
1452 "phaddw %%xmm2,%%xmm0 \n"
1453 "phaddw %%xmm6,%%xmm1 \n"
1454 "psraw $0x8,%%xmm0 \n"
1455 "psraw $0x8,%%xmm1 \n"
1456 "packsswb %%xmm1,%%xmm0 \n"
1457 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001458 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001459 "movlps %%xmm0,(%1) \n"
1460 "movhps %%xmm0,(%1,%2,1) \n"
1461 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001462 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001463 : "+r"(src_bgra0), // %0
1464 "+r"(dst_u), // %1
1465 "+r"(dst_v), // %2
1466 "+rm"(width) // %3
1467 : "r"(static_cast<intptr_t>(src_stride_bgra))
1468 : "memory", "cc"
1469#if defined(__SSE2__)
1470 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1471#endif
1472 );
1473}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001474
1475void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001476 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001477 "movdqa %4,%%xmm5 \n"
1478 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001479 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001480 "1: \n"
1481 "movdqa (%0),%%xmm0 \n"
1482 "movdqa 0x10(%0),%%xmm1 \n"
1483 "movdqa 0x20(%0),%%xmm2 \n"
1484 "movdqa 0x30(%0),%%xmm3 \n"
1485 "pmaddubsw %%xmm4,%%xmm0 \n"
1486 "pmaddubsw %%xmm4,%%xmm1 \n"
1487 "pmaddubsw %%xmm4,%%xmm2 \n"
1488 "pmaddubsw %%xmm4,%%xmm3 \n"
1489 "lea 0x40(%0),%0 \n"
1490 "phaddw %%xmm1,%%xmm0 \n"
1491 "phaddw %%xmm3,%%xmm2 \n"
1492 "psrlw $0x7,%%xmm0 \n"
1493 "psrlw $0x7,%%xmm2 \n"
1494 "packuswb %%xmm2,%%xmm0 \n"
1495 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001496 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001497 "movdqa %%xmm0,(%1) \n"
1498 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001499 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001500 : "+r"(src_abgr), // %0
1501 "+r"(dst_y), // %1
1502 "+r"(pix) // %2
1503 : "m"(kABGRToY), // %3
1504 "m"(kAddY16) // %4
1505 : "memory", "cc"
1506#if defined(__SSE2__)
1507 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1508#endif
1509 );
1510}
1511
1512void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001513 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001514 "movdqa %4,%%xmm5 \n"
1515 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001516 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001517 "1: \n"
1518 "movdqu (%0),%%xmm0 \n"
1519 "movdqu 0x10(%0),%%xmm1 \n"
1520 "movdqu 0x20(%0),%%xmm2 \n"
1521 "movdqu 0x30(%0),%%xmm3 \n"
1522 "pmaddubsw %%xmm4,%%xmm0 \n"
1523 "pmaddubsw %%xmm4,%%xmm1 \n"
1524 "pmaddubsw %%xmm4,%%xmm2 \n"
1525 "pmaddubsw %%xmm4,%%xmm3 \n"
1526 "lea 0x40(%0),%0 \n"
1527 "phaddw %%xmm1,%%xmm0 \n"
1528 "phaddw %%xmm3,%%xmm2 \n"
1529 "psrlw $0x7,%%xmm0 \n"
1530 "psrlw $0x7,%%xmm2 \n"
1531 "packuswb %%xmm2,%%xmm0 \n"
1532 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001533 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001534 "movdqu %%xmm0,(%1) \n"
1535 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001536 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001537 : "+r"(src_abgr), // %0
1538 "+r"(dst_y), // %1
1539 "+r"(pix) // %2
1540 : "m"(kABGRToY), // %3
1541 "m"(kAddY16) // %4
1542 : "memory", "cc"
1543#if defined(__SSE2__)
1544 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1545#endif
1546 );
1547}
1548
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001549void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1550 asm volatile (
1551 "movdqa %4,%%xmm5 \n"
1552 "movdqa %3,%%xmm4 \n"
1553 ".p2align 4 \n"
1554 "1: \n"
1555 "movdqa (%0),%%xmm0 \n"
1556 "movdqa 0x10(%0),%%xmm1 \n"
1557 "movdqa 0x20(%0),%%xmm2 \n"
1558 "movdqa 0x30(%0),%%xmm3 \n"
1559 "pmaddubsw %%xmm4,%%xmm0 \n"
1560 "pmaddubsw %%xmm4,%%xmm1 \n"
1561 "pmaddubsw %%xmm4,%%xmm2 \n"
1562 "pmaddubsw %%xmm4,%%xmm3 \n"
1563 "lea 0x40(%0),%0 \n"
1564 "phaddw %%xmm1,%%xmm0 \n"
1565 "phaddw %%xmm3,%%xmm2 \n"
1566 "psrlw $0x7,%%xmm0 \n"
1567 "psrlw $0x7,%%xmm2 \n"
1568 "packuswb %%xmm2,%%xmm0 \n"
1569 "paddb %%xmm5,%%xmm0 \n"
1570 "sub $0x10,%2 \n"
1571 "movdqa %%xmm0,(%1) \n"
1572 "lea 0x10(%1),%1 \n"
1573 "jg 1b \n"
1574 : "+r"(src_rgba), // %0
1575 "+r"(dst_y), // %1
1576 "+r"(pix) // %2
1577 : "m"(kRGBAToY), // %3
1578 "m"(kAddY16) // %4
1579 : "memory", "cc"
1580#if defined(__SSE2__)
1581 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1582#endif
1583 );
1584}
1585
1586void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1587 asm volatile (
1588 "movdqa %4,%%xmm5 \n"
1589 "movdqa %3,%%xmm4 \n"
1590 ".p2align 4 \n"
1591 "1: \n"
1592 "movdqu (%0),%%xmm0 \n"
1593 "movdqu 0x10(%0),%%xmm1 \n"
1594 "movdqu 0x20(%0),%%xmm2 \n"
1595 "movdqu 0x30(%0),%%xmm3 \n"
1596 "pmaddubsw %%xmm4,%%xmm0 \n"
1597 "pmaddubsw %%xmm4,%%xmm1 \n"
1598 "pmaddubsw %%xmm4,%%xmm2 \n"
1599 "pmaddubsw %%xmm4,%%xmm3 \n"
1600 "lea 0x40(%0),%0 \n"
1601 "phaddw %%xmm1,%%xmm0 \n"
1602 "phaddw %%xmm3,%%xmm2 \n"
1603 "psrlw $0x7,%%xmm0 \n"
1604 "psrlw $0x7,%%xmm2 \n"
1605 "packuswb %%xmm2,%%xmm0 \n"
1606 "paddb %%xmm5,%%xmm0 \n"
1607 "sub $0x10,%2 \n"
1608 "movdqu %%xmm0,(%1) \n"
1609 "lea 0x10(%1),%1 \n"
1610 "jg 1b \n"
1611 : "+r"(src_rgba), // %0
1612 "+r"(dst_y), // %1
1613 "+r"(pix) // %2
1614 : "m"(kRGBAToY), // %3
1615 "m"(kAddY16) // %4
1616 : "memory", "cc"
1617#if defined(__SSE2__)
1618 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1619#endif
1620 );
1621}
1622
fbarchard@google.com714050a2012-02-17 22:59:56 +00001623void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1624 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001625 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001626 "movdqa %0,%%xmm4 \n"
1627 "movdqa %1,%%xmm3 \n"
1628 "movdqa %2,%%xmm5 \n"
1629 :
1630 : "m"(kABGRToU), // %0
1631 "m"(kABGRToV), // %1
1632 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001633 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001634 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001635 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001636 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001637 "1: \n"
1638 "movdqa (%0),%%xmm0 \n"
1639 "movdqa 0x10(%0),%%xmm1 \n"
1640 "movdqa 0x20(%0),%%xmm2 \n"
1641 "movdqa 0x30(%0),%%xmm6 \n"
1642 "pavgb (%0,%4,1),%%xmm0 \n"
1643 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1644 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1645 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1646 "lea 0x40(%0),%0 \n"
1647 "movdqa %%xmm0,%%xmm7 \n"
1648 "shufps $0x88,%%xmm1,%%xmm0 \n"
1649 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1650 "pavgb %%xmm7,%%xmm0 \n"
1651 "movdqa %%xmm2,%%xmm7 \n"
1652 "shufps $0x88,%%xmm6,%%xmm2 \n"
1653 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1654 "pavgb %%xmm7,%%xmm2 \n"
1655 "movdqa %%xmm0,%%xmm1 \n"
1656 "movdqa %%xmm2,%%xmm6 \n"
1657 "pmaddubsw %%xmm4,%%xmm0 \n"
1658 "pmaddubsw %%xmm4,%%xmm2 \n"
1659 "pmaddubsw %%xmm3,%%xmm1 \n"
1660 "pmaddubsw %%xmm3,%%xmm6 \n"
1661 "phaddw %%xmm2,%%xmm0 \n"
1662 "phaddw %%xmm6,%%xmm1 \n"
1663 "psraw $0x8,%%xmm0 \n"
1664 "psraw $0x8,%%xmm1 \n"
1665 "packsswb %%xmm1,%%xmm0 \n"
1666 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001667 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001668 "movlps %%xmm0,(%1) \n"
1669 "movhps %%xmm0,(%1,%2,1) \n"
1670 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001671 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001672 : "+r"(src_abgr0), // %0
1673 "+r"(dst_u), // %1
1674 "+r"(dst_v), // %2
1675 "+rm"(width) // %3
1676 : "r"(static_cast<intptr_t>(src_stride_abgr))
1677 : "memory", "cc"
1678#if defined(__SSE2__)
1679 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1680#endif
1681 );
1682}
1683
1684void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1685 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001686 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001687 "movdqa %0,%%xmm4 \n"
1688 "movdqa %1,%%xmm3 \n"
1689 "movdqa %2,%%xmm5 \n"
1690 :
1691 : "m"(kABGRToU), // %0
1692 "m"(kABGRToV), // %1
1693 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001694 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001695 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001696 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001697 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001698 "1: \n"
1699 "movdqu (%0),%%xmm0 \n"
1700 "movdqu 0x10(%0),%%xmm1 \n"
1701 "movdqu 0x20(%0),%%xmm2 \n"
1702 "movdqu 0x30(%0),%%xmm6 \n"
1703 "movdqu (%0,%4,1),%%xmm7 \n"
1704 "pavgb %%xmm7,%%xmm0 \n"
1705 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1706 "pavgb %%xmm7,%%xmm1 \n"
1707 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1708 "pavgb %%xmm7,%%xmm2 \n"
1709 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1710 "pavgb %%xmm7,%%xmm6 \n"
1711 "lea 0x40(%0),%0 \n"
1712 "movdqa %%xmm0,%%xmm7 \n"
1713 "shufps $0x88,%%xmm1,%%xmm0 \n"
1714 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1715 "pavgb %%xmm7,%%xmm0 \n"
1716 "movdqa %%xmm2,%%xmm7 \n"
1717 "shufps $0x88,%%xmm6,%%xmm2 \n"
1718 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1719 "pavgb %%xmm7,%%xmm2 \n"
1720 "movdqa %%xmm0,%%xmm1 \n"
1721 "movdqa %%xmm2,%%xmm6 \n"
1722 "pmaddubsw %%xmm4,%%xmm0 \n"
1723 "pmaddubsw %%xmm4,%%xmm2 \n"
1724 "pmaddubsw %%xmm3,%%xmm1 \n"
1725 "pmaddubsw %%xmm3,%%xmm6 \n"
1726 "phaddw %%xmm2,%%xmm0 \n"
1727 "phaddw %%xmm6,%%xmm1 \n"
1728 "psraw $0x8,%%xmm0 \n"
1729 "psraw $0x8,%%xmm1 \n"
1730 "packsswb %%xmm1,%%xmm0 \n"
1731 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001732 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001733 "movlps %%xmm0,(%1) \n"
1734 "movhps %%xmm0,(%1,%2,1) \n"
1735 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001736 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001737 : "+r"(src_abgr0), // %0
1738 "+r"(dst_u), // %1
1739 "+r"(dst_v), // %2
1740 "+rm"(width) // %3
1741 : "r"(static_cast<intptr_t>(src_stride_abgr))
1742 : "memory", "cc"
1743#if defined(__SSE2__)
1744 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1745#endif
1746 );
1747}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001748
1749void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1750 uint8* dst_u, uint8* dst_v, int width) {
1751 asm volatile (
1752 "movdqa %0,%%xmm4 \n"
1753 "movdqa %1,%%xmm3 \n"
1754 "movdqa %2,%%xmm5 \n"
1755 :
1756 : "m"(kRGBAToU), // %0
1757 "m"(kRGBAToV), // %1
1758 "m"(kAddUV128) // %2
1759 );
1760 asm volatile (
1761 "sub %1,%2 \n"
1762 ".p2align 4 \n"
1763 "1: \n"
1764 "movdqa (%0),%%xmm0 \n"
1765 "movdqa 0x10(%0),%%xmm1 \n"
1766 "movdqa 0x20(%0),%%xmm2 \n"
1767 "movdqa 0x30(%0),%%xmm6 \n"
1768 "pavgb (%0,%4,1),%%xmm0 \n"
1769 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1770 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1771 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1772 "lea 0x40(%0),%0 \n"
1773 "movdqa %%xmm0,%%xmm7 \n"
1774 "shufps $0x88,%%xmm1,%%xmm0 \n"
1775 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1776 "pavgb %%xmm7,%%xmm0 \n"
1777 "movdqa %%xmm2,%%xmm7 \n"
1778 "shufps $0x88,%%xmm6,%%xmm2 \n"
1779 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1780 "pavgb %%xmm7,%%xmm2 \n"
1781 "movdqa %%xmm0,%%xmm1 \n"
1782 "movdqa %%xmm2,%%xmm6 \n"
1783 "pmaddubsw %%xmm4,%%xmm0 \n"
1784 "pmaddubsw %%xmm4,%%xmm2 \n"
1785 "pmaddubsw %%xmm3,%%xmm1 \n"
1786 "pmaddubsw %%xmm3,%%xmm6 \n"
1787 "phaddw %%xmm2,%%xmm0 \n"
1788 "phaddw %%xmm6,%%xmm1 \n"
1789 "psraw $0x8,%%xmm0 \n"
1790 "psraw $0x8,%%xmm1 \n"
1791 "packsswb %%xmm1,%%xmm0 \n"
1792 "paddb %%xmm5,%%xmm0 \n"
1793 "sub $0x10,%3 \n"
1794 "movlps %%xmm0,(%1) \n"
1795 "movhps %%xmm0,(%1,%2,1) \n"
1796 "lea 0x8(%1),%1 \n"
1797 "jg 1b \n"
1798 : "+r"(src_rgba0), // %0
1799 "+r"(dst_u), // %1
1800 "+r"(dst_v), // %2
1801 "+rm"(width) // %3
1802 : "r"(static_cast<intptr_t>(src_stride_rgba))
1803 : "memory", "cc"
1804#if defined(__SSE2__)
1805 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1806#endif
1807 );
1808}
1809
1810void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1811 uint8* dst_u, uint8* dst_v, int width) {
1812 asm volatile (
1813 "movdqa %0,%%xmm4 \n"
1814 "movdqa %1,%%xmm3 \n"
1815 "movdqa %2,%%xmm5 \n"
1816 :
1817 : "m"(kRGBAToU), // %0
1818 "m"(kRGBAToV), // %1
1819 "m"(kAddUV128) // %2
1820 );
1821 asm volatile (
1822 "sub %1,%2 \n"
1823 ".p2align 4 \n"
1824 "1: \n"
1825 "movdqu (%0),%%xmm0 \n"
1826 "movdqu 0x10(%0),%%xmm1 \n"
1827 "movdqu 0x20(%0),%%xmm2 \n"
1828 "movdqu 0x30(%0),%%xmm6 \n"
1829 "movdqu (%0,%4,1),%%xmm7 \n"
1830 "pavgb %%xmm7,%%xmm0 \n"
1831 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1832 "pavgb %%xmm7,%%xmm1 \n"
1833 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1834 "pavgb %%xmm7,%%xmm2 \n"
1835 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1836 "pavgb %%xmm7,%%xmm6 \n"
1837 "lea 0x40(%0),%0 \n"
1838 "movdqa %%xmm0,%%xmm7 \n"
1839 "shufps $0x88,%%xmm1,%%xmm0 \n"
1840 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1841 "pavgb %%xmm7,%%xmm0 \n"
1842 "movdqa %%xmm2,%%xmm7 \n"
1843 "shufps $0x88,%%xmm6,%%xmm2 \n"
1844 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1845 "pavgb %%xmm7,%%xmm2 \n"
1846 "movdqa %%xmm0,%%xmm1 \n"
1847 "movdqa %%xmm2,%%xmm6 \n"
1848 "pmaddubsw %%xmm4,%%xmm0 \n"
1849 "pmaddubsw %%xmm4,%%xmm2 \n"
1850 "pmaddubsw %%xmm3,%%xmm1 \n"
1851 "pmaddubsw %%xmm3,%%xmm6 \n"
1852 "phaddw %%xmm2,%%xmm0 \n"
1853 "phaddw %%xmm6,%%xmm1 \n"
1854 "psraw $0x8,%%xmm0 \n"
1855 "psraw $0x8,%%xmm1 \n"
1856 "packsswb %%xmm1,%%xmm0 \n"
1857 "paddb %%xmm5,%%xmm0 \n"
1858 "sub $0x10,%3 \n"
1859 "movlps %%xmm0,(%1) \n"
1860 "movhps %%xmm0,(%1,%2,1) \n"
1861 "lea 0x8(%1),%1 \n"
1862 "jg 1b \n"
1863 : "+r"(src_rgba0), // %0
1864 "+r"(dst_u), // %1
1865 "+r"(dst_v), // %2
1866 "+rm"(width) // %3
1867 : "r"(static_cast<intptr_t>(src_stride_rgba))
1868 : "memory", "cc"
1869#if defined(__SSE2__)
1870 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1871#endif
1872 );
1873}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001874#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001875
fbarchard@google.come214fe32012-06-04 23:47:11 +00001876#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients in fixed point, scaled by 64 (6 fractional bits).
// Every value has to fit in a signed 8-bit lane of the coefficient vectors.
// All macro bodies are parenthesized so they expand safely inside larger
// expressions.
#define UB 127 /* 2.018 * 64 is about 129; clamped to the int8 maximum */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the contribution of U = V = 128 for each channel.  Subtracting it
// after the multiply is equivalent to centering U and V on zero first.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001891
// Constant table used by the YUVTORGB and YVUTORGB asm macros.  Those macros
// address the fields by the hard-coded byte offsets noted on each line, so
// the field order and sizes must stay in sync with them.
struct {
  vec8 kUVToB;  // 0    U,V coefficient pairs for blue
  vec8 kUVToG;  // 16   U,V coefficient pairs for green
  vec8 kUVToR;  // 32   U,V coefficient pairs for red
  vec16 kUVBiasB;  // 48   bias subtracted from the blue UV product
  vec16 kUVBiasG;  // 64   bias subtracted from the green UV product
  vec16 kUVBiasR;  // 80   bias subtracted from the red UV product
  vec16 kYSub16;  // 96   value subtracted from Y before scaling
  vec16 kYToRgb;  // 112  Y scale factor
  vec8 kVUToB;  // 128  same as kUVToB with each pair swapped to V,U order
  vec8 kVUToG;  // 144  same as kUVToG with each pair swapped to V,U order
  vec8 kVUToR;  // 160  same as kUVToR with each pair swapped to V,U order
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG },
  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
};
1917
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001918
// The READ* macros below are asm fragments that leave 8 interleaved UV byte
// pairs in xmm0 (xmm1 is used as scratch) and advance the chroma pointer.
// For the planar variants, %[v_buf] must already hold (v_buf - u_buf) so that
// (%[u_buf],%[v_buf],1) addresses the V plane.
// The last line of each macro has no line continuation, so the macro does not
// depend on being followed by a blank line.

// Read 8 UV from 444
#define READYUV444 \
    "movq (%[u_buf]),%%xmm0 \n" \
    "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x8(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x4(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Read 2 UV from 411, upsample to 8 UV
// NOTE(review): movd loads 4 bytes from each plane although only 2 are used
// and the pointer advances by 2, so the final iteration reads 2 bytes past
// the consumed chroma data - confirm callers' buffers tolerate this.
#define READYUV411 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x2(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
    "movq (%[uv_buf]),%%xmm0 \n" \
    "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001948
// Convert 8 pixels: 8 UV and 8 Y
//
// Inputs:  xmm0 = 8 interleaved U,V byte pairs, xmm4 = zero,
//          %[y_buf] = pointer to 8 Y bytes (advanced by 8),
//          %[kYuvConstants] = address of the kYuvConstants table.
// Outputs: xmm0 = blue, xmm1 = green, xmm2 = red, each as 8 unsigned bytes
//          (duplicated into both halves of the register by packuswb).
// Scratch: xmm3.
// The numeric displacements are byte offsets into kYuvConstants.
// The last line has no line continuation, so the macro does not depend on
// being followed by a blank line.
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
1973
// Convert 8 pixels: 8 VU and 8 Y
//
// Same computation as YUVTORGB, but xmm0 holds V,U byte pairs, so the
// multiplies use the swapped coefficient vectors at byte offsets 128, 144 and
// 160 of kYuvConstants.
// Inputs:  xmm0 = 8 interleaved V,U byte pairs, xmm4 = zero,
//          %[y_buf] = pointer to 8 Y bytes (advanced by 8),
//          %[kYuvConstants] = address of the kYuvConstants table.
// Outputs: xmm0 = blue, xmm1 = green, xmm2 = red, each as 8 unsigned bytes
//          (duplicated into both halves of the register by packuswb).
// Scratch: xmm3.
// The last line has no line continuation, so the macro does not depend on
// being followed by a blank line.
#define YVUTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001998
// Converts one row of I444 (one U and one V sample per Y sample) to 32-bit
// pixels stored in memory as B, G, R, A bytes, with A set to 0xff.
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqa, so dst_argb must be 16-byte aligned.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // Turn v_buf into an offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YUVTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV444
    YUVTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2034
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002035void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
2036 const uint8* u_buf,
2037 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002038 uint8* dst_rgb24,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002039 int width) {
2040// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.come24ba5c2013-04-23 01:06:04 +00002041#if defined(__i386__)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002042 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002043 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2044 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
2045 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2046 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002047#endif
2048
2049 asm volatile (
fbarchard@google.come24ba5c2013-04-23 01:06:04 +00002050#if !defined(__i386__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002051 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2052 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002053#endif
2054 "sub %[u_buf],%[v_buf] \n"
2055 "pxor %%xmm4,%%xmm4 \n"
2056 ".p2align 4 \n"
2057 "1: \n"
2058 READYUV422
2059 YUVTORGB
2060 "punpcklbw %%xmm1,%%xmm0 \n"
2061 "punpcklbw %%xmm2,%%xmm2 \n"
2062 "movdqa %%xmm0,%%xmm1 \n"
2063 "punpcklwd %%xmm2,%%xmm0 \n"
2064 "punpckhwd %%xmm2,%%xmm1 \n"
2065 "pshufb %%xmm5,%%xmm0 \n"
2066 "pshufb %%xmm6,%%xmm1 \n"
2067 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002068 "movq %%xmm0,(%[dst_rgb24]) \n"
2069 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
2070 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002071 "sub $0x8,%[width] \n"
2072 "jg 1b \n"
2073 : [y_buf]"+r"(y_buf), // %[y_buf]
2074 [u_buf]"+r"(u_buf), // %[u_buf]
2075 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002076 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002077 [width]"+rm"(width) // %[width]
2078 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
fbarchard@google.come24ba5c2013-04-23 01:06:04 +00002079#if !defined(__i386__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002080 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2081 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002082#endif
2083 : "memory", "cc"
2084#if defined(__SSE2__)
2085 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2086#endif
2087 );
2088}
2089
2090void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
2091 const uint8* u_buf,
2092 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002093 uint8* dst_raw,
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002094 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002095// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
fbarchard@google.come24ba5c2013-04-23 01:06:04 +00002096#if defined(__i386__)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002097 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002098 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
2099 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
2100 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
2101 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002102#endif
2103
2104 asm volatile (
fbarchard@google.come24ba5c2013-04-23 01:06:04 +00002105#if !defined(__i386__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002106 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
2107 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002108#endif
2109 "sub %[u_buf],%[v_buf] \n"
2110 "pxor %%xmm4,%%xmm4 \n"
2111 ".p2align 4 \n"
2112 "1: \n"
2113 READYUV422
2114 YUVTORGB
2115 "punpcklbw %%xmm1,%%xmm0 \n"
2116 "punpcklbw %%xmm2,%%xmm2 \n"
2117 "movdqa %%xmm0,%%xmm1 \n"
2118 "punpcklwd %%xmm2,%%xmm0 \n"
2119 "punpckhwd %%xmm2,%%xmm1 \n"
2120 "pshufb %%xmm5,%%xmm0 \n"
2121 "pshufb %%xmm6,%%xmm1 \n"
2122 "palignr $0xc,%%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002123 "movq %%xmm0,(%[dst_raw]) \n"
2124 "movdqu %%xmm1,0x8(%[dst_raw]) \n"
2125 "lea 0x18(%[dst_raw]),%[dst_raw] \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002126 "sub $0x8,%[width] \n"
2127 "jg 1b \n"
2128 : [y_buf]"+r"(y_buf), // %[y_buf]
2129 [u_buf]"+r"(u_buf), // %[u_buf]
2130 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002131 [dst_raw]"+r"(dst_raw), // %[dst_raw]
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002132 [width]"+rm"(width) // %[width]
2133 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
fbarchard@google.come24ba5c2013-04-23 01:06:04 +00002134#if !defined(__i386__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002135 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
2136 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002137#endif
2138 : "memory", "cc"
2139#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00002140 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00002141#endif
2142 );
2143}
2144
// Converts one row of I422 (one U and one V sample per two Y samples) to
// 32-bit pixels stored in memory as B, G, R, A bytes, with A set to 0xff.
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqa, so dst_argb must be 16-byte aligned.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // Turn v_buf into an offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YUVTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV422
    YUVTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2180
// Converts one row of I411 (one U and one V sample per four Y samples) to
// 32-bit pixels stored in memory as B, G, R, A bytes, with A set to 0xff.
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqa, so dst_argb must be 16-byte aligned.
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // Turn v_buf into an offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YUVTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV411
    YUVTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2216
// Converts one row of NV12 (a Y plane plus one interleaved chroma plane with
// one byte pair per two Y samples) to 32-bit pixels stored in memory as
// B, G, R, A bytes, with A set to 0xff.  The chroma pairs are treated as
// U,V order (YUVTORGB).
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqa, so dst_argb must be 16-byte aligned.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YUVTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READNV12
    YUVTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2249
// Converts one row of NV21 (a Y plane plus one interleaved chroma plane with
// one byte pair per two Y samples) to 32-bit pixels stored in memory as
// B, G, R, A bytes, with A set to 0xff.  The chroma pairs are treated as
// V,U order: the bytes are loaded exactly as for NV12, and YVUTORGB applies
// the swapped coefficient tables.
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqa, so dst_argb must be 16-byte aligned.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YVUTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READNV12
    YVUTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[dst_argb]) \n"
    "movdqa %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2282
// Converts one row of I444 (one U and one V sample per Y sample) to 32-bit
// pixels stored in memory as B, G, R, A bytes, with A set to 0xff.
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqu, so dst_argb may have any alignment.
void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // Turn v_buf into an offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YUVTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV444
    YUVTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2318
// Converts one row of I422 (one U and one V sample per two Y samples) to
// 32-bit pixels stored in memory as B, G, R, A bytes, with A set to 0xff.
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqu, so dst_argb may have any alignment.
void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // Turn v_buf into an offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YUVTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV422
    YUVTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2354
// Converts one row of I411 (one U and one V sample per four Y samples) to
// 32-bit pixels stored in memory as B, G, R, A bytes, with A set to 0xff.
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqu, so dst_argb may have any alignment.
void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // Turn v_buf into an offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YUVTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV411
    YUVTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2390
// Converts one row of NV12 (a Y plane plus one interleaved chroma plane with
// one byte pair per two Y samples) to 32-bit pixels stored in memory as
// B, G, R, A bytes, with A set to 0xff.  The chroma pairs are treated as
// U,V order (YUVTORGB).
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqu, so dst_argb may have any alignment.
void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YUVTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READNV12
    YUVTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2423
// Converts one row of NV21 (a Y plane plus one interleaved chroma plane with
// one byte pair per two Y samples) to 32-bit pixels stored in memory as
// B, G, R, A bytes, with A set to 0xff.  The chroma pairs are treated as
// V,U order: the bytes are loaded exactly as for NV12, and YVUTORGB applies
// the swapped coefficient tables.
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqu, so dst_argb may have any alignment.
void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // xmm5 = all ones (the alpha bytes); xmm4 = zero (YVUTORGB widens Y with it).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READNV12
    YVUTORGB
    // Interleave B with G and R with A, then weave the pairs into pixels.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%[dst_argb]) \n"
    "movdqu %%xmm1,0x10(%[dst_argb]) \n"
    "lea 0x20(%[dst_argb]),%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2456
// Converts one row of I422 (one U and one V sample per two Y samples) to
// 32-bit pixels stored in memory as A, R, G, B bytes, with A set to 0xff.
//
// Eight pixels (32 output bytes) are produced per loop iteration.  width is
// decremented by 8 and the loop repeats while the result is still positive,
// so the body always runs at least once and a width that is not a multiple
// of 8 is processed as if it were rounded up.
// The stores use movdqa, so dst_bgra must be 16-byte aligned.
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_bgra,
                                int width) {
  asm volatile (
    // Turn v_buf into an offset from u_buf so one pointer walks both planes.
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    // xmm4 = zero (YUVTORGB widens Y with it).
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
  "1: \n"
    READYUV422
    YUVTORGB
    // xmm5 is overwritten by the interleave below, so the all-ones alpha
    // value is regenerated on every iteration.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    // Interleave G with B and A with R, then weave the pairs into pixels.
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5,(%[dst_bgra]) \n"
    "movdqa %%xmm0,0x10(%[dst_bgra]) \n"
    "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2493
fbarchard@google.come214fe32012-06-04 23:47:11 +00002494void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002495 const uint8* u_buf,
2496 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002497 uint8* dst_abgr,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002498 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002499 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002500 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002501 "pcmpeqb %%xmm5,%%xmm5 \n"
2502 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002503 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002504 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002505 READYUV422
2506 YUVTORGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002507 "punpcklbw %%xmm1,%%xmm2 \n"
2508 "punpcklbw %%xmm5,%%xmm0 \n"
2509 "movdqa %%xmm2,%%xmm1 \n"
2510 "punpcklwd %%xmm0,%%xmm2 \n"
2511 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002512 "movdqa %%xmm2,(%[dst_abgr]) \n"
2513 "movdqa %%xmm1,0x10(%[dst_abgr]) \n"
2514 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002515 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002516 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002517 : [y_buf]"+r"(y_buf), // %[y_buf]
2518 [u_buf]"+r"(u_buf), // %[u_buf]
2519 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002520 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002521 [width]"+rm"(width) // %[width]
2522 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002523 : "memory", "cc"
2524#if defined(__SSE2__)
2525 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2526#endif
2527 );
2528}
2529
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002530void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2531 const uint8* u_buf,
2532 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002533 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002534 int width) {
2535 asm volatile (
2536 "sub %[u_buf],%[v_buf] \n"
2537 "pcmpeqb %%xmm5,%%xmm5 \n"
2538 "pxor %%xmm4,%%xmm4 \n"
2539 ".p2align 4 \n"
2540 "1: \n"
2541 READYUV422
2542 YUVTORGB
2543 "pcmpeqb %%xmm5,%%xmm5 \n"
2544 "punpcklbw %%xmm2,%%xmm1 \n"
2545 "punpcklbw %%xmm0,%%xmm5 \n"
2546 "movdqa %%xmm5,%%xmm0 \n"
2547 "punpcklwd %%xmm1,%%xmm5 \n"
2548 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002549 "movdqa %%xmm5,(%[dst_rgba]) \n"
2550 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2551 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002552 "sub $0x8,%[width] \n"
2553 "jg 1b \n"
2554 : [y_buf]"+r"(y_buf), // %[y_buf]
2555 [u_buf]"+r"(u_buf), // %[u_buf]
2556 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002557 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002558 [width]"+rm"(width) // %[width]
2559 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2560 : "memory", "cc"
2561#if defined(__SSE2__)
2562 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2563#endif
2564 );
2565}
2566
fbarchard@google.come214fe32012-06-04 23:47:11 +00002567void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002568 const uint8* u_buf,
2569 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002570 uint8* dst_bgra,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002571 int width) {
2572 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002573 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002574 "pcmpeqb %%xmm5,%%xmm5 \n"
2575 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002576 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002577 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002578 READYUV422
2579 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002580 "pcmpeqb %%xmm5,%%xmm5 \n"
2581 "punpcklbw %%xmm0,%%xmm1 \n"
2582 "punpcklbw %%xmm2,%%xmm5 \n"
2583 "movdqa %%xmm5,%%xmm0 \n"
2584 "punpcklwd %%xmm1,%%xmm5 \n"
2585 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002586 "movdqu %%xmm5,(%[dst_bgra]) \n"
2587 "movdqu %%xmm0,0x10(%[dst_bgra]) \n"
2588 "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002589 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002590 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002591 : [y_buf]"+r"(y_buf), // %[y_buf]
2592 [u_buf]"+r"(u_buf), // %[u_buf]
2593 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002594 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002595 [width]"+rm"(width) // %[width]
2596 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002597 : "memory", "cc"
2598#if defined(__SSE2__)
2599 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2600#endif
2601 );
2602}
2603
fbarchard@google.come214fe32012-06-04 23:47:11 +00002604void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002605 const uint8* u_buf,
2606 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002607 uint8* dst_abgr,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002608 int width) {
2609 asm volatile (
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002610 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002611 "pcmpeqb %%xmm5,%%xmm5 \n"
2612 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002613 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002614 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002615 READYUV422
2616 YUVTORGB
fbarchard@google.com952a5072012-03-30 18:10:50 +00002617 "punpcklbw %%xmm1,%%xmm2 \n"
2618 "punpcklbw %%xmm5,%%xmm0 \n"
2619 "movdqa %%xmm2,%%xmm1 \n"
2620 "punpcklwd %%xmm0,%%xmm2 \n"
2621 "punpckhwd %%xmm0,%%xmm1 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002622 "movdqu %%xmm2,(%[dst_abgr]) \n"
2623 "movdqu %%xmm1,0x10(%[dst_abgr]) \n"
2624 "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002625 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002626 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002627 : [y_buf]"+r"(y_buf), // %[y_buf]
2628 [u_buf]"+r"(u_buf), // %[u_buf]
2629 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002630 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002631 [width]"+rm"(width) // %[width]
2632 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002633 : "memory", "cc"
2634#if defined(__SSE2__)
2635 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2636#endif
2637 );
2638}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002639
2640void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2641 const uint8* u_buf,
2642 const uint8* v_buf,
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002643 uint8* dst_rgba,
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002644 int width) {
2645 asm volatile (
2646 "sub %[u_buf],%[v_buf] \n"
2647 "pcmpeqb %%xmm5,%%xmm5 \n"
2648 "pxor %%xmm4,%%xmm4 \n"
2649 ".p2align 4 \n"
2650 "1: \n"
2651 READYUV422
2652 YUVTORGB
2653 "pcmpeqb %%xmm5,%%xmm5 \n"
2654 "punpcklbw %%xmm2,%%xmm1 \n"
2655 "punpcklbw %%xmm0,%%xmm5 \n"
2656 "movdqa %%xmm5,%%xmm0 \n"
2657 "punpcklwd %%xmm1,%%xmm5 \n"
2658 "punpckhwd %%xmm1,%%xmm0 \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002659 "movdqa %%xmm5,(%[dst_rgba]) \n"
2660 "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
2661 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002662 "sub $0x8,%[width] \n"
2663 "jg 1b \n"
2664 : [y_buf]"+r"(y_buf), // %[y_buf]
2665 [u_buf]"+r"(u_buf), // %[u_buf]
2666 [v_buf]"+r"(v_buf), // %[v_buf]
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002667 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002668 [width]"+rm"(width) // %[width]
2669 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2670 : "memory", "cc"
2671#if defined(__SSE2__)
2672 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2673#endif
2674 );
2675}
2676
fbarchard@google.come214fe32012-06-04 23:47:11 +00002677#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002678
2679#ifdef HAS_YTOARGBROW_SSE2
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002680void YToARGBRow_SSE2(const uint8* y_buf,
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002681 uint8* dst_argb,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002682 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002683 asm volatile (
fbarchard@google.com30859f72012-11-02 09:51:29 +00002684 "pxor %%xmm5,%%xmm5 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002685 "pcmpeqb %%xmm4,%%xmm4 \n"
2686 "pslld $0x18,%%xmm4 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002687 "mov $0x00100010,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002688 "movd %%eax,%%xmm3 \n"
2689 "pshufd $0x0,%%xmm3,%%xmm3 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002690 "mov $0x004a004a,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002691 "movd %%eax,%%xmm2 \n"
2692 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002693 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002694 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002695 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002696 "movq (%0),%%xmm0 \n"
2697 "lea 0x8(%0),%0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002698 "punpcklbw %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002699 "psubusw %%xmm3,%%xmm0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002700 "pmullw %%xmm2,%%xmm0 \n"
2701 "psrlw $6, %%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002702 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002703
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002704 // Step 2: Weave into ARGB
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002705 "punpcklbw %%xmm0,%%xmm0 \n"
2706 "movdqa %%xmm0,%%xmm1 \n"
2707 "punpcklwd %%xmm0,%%xmm0 \n"
2708 "punpckhwd %%xmm1,%%xmm1 \n"
2709 "por %%xmm4,%%xmm0 \n"
2710 "por %%xmm4,%%xmm1 \n"
2711 "movdqa %%xmm0,(%1) \n"
2712 "movdqa %%xmm1,16(%1) \n"
2713 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002714
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002715 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002716 "jg 1b \n"
fbarchard@google.comc1f17f12012-11-06 02:38:45 +00002717 : "+r"(y_buf), // %0
2718 "+r"(dst_argb), // %1
2719 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002720 :
2721 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002722#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002723 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002724#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002725 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002726}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002727#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002728
fbarchard@google.com42831e02012-01-21 02:54:17 +00002729#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002730// Shuffle table for reversing the bytes.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002731CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002732 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2733};
2734
fbarchard@google.com42831e02012-01-21 02:54:17 +00002735void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002736 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002737 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002738 "movdqa %3,%%xmm5 \n"
2739 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002740 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002741 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002742 "movdqa (%0,%2),%%xmm0 \n"
2743 "pshufb %%xmm5,%%xmm0 \n"
2744 "sub $0x10,%2 \n"
2745 "movdqa %%xmm0,(%1) \n"
2746 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002747 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002748 : "+r"(src), // %0
2749 "+r"(dst), // %1
2750 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002751 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002752 : "memory", "cc"
2753#if defined(__SSE2__)
2754 , "xmm0", "xmm5"
2755#endif
2756 );
2757}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002758#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002759
fbarchard@google.com42831e02012-01-21 02:54:17 +00002760#ifdef HAS_MIRRORROW_SSE2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002761void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002762 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002763 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002764 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002765 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002766 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002767 "movdqu (%0,%2),%%xmm0 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002768 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002769 "psllw $0x8,%%xmm0 \n"
2770 "psrlw $0x8,%%xmm1 \n"
2771 "por %%xmm1,%%xmm0 \n"
2772 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2773 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2774 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2775 "sub $0x10,%2 \n"
2776 "movdqu %%xmm0,(%1) \n"
2777 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002778 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002779 : "+r"(src), // %0
2780 "+r"(dst), // %1
2781 "+r"(temp_width) // %2
2782 :
2783 : "memory", "cc"
2784#if defined(__SSE2__)
2785 , "xmm0", "xmm1"
2786#endif
2787 );
2788}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002789#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002790
fbarchard@google.com16a96642012-03-02 22:38:09 +00002791#ifdef HAS_MIRRORROW_UV_SSSE3
2792// Shuffle table for reversing the bytes of UV channels.
2793CONST uvec8 kShuffleMirrorUV = {
2794 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2795};
fbarchard@google.combdf7cb52012-11-05 23:40:11 +00002796void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
fbarchard@google.com16a96642012-03-02 22:38:09 +00002797 int width) {
2798 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002799 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002800 "movdqa %4,%%xmm1 \n"
2801 "lea -16(%0,%3,2),%0 \n"
2802 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002803 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002804 "1: \n"
2805 "movdqa (%0),%%xmm0 \n"
2806 "lea -16(%0),%0 \n"
2807 "pshufb %%xmm1,%%xmm0 \n"
2808 "sub $8,%3 \n"
2809 "movlpd %%xmm0,(%1) \n"
2810 "movhpd %%xmm0,(%1,%2) \n"
2811 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002812 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002813 : "+r"(src), // %0
2814 "+r"(dst_u), // %1
2815 "+r"(dst_v), // %2
2816 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002817 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002818 : "memory", "cc"
2819#if defined(__SSE2__)
2820 , "xmm0", "xmm1"
2821#endif
2822 );
2823}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002824#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002825
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002826#ifdef HAS_ARGBMIRRORROW_SSSE3
2827// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002828CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002829 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2830};
2831
2832void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2833 intptr_t temp_width = static_cast<intptr_t>(width);
2834 asm volatile (
2835 "movdqa %3,%%xmm5 \n"
2836 "lea -0x10(%0),%0 \n"
2837 ".p2align 4 \n"
2838 "1: \n"
2839 "movdqa (%0,%2,4),%%xmm0 \n"
2840 "pshufb %%xmm5,%%xmm0 \n"
2841 "sub $0x4,%2 \n"
2842 "movdqa %%xmm0,(%1) \n"
2843 "lea 0x10(%1),%1 \n"
2844 "jg 1b \n"
2845 : "+r"(src), // %0
2846 "+r"(dst), // %1
2847 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002848 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002849 : "memory", "cc"
2850#if defined(__SSE2__)
2851 , "xmm0", "xmm5"
2852#endif
2853 );
2854}
2855#endif // HAS_ARGBMIRRORROW_SSSE3
2856
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002857#ifdef HAS_SPLITUVROW_SSE2
2858void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002859 asm volatile (
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002860 "pcmpeqb %%xmm5,%%xmm5 \n"
2861 "psrlw $0x8,%%xmm5 \n"
2862 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002863 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002864 "1: \n"
2865 "movdqa (%0),%%xmm0 \n"
2866 "movdqa 0x10(%0),%%xmm1 \n"
2867 "lea 0x20(%0),%0 \n"
2868 "movdqa %%xmm0,%%xmm2 \n"
2869 "movdqa %%xmm1,%%xmm3 \n"
2870 "pand %%xmm5,%%xmm0 \n"
2871 "pand %%xmm5,%%xmm1 \n"
2872 "packuswb %%xmm1,%%xmm0 \n"
2873 "psrlw $0x8,%%xmm2 \n"
2874 "psrlw $0x8,%%xmm3 \n"
2875 "packuswb %%xmm3,%%xmm2 \n"
2876 "movdqa %%xmm0,(%1) \n"
2877 "movdqa %%xmm2,(%1,%2) \n"
2878 "lea 0x10(%1),%1 \n"
2879 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002880 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002881 : "+r"(src_uv), // %0
2882 "+r"(dst_u), // %1
2883 "+r"(dst_v), // %2
2884 "+r"(pix) // %3
2885 :
2886 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002887#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002888 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002889#endif
2890 );
2891}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002892
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002893void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2894 int pix) {
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002895 asm volatile (
2896 "pcmpeqb %%xmm5,%%xmm5 \n"
2897 "psrlw $0x8,%%xmm5 \n"
2898 "sub %1,%2 \n"
2899 ".p2align 4 \n"
2900 "1: \n"
2901 "movdqu (%0),%%xmm0 \n"
2902 "movdqu 0x10(%0),%%xmm1 \n"
2903 "lea 0x20(%0),%0 \n"
2904 "movdqa %%xmm0,%%xmm2 \n"
2905 "movdqa %%xmm1,%%xmm3 \n"
2906 "pand %%xmm5,%%xmm0 \n"
2907 "pand %%xmm5,%%xmm1 \n"
2908 "packuswb %%xmm1,%%xmm0 \n"
2909 "psrlw $0x8,%%xmm2 \n"
2910 "psrlw $0x8,%%xmm3 \n"
2911 "packuswb %%xmm3,%%xmm2 \n"
2912 "movdqu %%xmm0,(%1) \n"
2913 "movdqu %%xmm2,(%1,%2) \n"
2914 "lea 0x10(%1),%1 \n"
2915 "sub $0x10,%3 \n"
2916 "jg 1b \n"
2917 : "+r"(src_uv), // %0
2918 "+r"(dst_u), // %1
2919 "+r"(dst_v), // %2
2920 "+r"(pix) // %3
2921 :
2922 : "memory", "cc"
2923#if defined(__SSE2__)
2924 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2925#endif
2926 );
2927}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002928#endif // HAS_SPLITUVROW_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002929
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002930#ifdef HAS_MERGEUVROW_SSE2
2931void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2932 int width) {
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002933 asm volatile (
2934 "sub %0,%1 \n"
2935 ".p2align 4 \n"
2936 "1: \n"
2937 "movdqa (%0),%%xmm0 \n"
2938 "movdqa (%0,%1,1),%%xmm1 \n"
2939 "lea 0x10(%0),%0 \n"
2940 "movdqa %%xmm0,%%xmm2 \n"
2941 "punpcklbw %%xmm1,%%xmm0 \n"
2942 "punpckhbw %%xmm1,%%xmm2 \n"
2943 "movdqa %%xmm0,(%2) \n"
2944 "movdqa %%xmm2,0x10(%2) \n"
2945 "lea 0x20(%2),%2 \n"
2946 "sub $0x10,%3 \n"
2947 "jg 1b \n"
2948 : "+r"(src_u), // %0
2949 "+r"(src_v), // %1
2950 "+r"(dst_uv), // %2
2951 "+r"(width) // %3
2952 :
2953 : "memory", "cc"
2954#if defined(__SSE2__)
2955 , "xmm0", "xmm1", "xmm2"
2956#endif
2957 );
2958}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002959
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002960void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2961 uint8* dst_uv, int width) {
fbarchard@google.come0d86482012-10-27 19:07:55 +00002962 asm volatile (
2963 "sub %0,%1 \n"
2964 ".p2align 4 \n"
2965 "1: \n"
2966 "movdqu (%0),%%xmm0 \n"
2967 "movdqu (%0,%1,1),%%xmm1 \n"
2968 "lea 0x10(%0),%0 \n"
2969 "movdqa %%xmm0,%%xmm2 \n"
2970 "punpcklbw %%xmm1,%%xmm0 \n"
2971 "punpckhbw %%xmm1,%%xmm2 \n"
2972 "movdqu %%xmm0,(%2) \n"
2973 "movdqu %%xmm2,0x10(%2) \n"
2974 "lea 0x20(%2),%2 \n"
2975 "sub $0x10,%3 \n"
2976 "jg 1b \n"
2977 : "+r"(src_u), // %0
2978 "+r"(src_v), // %1
2979 "+r"(dst_uv), // %2
2980 "+r"(width) // %3
2981 :
2982 : "memory", "cc"
2983#if defined(__SSE2__)
2984 , "xmm0", "xmm1", "xmm2"
2985#endif
2986 );
2987}
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00002988#endif // HAS_MERGEUVROW_SSE2
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002989
fbarchard@google.com19932f82012-02-16 22:19:14 +00002990#ifdef HAS_COPYROW_SSE2
2991void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002992 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002993 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002994 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002995 "1: \n"
2996 "movdqa (%0),%%xmm0 \n"
2997 "movdqa 0x10(%0),%%xmm1 \n"
2998 "movdqa %%xmm0,(%0,%1) \n"
2999 "movdqa %%xmm1,0x10(%0,%1) \n"
3000 "lea 0x20(%0),%0 \n"
3001 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003002 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00003003 : "+r"(src), // %0
3004 "+r"(dst), // %1
3005 "+r"(count) // %2
3006 :
3007 : "memory", "cc"
3008#if defined(__SSE2__)
3009 , "xmm0", "xmm1"
3010#endif
3011 );
3012}
3013#endif // HAS_COPYROW_SSE2
3014
3015#ifdef HAS_COPYROW_X86
3016void CopyRow_X86(const uint8* src, uint8* dst, int width) {
3017 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003018 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00003019 "shr $0x2,%2 \n"
3020 "rep movsl \n"
3021 : "+S"(src), // %0
3022 "+D"(dst), // %1
3023 "+c"(width_tmp) // %2
3024 :
3025 : "memory", "cc"
3026 );
3027}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00003028#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00003029
fbarchard@google.comaa7988f2013-04-12 00:44:33 +00003030// Unaligned Multiple of 1.
3031void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
3032 size_t width_tmp = static_cast<size_t>(width);
3033 asm volatile (
3034 "rep movsb \n"
3035 : "+S"(src), // %0
3036 "+D"(dst), // %1
3037 "+c"(width_tmp) // %2
3038 :
3039 : "memory", "cc"
3040 );
3041}
3042
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003043#ifdef HAS_SETROW_X86
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00003044void SetRow_X86(uint8* dst, uint32 v32, int width) {
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003045 size_t width_tmp = static_cast<size_t>(width);
3046 asm volatile (
3047 "shr $0x2,%1 \n"
3048 "rep stosl \n"
3049 : "+D"(dst), // %0
3050 "+c"(width_tmp) // %1
3051 : "a"(v32) // %2
3052 : "memory", "cc");
3053}
3054
fbarchard@google.comf08ac6b2012-11-15 00:21:14 +00003055void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003056 int dst_stride, int height) {
3057 for (int y = 0; y < height; ++y) {
3058 size_t width_tmp = static_cast<size_t>(width);
3059 uint32* d = reinterpret_cast<uint32*>(dst);
3060 asm volatile (
3061 "rep stosl \n"
3062 : "+D"(d), // %0
3063 "+c"(width_tmp) // %1
3064 : "a"(v32) // %2
3065 : "memory", "cc");
3066 dst += dst_stride;
3067 }
3068}
3069#endif // HAS_SETROW_X86
3070
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003071#ifdef HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003072void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003073 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003074 "pcmpeqb %%xmm5,%%xmm5 \n"
3075 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003076 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003077 "1: \n"
3078 "movdqa (%0),%%xmm0 \n"
3079 "movdqa 0x10(%0),%%xmm1 \n"
3080 "lea 0x20(%0),%0 \n"
3081 "pand %%xmm5,%%xmm0 \n"
3082 "pand %%xmm5,%%xmm1 \n"
3083 "packuswb %%xmm1,%%xmm0 \n"
3084 "movdqa %%xmm0,(%1) \n"
3085 "lea 0x10(%1),%1 \n"
3086 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003087 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003088 : "+r"(src_yuy2), // %0
3089 "+r"(dst_y), // %1
3090 "+r"(pix) // %2
3091 :
3092 : "memory", "cc"
3093#if defined(__SSE2__)
3094 , "xmm0", "xmm1", "xmm5"
3095#endif
3096 );
3097}
3098
3099void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003100 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003101 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003102 "pcmpeqb %%xmm5,%%xmm5 \n"
3103 "psrlw $0x8,%%xmm5 \n"
3104 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003105 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003106 "1: \n"
3107 "movdqa (%0),%%xmm0 \n"
3108 "movdqa 0x10(%0),%%xmm1 \n"
3109 "movdqa (%0,%4,1),%%xmm2 \n"
3110 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
3111 "lea 0x20(%0),%0 \n"
3112 "pavgb %%xmm2,%%xmm0 \n"
3113 "pavgb %%xmm3,%%xmm1 \n"
3114 "psrlw $0x8,%%xmm0 \n"
3115 "psrlw $0x8,%%xmm1 \n"
3116 "packuswb %%xmm1,%%xmm0 \n"
3117 "movdqa %%xmm0,%%xmm1 \n"
3118 "pand %%xmm5,%%xmm0 \n"
3119 "packuswb %%xmm0,%%xmm0 \n"
3120 "psrlw $0x8,%%xmm1 \n"
3121 "packuswb %%xmm1,%%xmm1 \n"
3122 "movq %%xmm0,(%1) \n"
3123 "movq %%xmm1,(%1,%2) \n"
3124 "lea 0x8(%1),%1 \n"
3125 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003126 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003127 : "+r"(src_yuy2), // %0
3128 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003129 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003130 "+r"(pix) // %3
3131 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
3132 : "memory", "cc"
3133#if defined(__SSE2__)
3134 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3135#endif
3136 );
3137}
3138
fbarchard@google.comc704f782012-08-30 19:53:48 +00003139void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3140 uint8* dst_u, uint8* dst_v, int pix) {
3141 asm volatile (
3142 "pcmpeqb %%xmm5,%%xmm5 \n"
3143 "psrlw $0x8,%%xmm5 \n"
3144 "sub %1,%2 \n"
3145 ".p2align 4 \n"
3146 "1: \n"
3147 "movdqa (%0),%%xmm0 \n"
3148 "movdqa 0x10(%0),%%xmm1 \n"
3149 "lea 0x20(%0),%0 \n"
3150 "psrlw $0x8,%%xmm0 \n"
3151 "psrlw $0x8,%%xmm1 \n"
3152 "packuswb %%xmm1,%%xmm0 \n"
3153 "movdqa %%xmm0,%%xmm1 \n"
3154 "pand %%xmm5,%%xmm0 \n"
3155 "packuswb %%xmm0,%%xmm0 \n"
3156 "psrlw $0x8,%%xmm1 \n"
3157 "packuswb %%xmm1,%%xmm1 \n"
3158 "movq %%xmm0,(%1) \n"
3159 "movq %%xmm1,(%1,%2) \n"
3160 "lea 0x8(%1),%1 \n"
3161 "sub $0x10,%3 \n"
3162 "jg 1b \n"
3163 : "+r"(src_yuy2), // %0
3164 "+r"(dst_u), // %1
3165 "+r"(dst_v), // %2
3166 "+r"(pix) // %3
3167 :
3168 : "memory", "cc"
3169#if defined(__SSE2__)
3170 , "xmm0", "xmm1", "xmm5"
3171#endif
3172 );
3173}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00003174
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003175void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
3176 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003177 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003178 "pcmpeqb %%xmm5,%%xmm5 \n"
3179 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003180 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003181 "1: \n"
3182 "movdqu (%0),%%xmm0 \n"
3183 "movdqu 0x10(%0),%%xmm1 \n"
3184 "lea 0x20(%0),%0 \n"
3185 "pand %%xmm5,%%xmm0 \n"
3186 "pand %%xmm5,%%xmm1 \n"
3187 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003188 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003189 "movdqu %%xmm0,(%1) \n"
3190 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003191 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003192 : "+r"(src_yuy2), // %0
3193 "+r"(dst_y), // %1
3194 "+r"(pix) // %2
3195 :
3196 : "memory", "cc"
3197#if defined(__SSE2__)
3198 , "xmm0", "xmm1", "xmm5"
3199#endif
3200 );
3201}
3202
3203void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
3204 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00003205 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003206 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003207 "pcmpeqb %%xmm5,%%xmm5 \n"
3208 "psrlw $0x8,%%xmm5 \n"
3209 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003210 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003211 "1: \n"
3212 "movdqu (%0),%%xmm0 \n"
3213 "movdqu 0x10(%0),%%xmm1 \n"
3214 "movdqu (%0,%4,1),%%xmm2 \n"
3215 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3216 "lea 0x20(%0),%0 \n"
3217 "pavgb %%xmm2,%%xmm0 \n"
3218 "pavgb %%xmm3,%%xmm1 \n"
3219 "psrlw $0x8,%%xmm0 \n"
3220 "psrlw $0x8,%%xmm1 \n"
3221 "packuswb %%xmm1,%%xmm0 \n"
3222 "movdqa %%xmm0,%%xmm1 \n"
3223 "pand %%xmm5,%%xmm0 \n"
3224 "packuswb %%xmm0,%%xmm0 \n"
3225 "psrlw $0x8,%%xmm1 \n"
3226 "packuswb %%xmm1,%%xmm1 \n"
3227 "movq %%xmm0,(%1) \n"
3228 "movq %%xmm1,(%1,%2) \n"
3229 "lea 0x8(%1),%1 \n"
3230 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003231 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003232 : "+r"(src_yuy2), // %0
3233 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003234 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003235 "+r"(pix) // %3
3236 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
3237 : "memory", "cc"
3238#if defined(__SSE2__)
3239 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3240#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00003241 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003242}
3243
fbarchard@google.comc704f782012-08-30 19:53:48 +00003244void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
3245 uint8* dst_u, uint8* dst_v, int pix) {
3246 asm volatile (
3247 "pcmpeqb %%xmm5,%%xmm5 \n"
3248 "psrlw $0x8,%%xmm5 \n"
3249 "sub %1,%2 \n"
3250 ".p2align 4 \n"
3251 "1: \n"
3252 "movdqu (%0),%%xmm0 \n"
3253 "movdqu 0x10(%0),%%xmm1 \n"
3254 "lea 0x20(%0),%0 \n"
3255 "psrlw $0x8,%%xmm0 \n"
3256 "psrlw $0x8,%%xmm1 \n"
3257 "packuswb %%xmm1,%%xmm0 \n"
3258 "movdqa %%xmm0,%%xmm1 \n"
3259 "pand %%xmm5,%%xmm0 \n"
3260 "packuswb %%xmm0,%%xmm0 \n"
3261 "psrlw $0x8,%%xmm1 \n"
3262 "packuswb %%xmm1,%%xmm1 \n"
3263 "movq %%xmm0,(%1) \n"
3264 "movq %%xmm1,(%1,%2) \n"
3265 "lea 0x8(%1),%1 \n"
3266 "sub $0x10,%3 \n"
3267 "jg 1b \n"
3268 : "+r"(src_yuy2), // %0
3269 "+r"(dst_u), // %1
3270 "+r"(dst_v), // %2
3271 "+r"(pix) // %3
3272 :
3273 : "memory", "cc"
3274#if defined(__SSE2__)
3275 , "xmm0", "xmm1", "xmm5"
3276#endif
3277 );
3278}
3279
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003280void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003281 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003282 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003283 "1: \n"
3284 "movdqa (%0),%%xmm0 \n"
3285 "movdqa 0x10(%0),%%xmm1 \n"
3286 "lea 0x20(%0),%0 \n"
3287 "psrlw $0x8,%%xmm0 \n"
3288 "psrlw $0x8,%%xmm1 \n"
3289 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003290 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003291 "movdqa %%xmm0,(%1) \n"
3292 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003293 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003294 : "+r"(src_uyvy), // %0
3295 "+r"(dst_y), // %1
3296 "+r"(pix) // %2
3297 :
3298 : "memory", "cc"
3299#if defined(__SSE2__)
3300 , "xmm0", "xmm1"
3301#endif
3302 );
3303}
3304
3305void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003306 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003307 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003308 "pcmpeqb %%xmm5,%%xmm5 \n"
3309 "psrlw $0x8,%%xmm5 \n"
3310 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003311 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003312 "1: \n"
3313 "movdqa (%0),%%xmm0 \n"
3314 "movdqa 0x10(%0),%%xmm1 \n"
3315 "movdqa (%0,%4,1),%%xmm2 \n"
3316 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
3317 "lea 0x20(%0),%0 \n"
3318 "pavgb %%xmm2,%%xmm0 \n"
3319 "pavgb %%xmm3,%%xmm1 \n"
3320 "pand %%xmm5,%%xmm0 \n"
3321 "pand %%xmm5,%%xmm1 \n"
3322 "packuswb %%xmm1,%%xmm0 \n"
3323 "movdqa %%xmm0,%%xmm1 \n"
3324 "pand %%xmm5,%%xmm0 \n"
3325 "packuswb %%xmm0,%%xmm0 \n"
3326 "psrlw $0x8,%%xmm1 \n"
3327 "packuswb %%xmm1,%%xmm1 \n"
3328 "movq %%xmm0,(%1) \n"
3329 "movq %%xmm1,(%1,%2) \n"
3330 "lea 0x8(%1),%1 \n"
3331 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003332 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003333 : "+r"(src_uyvy), // %0
3334 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003335 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003336 "+r"(pix) // %3
3337 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3338 : "memory", "cc"
3339#if defined(__SSE2__)
3340 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3341#endif
3342 );
3343}
3344
fbarchard@google.comc704f782012-08-30 19:53:48 +00003345void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3346 uint8* dst_u, uint8* dst_v, int pix) {
3347 asm volatile (
3348 "pcmpeqb %%xmm5,%%xmm5 \n"
3349 "psrlw $0x8,%%xmm5 \n"
3350 "sub %1,%2 \n"
3351 ".p2align 4 \n"
3352 "1: \n"
3353 "movdqa (%0),%%xmm0 \n"
3354 "movdqa 0x10(%0),%%xmm1 \n"
3355 "lea 0x20(%0),%0 \n"
3356 "pand %%xmm5,%%xmm0 \n"
3357 "pand %%xmm5,%%xmm1 \n"
3358 "packuswb %%xmm1,%%xmm0 \n"
3359 "movdqa %%xmm0,%%xmm1 \n"
3360 "pand %%xmm5,%%xmm0 \n"
3361 "packuswb %%xmm0,%%xmm0 \n"
3362 "psrlw $0x8,%%xmm1 \n"
3363 "packuswb %%xmm1,%%xmm1 \n"
3364 "movq %%xmm0,(%1) \n"
3365 "movq %%xmm1,(%1,%2) \n"
3366 "lea 0x8(%1),%1 \n"
3367 "sub $0x10,%3 \n"
3368 "jg 1b \n"
3369 : "+r"(src_uyvy), // %0
3370 "+r"(dst_u), // %1
3371 "+r"(dst_v), // %2
3372 "+r"(pix) // %3
3373 :
3374 : "memory", "cc"
3375#if defined(__SSE2__)
3376 , "xmm0", "xmm1", "xmm5"
3377#endif
3378 );
3379}
3380
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003381void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3382 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003383 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003384 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003385 "1: \n"
3386 "movdqu (%0),%%xmm0 \n"
3387 "movdqu 0x10(%0),%%xmm1 \n"
3388 "lea 0x20(%0),%0 \n"
3389 "psrlw $0x8,%%xmm0 \n"
3390 "psrlw $0x8,%%xmm1 \n"
3391 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003392 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003393 "movdqu %%xmm0,(%1) \n"
3394 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003395 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003396 : "+r"(src_uyvy), // %0
3397 "+r"(dst_y), // %1
3398 "+r"(pix) // %2
3399 :
3400 : "memory", "cc"
3401#if defined(__SSE2__)
3402 , "xmm0", "xmm1"
3403#endif
3404 );
3405}
3406
3407void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003408 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003409 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003410 "pcmpeqb %%xmm5,%%xmm5 \n"
3411 "psrlw $0x8,%%xmm5 \n"
3412 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003413 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003414 "1: \n"
3415 "movdqu (%0),%%xmm0 \n"
3416 "movdqu 0x10(%0),%%xmm1 \n"
3417 "movdqu (%0,%4,1),%%xmm2 \n"
3418 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3419 "lea 0x20(%0),%0 \n"
3420 "pavgb %%xmm2,%%xmm0 \n"
3421 "pavgb %%xmm3,%%xmm1 \n"
3422 "pand %%xmm5,%%xmm0 \n"
3423 "pand %%xmm5,%%xmm1 \n"
3424 "packuswb %%xmm1,%%xmm0 \n"
3425 "movdqa %%xmm0,%%xmm1 \n"
3426 "pand %%xmm5,%%xmm0 \n"
3427 "packuswb %%xmm0,%%xmm0 \n"
3428 "psrlw $0x8,%%xmm1 \n"
3429 "packuswb %%xmm1,%%xmm1 \n"
3430 "movq %%xmm0,(%1) \n"
3431 "movq %%xmm1,(%1,%2) \n"
3432 "lea 0x8(%1),%1 \n"
3433 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003434 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003435 : "+r"(src_uyvy), // %0
3436 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003437 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003438 "+r"(pix) // %3
3439 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3440 : "memory", "cc"
3441#if defined(__SSE2__)
3442 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3443#endif
3444 );
3445}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003446
3447void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3448 uint8* dst_u, uint8* dst_v, int pix) {
3449 asm volatile (
3450 "pcmpeqb %%xmm5,%%xmm5 \n"
3451 "psrlw $0x8,%%xmm5 \n"
3452 "sub %1,%2 \n"
3453 ".p2align 4 \n"
3454 "1: \n"
3455 "movdqu (%0),%%xmm0 \n"
3456 "movdqu 0x10(%0),%%xmm1 \n"
3457 "lea 0x20(%0),%0 \n"
3458 "pand %%xmm5,%%xmm0 \n"
3459 "pand %%xmm5,%%xmm1 \n"
3460 "packuswb %%xmm1,%%xmm0 \n"
3461 "movdqa %%xmm0,%%xmm1 \n"
3462 "pand %%xmm5,%%xmm0 \n"
3463 "packuswb %%xmm0,%%xmm0 \n"
3464 "psrlw $0x8,%%xmm1 \n"
3465 "packuswb %%xmm1,%%xmm1 \n"
3466 "movq %%xmm0,(%1) \n"
3467 "movq %%xmm1,(%1,%2) \n"
3468 "lea 0x8(%1),%1 \n"
3469 "sub $0x10,%3 \n"
3470 "jg 1b \n"
3471 : "+r"(src_uyvy), // %0
3472 "+r"(dst_u), // %1
3473 "+r"(dst_v), // %2
3474 "+r"(pix) // %3
3475 :
3476 : "memory", "cc"
3477#if defined(__SSE2__)
3478 , "xmm0", "xmm1", "xmm5"
3479#endif
3480 );
3481}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003482#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003483
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003484#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003485// Blend 8 pixels at a time.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003486void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3487 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003488 asm volatile (
3489 "pcmpeqb %%xmm7,%%xmm7 \n"
3490 "psrlw $0xf,%%xmm7 \n"
3491 "pcmpeqb %%xmm6,%%xmm6 \n"
3492 "psrlw $0x8,%%xmm6 \n"
3493 "pcmpeqb %%xmm5,%%xmm5 \n"
3494 "psllw $0x8,%%xmm5 \n"
3495 "pcmpeqb %%xmm4,%%xmm4 \n"
3496 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003497 "sub $0x1,%3 \n"
3498 "je 91f \n"
3499 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003500
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003501 // 1 pixel loop until destination pointer is aligned.
3502 "10: \n"
3503 "test $0xf,%2 \n"
3504 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003505 "movd (%0),%%xmm3 \n"
3506 "lea 0x4(%0),%0 \n"
3507 "movdqa %%xmm3,%%xmm0 \n"
3508 "pxor %%xmm4,%%xmm3 \n"
3509 "movd (%1),%%xmm2 \n"
3510 "psrlw $0x8,%%xmm3 \n"
3511 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3512 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3513 "pand %%xmm6,%%xmm2 \n"
3514 "paddw %%xmm7,%%xmm3 \n"
3515 "pmullw %%xmm3,%%xmm2 \n"
3516 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003517 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003518 "psrlw $0x8,%%xmm1 \n"
3519 "por %%xmm4,%%xmm0 \n"
3520 "pmullw %%xmm3,%%xmm1 \n"
3521 "psrlw $0x8,%%xmm2 \n"
3522 "paddusb %%xmm2,%%xmm0 \n"
3523 "pand %%xmm5,%%xmm1 \n"
3524 "paddusb %%xmm1,%%xmm0 \n"
3525 "sub $0x1,%3 \n"
3526 "movd %%xmm0,(%2) \n"
3527 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003528 "jge 10b \n"
3529
3530 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00003531 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003532 "jl 49f \n"
3533
fbarchard@google.com794fe122012-06-15 01:05:01 +00003534 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003535 ".p2align 2 \n"
3536 "41: \n"
3537 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003538 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003539 "movdqa %%xmm3,%%xmm0 \n"
3540 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003541 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003542 "psrlw $0x8,%%xmm3 \n"
3543 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3544 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003545 "pand %%xmm6,%%xmm2 \n"
3546 "paddw %%xmm7,%%xmm3 \n"
3547 "pmullw %%xmm3,%%xmm2 \n"
3548 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003549 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003550 "psrlw $0x8,%%xmm1 \n"
3551 "por %%xmm4,%%xmm0 \n"
3552 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003553 "psrlw $0x8,%%xmm2 \n"
3554 "paddusb %%xmm2,%%xmm0 \n"
3555 "pand %%xmm5,%%xmm1 \n"
3556 "paddusb %%xmm1,%%xmm0 \n"
3557 "sub $0x4,%3 \n"
3558 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003559 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003560 "jge 41b \n"
3561
3562 "49: \n"
3563 "add $0x3,%3 \n"
3564 "jl 99f \n"
3565
fbarchard@google.com794fe122012-06-15 01:05:01 +00003566 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003567 "91: \n"
3568 "movd (%0),%%xmm3 \n"
3569 "lea 0x4(%0),%0 \n"
3570 "movdqa %%xmm3,%%xmm0 \n"
3571 "pxor %%xmm4,%%xmm3 \n"
3572 "movd (%1),%%xmm2 \n"
3573 "psrlw $0x8,%%xmm3 \n"
3574 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3575 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3576 "pand %%xmm6,%%xmm2 \n"
3577 "paddw %%xmm7,%%xmm3 \n"
3578 "pmullw %%xmm3,%%xmm2 \n"
3579 "movd (%1),%%xmm1 \n"
3580 "lea 0x4(%1),%1 \n"
3581 "psrlw $0x8,%%xmm1 \n"
3582 "por %%xmm4,%%xmm0 \n"
3583 "pmullw %%xmm3,%%xmm1 \n"
3584 "psrlw $0x8,%%xmm2 \n"
3585 "paddusb %%xmm2,%%xmm0 \n"
3586 "pand %%xmm5,%%xmm1 \n"
3587 "paddusb %%xmm1,%%xmm0 \n"
3588 "sub $0x1,%3 \n"
3589 "movd %%xmm0,(%2) \n"
3590 "lea 0x4(%2),%2 \n"
3591 "jge 91b \n"
3592 "99: \n"
3593 : "+r"(src_argb0), // %0
3594 "+r"(src_argb1), // %1
3595 "+r"(dst_argb), // %2
3596 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003597 :
3598 : "memory", "cc"
3599#if defined(__SSE2__)
3600 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3601#endif
3602 );
3603}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003604#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003605
fbarchard@google.com96af8702012-04-06 18:22:27 +00003606#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003607// Shuffle table for isolating alpha.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003608CONST uvec8 kShuffleAlpha = {
3609 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3610 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3611};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003612
3613// Blend 8 pixels at a time
3614// Shuffle table for reversing the bytes.
3615
3616// Same as SSE2, but replaces
3617// psrlw xmm3, 8 // alpha
3618// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3619// pshuflw xmm3, xmm3,0F5h
3620// with..
3621// pshufb xmm3, kShuffleAlpha // alpha
3622
3623void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3624 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003625 asm volatile (
3626 "pcmpeqb %%xmm7,%%xmm7 \n"
3627 "psrlw $0xf,%%xmm7 \n"
3628 "pcmpeqb %%xmm6,%%xmm6 \n"
3629 "psrlw $0x8,%%xmm6 \n"
3630 "pcmpeqb %%xmm5,%%xmm5 \n"
3631 "psllw $0x8,%%xmm5 \n"
3632 "pcmpeqb %%xmm4,%%xmm4 \n"
3633 "pslld $0x18,%%xmm4 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003634 "sub $0x1,%3 \n"
3635 "je 91f \n"
3636 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003637
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003638 // 1 pixel loop until destination pointer is aligned.
3639 "10: \n"
3640 "test $0xf,%2 \n"
3641 "je 19f \n"
3642 "movd (%0),%%xmm3 \n"
3643 "lea 0x4(%0),%0 \n"
3644 "movdqa %%xmm3,%%xmm0 \n"
3645 "pxor %%xmm4,%%xmm3 \n"
3646 "movd (%1),%%xmm2 \n"
3647 "pshufb %4,%%xmm3 \n"
3648 "pand %%xmm6,%%xmm2 \n"
3649 "paddw %%xmm7,%%xmm3 \n"
3650 "pmullw %%xmm3,%%xmm2 \n"
3651 "movd (%1),%%xmm1 \n"
3652 "lea 0x4(%1),%1 \n"
3653 "psrlw $0x8,%%xmm1 \n"
3654 "por %%xmm4,%%xmm0 \n"
3655 "pmullw %%xmm3,%%xmm1 \n"
3656 "psrlw $0x8,%%xmm2 \n"
3657 "paddusb %%xmm2,%%xmm0 \n"
3658 "pand %%xmm5,%%xmm1 \n"
3659 "paddusb %%xmm1,%%xmm0 \n"
3660 "sub $0x1,%3 \n"
3661 "movd %%xmm0,(%2) \n"
3662 "lea 0x4(%2),%2 \n"
3663 "jge 10b \n"
3664
3665 "19: \n"
fbarchard@google.comee220882012-06-14 00:07:56 +00003666 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003667 "jl 49f \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003668 "test $0xf,%0 \n"
3669 "jne 41f \n"
3670 "test $0xf,%1 \n"
3671 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003672
fbarchard@google.com794fe122012-06-15 01:05:01 +00003673 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003674 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003675 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003676 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003677 "lea 0x10(%0),%0 \n"
3678 "movdqa %%xmm3,%%xmm0 \n"
3679 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003680 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003681 "pshufb %4,%%xmm3 \n"
3682 "pand %%xmm6,%%xmm2 \n"
3683 "paddw %%xmm7,%%xmm3 \n"
3684 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003685 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003686 "lea 0x10(%1),%1 \n"
3687 "psrlw $0x8,%%xmm1 \n"
3688 "por %%xmm4,%%xmm0 \n"
3689 "pmullw %%xmm3,%%xmm1 \n"
3690 "psrlw $0x8,%%xmm2 \n"
3691 "paddusb %%xmm2,%%xmm0 \n"
3692 "pand %%xmm5,%%xmm1 \n"
3693 "paddusb %%xmm1,%%xmm0 \n"
3694 "sub $0x4,%3 \n"
3695 "movdqa %%xmm0,(%2) \n"
3696 "lea 0x10(%2),%2 \n"
3697 "jge 40b \n"
3698 "jmp 49f \n"
3699
3700 // 4 pixel unaligned loop.
3701 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003702 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003703 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003704 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003705 "movdqa %%xmm3,%%xmm0 \n"
3706 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003707 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003708 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003709 "pand %%xmm6,%%xmm2 \n"
3710 "paddw %%xmm7,%%xmm3 \n"
3711 "pmullw %%xmm3,%%xmm2 \n"
3712 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003713 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003714 "psrlw $0x8,%%xmm1 \n"
3715 "por %%xmm4,%%xmm0 \n"
3716 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003717 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003718 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003719 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003720 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003721 "sub $0x4,%3 \n"
3722 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003723 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003724 "jge 41b \n"
3725
3726 "49: \n"
3727 "add $0x3,%3 \n"
3728 "jl 99f \n"
3729
fbarchard@google.com794fe122012-06-15 01:05:01 +00003730 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003731 "91: \n"
3732 "movd (%0),%%xmm3 \n"
3733 "lea 0x4(%0),%0 \n"
3734 "movdqa %%xmm3,%%xmm0 \n"
3735 "pxor %%xmm4,%%xmm3 \n"
3736 "movd (%1),%%xmm2 \n"
3737 "pshufb %4,%%xmm3 \n"
3738 "pand %%xmm6,%%xmm2 \n"
3739 "paddw %%xmm7,%%xmm3 \n"
3740 "pmullw %%xmm3,%%xmm2 \n"
3741 "movd (%1),%%xmm1 \n"
3742 "lea 0x4(%1),%1 \n"
3743 "psrlw $0x8,%%xmm1 \n"
3744 "por %%xmm4,%%xmm0 \n"
3745 "pmullw %%xmm3,%%xmm1 \n"
3746 "psrlw $0x8,%%xmm2 \n"
3747 "paddusb %%xmm2,%%xmm0 \n"
3748 "pand %%xmm5,%%xmm1 \n"
3749 "paddusb %%xmm1,%%xmm0 \n"
3750 "sub $0x1,%3 \n"
3751 "movd %%xmm0,(%2) \n"
3752 "lea 0x4(%2),%2 \n"
3753 "jge 91b \n"
3754 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003755 : "+r"(src_argb0), // %0
3756 "+r"(src_argb1), // %1
3757 "+r"(dst_argb), // %2
3758 "+r"(width) // %3
3759 : "m"(kShuffleAlpha) // %4
3760 : "memory", "cc"
3761#if defined(__SSE2__)
3762 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3763#endif
3764 );
3765}
3766#endif // HAS_ARGBBLENDROW_SSSE3
3767
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003768#ifdef HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003769// Attenuate 4 pixels at a time.
3770// aligned to 16 bytes
3771void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3772 asm volatile (
3773 "sub %0,%1 \n"
3774 "pcmpeqb %%xmm4,%%xmm4 \n"
3775 "pslld $0x18,%%xmm4 \n"
3776 "pcmpeqb %%xmm5,%%xmm5 \n"
3777 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003778
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003779 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003780 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003781 "1: \n"
3782 "movdqa (%0),%%xmm0 \n"
3783 "punpcklbw %%xmm0,%%xmm0 \n"
3784 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3785 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3786 "pmulhuw %%xmm2,%%xmm0 \n"
3787 "movdqa (%0),%%xmm1 \n"
3788 "punpckhbw %%xmm1,%%xmm1 \n"
3789 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3790 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3791 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003792 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003793 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003794 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003795 "psrlw $0x8,%%xmm1 \n"
3796 "packuswb %%xmm1,%%xmm0 \n"
3797 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003798 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003799 "sub $0x4,%2 \n"
3800 "movdqa %%xmm0,(%0,%1,1) \n"
3801 "lea 0x10(%0),%0 \n"
3802 "jg 1b \n"
3803 : "+r"(src_argb), // %0
3804 "+r"(dst_argb), // %1
3805 "+r"(width) // %2
3806 :
3807 : "memory", "cc"
3808#if defined(__SSE2__)
3809 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3810#endif
3811 );
3812}
fbarchard@google.com1d160cb2012-11-28 20:02:55 +00003813#endif // HAS_ARGBATTENUATEROW_SSE2
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003814
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003815#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003816// Shuffle table duplicating alpha
3817CONST uvec8 kShuffleAlpha0 = {
3818 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3819};
3820CONST uvec8 kShuffleAlpha1 = {
3821 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3822 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3823};
3824// Attenuate 4 pixels at a time.
3825// aligned to 16 bytes
3826void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3827 asm volatile (
3828 "sub %0,%1 \n"
3829 "pcmpeqb %%xmm3,%%xmm3 \n"
3830 "pslld $0x18,%%xmm3 \n"
3831 "movdqa %3,%%xmm4 \n"
3832 "movdqa %4,%%xmm5 \n"
3833
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003834 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003835 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003836 "1: \n"
3837 "movdqa (%0),%%xmm0 \n"
3838 "pshufb %%xmm4,%%xmm0 \n"
3839 "movdqa (%0),%%xmm1 \n"
3840 "punpcklbw %%xmm1,%%xmm1 \n"
3841 "pmulhuw %%xmm1,%%xmm0 \n"
3842 "movdqa (%0),%%xmm1 \n"
3843 "pshufb %%xmm5,%%xmm1 \n"
3844 "movdqa (%0),%%xmm2 \n"
3845 "punpckhbw %%xmm2,%%xmm2 \n"
3846 "pmulhuw %%xmm2,%%xmm1 \n"
3847 "movdqa (%0),%%xmm2 \n"
3848 "pand %%xmm3,%%xmm2 \n"
3849 "psrlw $0x8,%%xmm0 \n"
3850 "psrlw $0x8,%%xmm1 \n"
3851 "packuswb %%xmm1,%%xmm0 \n"
3852 "por %%xmm2,%%xmm0 \n"
3853 "sub $0x4,%2 \n"
3854 "movdqa %%xmm0,(%0,%1,1) \n"
3855 "lea 0x10(%0),%0 \n"
3856 "jg 1b \n"
3857 : "+r"(src_argb), // %0
3858 "+r"(dst_argb), // %1
3859 "+r"(width) // %2
3860 : "m"(kShuffleAlpha0), // %3
3861 "m"(kShuffleAlpha1) // %4
3862 : "memory", "cc"
3863#if defined(__SSE2__)
3864 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3865#endif
3866 );
3867}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003868#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003869
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003870#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003871// Unattenuate 4 pixels at a time.
3872// aligned to 16 bytes
3873void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3874 int width) {
3875 uintptr_t alpha = 0;
3876 asm volatile (
3877 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003878
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003879 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003880 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003881 "1: \n"
3882 "movdqa (%0),%%xmm0 \n"
3883 "movzb 0x3(%0),%3 \n"
3884 "punpcklbw %%xmm0,%%xmm0 \n"
3885 "movd 0x0(%4,%3,4),%%xmm2 \n"
3886 "movzb 0x7(%0),%3 \n"
3887 "movd 0x0(%4,%3,4),%%xmm3 \n"
fbarchard@google.com787f8272013-02-21 00:08:56 +00003888 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3889 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003890 "movlhps %%xmm3,%%xmm2 \n"
3891 "pmulhuw %%xmm2,%%xmm0 \n"
3892 "movdqa (%0),%%xmm1 \n"
3893 "movzb 0xb(%0),%3 \n"
3894 "punpckhbw %%xmm1,%%xmm1 \n"
3895 "movd 0x0(%4,%3,4),%%xmm2 \n"
3896 "movzb 0xf(%0),%3 \n"
3897 "movd 0x0(%4,%3,4),%%xmm3 \n"
fbarchard@google.com787f8272013-02-21 00:08:56 +00003898 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3899 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003900 "movlhps %%xmm3,%%xmm2 \n"
3901 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003902 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003903 "sub $0x4,%2 \n"
3904 "movdqa %%xmm0,(%0,%1,1) \n"
3905 "lea 0x10(%0),%0 \n"
3906 "jg 1b \n"
3907 : "+r"(src_argb), // %0
3908 "+r"(dst_argb), // %1
3909 "+r"(width), // %2
3910 "+r"(alpha) // %3
3911 : "r"(fixed_invtbl8) // %4
3912 : "memory", "cc"
3913#if defined(__SSE2__)
3914 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3915#endif
3916 );
3917}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003918#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003919
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003920#ifdef HAS_ARGBGRAYROW_SSSE3
3921// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003922void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003923 asm volatile (
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003924 "movdqa %3,%%xmm4 \n"
fbarchard@google.com050b39a2013-04-01 20:07:14 +00003925 "movdqa %4,%%xmm5 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003926 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003927
3928 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003929 ".p2align 4 \n"
3930 "1: \n"
3931 "movdqa (%0),%%xmm0 \n"
3932 "movdqa 0x10(%0),%%xmm1 \n"
3933 "pmaddubsw %%xmm4,%%xmm0 \n"
3934 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003935 "phaddw %%xmm1,%%xmm0 \n"
fbarchard@google.com050b39a2013-04-01 20:07:14 +00003936 "paddw %%xmm5,%%xmm0 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003937 "psrlw $0x7,%%xmm0 \n"
3938 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003939 "movdqa (%0),%%xmm2 \n"
3940 "movdqa 0x10(%0),%%xmm3 \n"
3941 "psrld $0x18,%%xmm2 \n"
3942 "psrld $0x18,%%xmm3 \n"
3943 "packuswb %%xmm3,%%xmm2 \n"
3944 "packuswb %%xmm2,%%xmm2 \n"
3945 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003946 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003947 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003948 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003949 "punpcklwd %%xmm3,%%xmm0 \n"
3950 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003951 "sub $0x8,%2 \n"
3952 "movdqa %%xmm0,(%0,%1,1) \n"
3953 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003954 "lea 0x20(%0),%0 \n"
3955 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003956 : "+r"(src_argb), // %0
3957 "+r"(dst_argb), // %1
3958 "+r"(width) // %2
fbarchard@google.com050b39a2013-04-01 20:07:14 +00003959 : "m"(kARGBToYJ), // %3
3960 "m"(kAddYJ64) // %4
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003961 : "memory", "cc"
3962#if defined(__SSE2__)
fbarchard@google.com050b39a2013-04-01 20:07:14 +00003963 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003964#endif
3965 );
3966}
3967#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003968
3969#ifdef HAS_ARGBSEPIAROW_SSSE3
3970// b = (r * 35 + g * 68 + b * 17) >> 7
3971// g = (r * 45 + g * 88 + b * 22) >> 7
3972// r = (r * 50 + g * 98 + b * 24) >> 7
3973// Constant for ARGB color to sepia tone
3974CONST vec8 kARGBToSepiaB = {
3975 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3976};
3977
3978CONST vec8 kARGBToSepiaG = {
3979 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3980};
3981
3982CONST vec8 kARGBToSepiaR = {
3983 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3984};
3985
fbarchard@google.come442dc42012-06-18 17:37:09 +00003986// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003987void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3988 asm volatile (
3989 "movdqa %2,%%xmm2 \n"
3990 "movdqa %3,%%xmm3 \n"
3991 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003992
3993 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003994 ".p2align 4 \n"
3995 "1: \n"
3996 "movdqa (%0),%%xmm0 \n"
3997 "movdqa 0x10(%0),%%xmm6 \n"
3998 "pmaddubsw %%xmm2,%%xmm0 \n"
3999 "pmaddubsw %%xmm2,%%xmm6 \n"
4000 "phaddw %%xmm6,%%xmm0 \n"
4001 "psrlw $0x7,%%xmm0 \n"
4002 "packuswb %%xmm0,%%xmm0 \n"
4003 "movdqa (%0),%%xmm5 \n"
4004 "movdqa 0x10(%0),%%xmm1 \n"
4005 "pmaddubsw %%xmm3,%%xmm5 \n"
4006 "pmaddubsw %%xmm3,%%xmm1 \n"
4007 "phaddw %%xmm1,%%xmm5 \n"
4008 "psrlw $0x7,%%xmm5 \n"
4009 "packuswb %%xmm5,%%xmm5 \n"
4010 "punpcklbw %%xmm5,%%xmm0 \n"
4011 "movdqa (%0),%%xmm5 \n"
4012 "movdqa 0x10(%0),%%xmm1 \n"
4013 "pmaddubsw %%xmm4,%%xmm5 \n"
4014 "pmaddubsw %%xmm4,%%xmm1 \n"
4015 "phaddw %%xmm1,%%xmm5 \n"
4016 "psrlw $0x7,%%xmm5 \n"
4017 "packuswb %%xmm5,%%xmm5 \n"
4018 "movdqa (%0),%%xmm6 \n"
4019 "movdqa 0x10(%0),%%xmm1 \n"
4020 "psrld $0x18,%%xmm6 \n"
4021 "psrld $0x18,%%xmm1 \n"
4022 "packuswb %%xmm1,%%xmm6 \n"
4023 "packuswb %%xmm6,%%xmm6 \n"
4024 "punpcklbw %%xmm6,%%xmm5 \n"
4025 "movdqa %%xmm0,%%xmm1 \n"
4026 "punpcklwd %%xmm5,%%xmm0 \n"
4027 "punpckhwd %%xmm5,%%xmm1 \n"
4028 "sub $0x8,%1 \n"
4029 "movdqa %%xmm0,(%0) \n"
4030 "movdqa %%xmm1,0x10(%0) \n"
4031 "lea 0x20(%0),%0 \n"
4032 "jg 1b \n"
4033 : "+r"(dst_argb), // %0
4034 "+r"(width) // %1
4035 : "m"(kARGBToSepiaB), // %2
4036 "m"(kARGBToSepiaG), // %3
4037 "m"(kARGBToSepiaR) // %4
4038 : "memory", "cc"
4039#if defined(__SSE2__)
4040 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4041#endif
4042 );
4043}
4044#endif // HAS_ARGBSEPIAROW_SSSE3
4045
fbarchard@google.come442dc42012-06-18 17:37:09 +00004046#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
4048// Same as Sepia except matrix is provided.
4049void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
4050 int width) {
4051 asm volatile (
4052 "movd (%2),%%xmm2 \n"
4053 "movd 0x4(%2),%%xmm3 \n"
4054 "movd 0x8(%2),%%xmm4 \n"
4055 "pshufd $0x0,%%xmm2,%%xmm2 \n"
4056 "pshufd $0x0,%%xmm3,%%xmm3 \n"
4057 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00004058
4059 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00004060 ".p2align 4 \n"
4061 "1: \n"
4062 "movdqa (%0),%%xmm0 \n"
4063 "movdqa 0x10(%0),%%xmm6 \n"
4064 "pmaddubsw %%xmm2,%%xmm0 \n"
4065 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00004066 "movdqa (%0),%%xmm5 \n"
4067 "movdqa 0x10(%0),%%xmm1 \n"
4068 "pmaddubsw %%xmm3,%%xmm5 \n"
4069 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00004070 "phaddsw %%xmm6,%%xmm0 \n"
4071 "phaddsw %%xmm1,%%xmm5 \n"
4072 "psraw $0x7,%%xmm0 \n"
4073 "psraw $0x7,%%xmm5 \n"
4074 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00004075 "packuswb %%xmm5,%%xmm5 \n"
4076 "punpcklbw %%xmm5,%%xmm0 \n"
4077 "movdqa (%0),%%xmm5 \n"
4078 "movdqa 0x10(%0),%%xmm1 \n"
4079 "pmaddubsw %%xmm4,%%xmm5 \n"
4080 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00004081 "phaddsw %%xmm1,%%xmm5 \n"
4082 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00004083 "packuswb %%xmm5,%%xmm5 \n"
4084 "movdqa (%0),%%xmm6 \n"
4085 "movdqa 0x10(%0),%%xmm1 \n"
4086 "psrld $0x18,%%xmm6 \n"
4087 "psrld $0x18,%%xmm1 \n"
4088 "packuswb %%xmm1,%%xmm6 \n"
4089 "packuswb %%xmm6,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00004090 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00004091 "punpcklbw %%xmm6,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00004092 "punpcklwd %%xmm5,%%xmm0 \n"
4093 "punpckhwd %%xmm5,%%xmm1 \n"
4094 "sub $0x8,%1 \n"
4095 "movdqa %%xmm0,(%0) \n"
4096 "movdqa %%xmm1,0x10(%0) \n"
4097 "lea 0x20(%0),%0 \n"
4098 "jg 1b \n"
4099 : "+r"(dst_argb), // %0
4100 "+r"(width) // %1
4101 : "r"(matrix_argb) // %2
4102 : "memory", "cc"
4103#if defined(__SSE2__)
4104 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4105#endif
4106 );
4107}
4108#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
4109
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes) per iteration, in place.
// dst_argb is accessed with movdqa and must be aligned to 16 bytes.
//
// For byte v in positions 0..2 of every pixel:
//   v = saturate_u8(((v * scale) >> 16) * interval_size + interval_offset)
// where the multiply/add steps use 16 bit lanes (pmulhuw, pmullw, paddw).
// The alpha byte of the input is ORed back into the result.
//
// pshuflw $0x40 puts the low 16 bits of each parameter in the first three
// lanes of a pixel and the upper 16 bits in the alpha lane. With parameters
// below 65536 the alpha lane therefore computes 0 and the OR restores the
// original alpha.
// NOTE(review): parameters of 65536 or more would disturb alpha - confirm
// callers never pass them.
//
// width: pixel count. The loop always runs at least once and handles
// 4 pixels per iteration until the counter is <= 0.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "movd %2,%%xmm2 \n"
    "movd %3,%%xmm3 \n"
    "movd %4,%%xmm4 \n"
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshufd $0x44,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "pshufd $0x44,%%xmm3,%%xmm3 \n"
    "pshuflw $0x40,%%xmm4,%%xmm4 \n"
    "pshufd $0x44,%%xmm4,%%xmm4 \n"
    // xmm5 = 0 for byte to word unpacking, xmm6 = 0xff000000 alpha mask.
    "pxor %%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "pslld $0x18,%%xmm6 \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqa (%0),%%xmm1 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "pmullw %%xmm3,%%xmm0 \n"
    "movdqa (%0),%%xmm7 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "pand %%xmm6,%%xmm7 \n"
    "paddw %%xmm4,%%xmm0 \n"
    "paddw %%xmm4,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    // movdqa and lea do not modify flags, so jg tests this sub.
    "sub $0x4,%1 \n"
    "movdqa %%xmm0,(%0) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)  // %1
  : "r"(scale),  // %2
    "r"(interval_size),  // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
4162
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// src_argb and dst_argb are accessed with movdqa and must be aligned to
// 16 bytes.
//
// Byte i of every pixel is scaled by byte i of value (byte 0 being the least
// significant byte of the 32 bit value). Both bytes are widened to 16 bits by
// duplication (b * 0x101); the result is the high 16 bits of their unsigned
// product, shifted right by 8 and saturated to 0..255.
//
// width: pixel count. The loop always runs at least once and handles
// 4 pixels per iteration until the counter is <= 0.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd %3,%%xmm2 \n"
    // Turn dst into an offset from src so one pointer is advanced per loop.
    "sub %0,%1 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm2 \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // movdqa and lea do not modify flags, so jg tests this sub.
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%0,%1,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  : "r"(value)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2
4201
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// All loads and stores use movdqu, so no pointer alignment is required.
//
// For every byte: the src_argb0 byte is widened by duplication (b * 0x101),
// the src_argb1 byte is zero extended, and the output is the high 16 bits of
// their unsigned product, saturated to 0..255.
//
// width: pixel count. The loop always runs at least once and handles
// 4 pixels per iteration until the counter is <= 0.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    // Turn src_argb1 and dst_argb into offsets from src_argb0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%0,%1),%%xmm2 \n"
    "movdqu %%xmm0,%%xmm1 \n"
    "movdqu %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpckhbw %%xmm5,%%xmm3 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // movdqu and lea do not modify flags, so jg tests this sub.
    "sub $0x4,%3 \n"
    "movdqu %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
4241
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Every byte is added with unsigned saturation (paddusb), so sums above 255
// clamp to 255. All loads and stores use movdqu, so no pointer alignment is
// required.
//
// width: pixel count. The loop always runs at least once and handles
// 4 pixels per iteration until the counter is <= 0.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // Turn src_argb1 and dst_argb into offsets from src_argb0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%0,%1),%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    // movdqu and lea do not modify flags, so jg tests this sub.
    "sub $0x4,%3 \n"
    "movdqu %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBADDROW_SSE2
4272
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Computes src_argb0 - src_argb1 per byte with unsigned saturation
// (psubusb), so negative differences clamp to 0. All loads and stores use
// movdqu, so no pointer alignment is required.
//
// width: pixel count. The loop always runs at least once and handles
// 4 pixels per iteration until the counter is <= 0.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // Turn src_argb1 and dst_argb into offsets from src_argb0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%0,%1),%%xmm1 \n"
    "psubusb %%xmm1,%%xmm0 \n"
    // movdqu and lea do not modify flags, so jg tests this sub.
    "sub $0x4,%3 \n"
    "movdqu %%xmm0,(%0,%2,1) \n"
    "lea 0x10(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
4303
#ifdef HAS_SOBELXROW_SSSE3
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
//
// For each output i this computes
//   | (y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2]) |
// in 16 bit lanes and saturates the result to 0..255. The sign is the
// opposite of the matrix above, which the absolute value makes irrelevant.
//
// src_y0, src_y1, src_y2: three source rows. Each iteration reads 8 bytes at
//             offset 0 and 8 bytes at offset 2 from every row, i.e. 10 bytes
//             per row for 8 outputs.
// dst_sobelx: receives 8 bytes per iteration.
// width:      output count. The loop always runs at least once and handles
//             8 outputs per iteration until the counter is <= 0.
//
// All accesses use movq, so no pointer alignment is required.
void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    // Turn the other three pointers into offsets from src_y0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "sub %0,%3 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // Row 0: left minus right.
    "movq (%0),%%xmm0 \n"
    "movq 0x2(%0),%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    // Row 1: left minus right (added twice below).
    "movq (%0,%1,1),%%xmm1 \n"
    "movq 0x2(%0,%1,1),%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    // Row 2: left minus right.
    "movq (%0,%2,1),%%xmm2 \n"
    "movq 0x2(%0,%2,1),%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pabsw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    // movq and lea do not modify flags, so jg tests this sub.
    "sub $0x8,%4 \n"
    "movq %%xmm0,(%0,%3,1) \n"
    "lea 0x8(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_y0),  // %0
    "+r"(src_y1),  // %1
    "+r"(src_y2),  // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)  // %4
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELXROW_SSSE3
4357
#ifdef HAS_SOBELYROW_SSSE3
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
//
// For each output i this computes
//   | (y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2]) |
// in 16 bit lanes and saturates the result to 0..255. The sign is the
// opposite of the matrix above, which the absolute value makes irrelevant.
//
// src_y0, src_y1: the two source rows being differenced. Each iteration
//             reads 8 bytes at offsets 0, 1 and 2 from both rows, i.e.
//             10 bytes per row for 8 outputs.
// dst_sobely: receives 8 bytes per iteration.
// width:      output count. The loop always runs at least once and handles
//             8 outputs per iteration until the counter is <= 0.
//
// All accesses use movq, so no pointer alignment is required.
void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
                     uint8* dst_sobely, int width) {
  asm volatile (
    // Turn src_y1 and dst_sobely into offsets from src_y0.
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    // Column 0: row 0 minus row 1.
    "movq (%0),%%xmm0 \n"
    "movq (%0,%1,1),%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    // Column 1: row 0 minus row 1 (added twice below).
    "movq 0x1(%0),%%xmm1 \n"
    "movq 0x1(%0,%1,1),%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    // Column 2: row 0 minus row 1.
    "movq 0x2(%0),%%xmm2 \n"
    "movq 0x2(%0,%1,1),%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pabsw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    // movq and lea do not modify flags, so jg tests this sub.
    "sub $0x8,%3 \n"
    "movq %%xmm0,(%0,%2,1) \n"
    "lea 0x8(%0),%0 \n"
    "jg 1b \n"
  : "+r"(src_y0),  // %0
    "+r"(src_y1),  // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELYROW_SSSE3
4409
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
//
// Sobel is the unsigned saturating byte sum (paddusb) of the two inputs.
//
// src_sobelx, src_sobely: 16 bytes are read from each per iteration.
// dst_argb:   64 bytes (16 pixels) are written per iteration.
// width:      pixel count. The loop always runs at least once and handles
//             16 pixels per iteration until the counter is <= 0.
//
// All loads and stores use movdqa, so all three pointers must be aligned to
// 16 bytes.
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    // Turn src_sobely into an offset from src_sobelx.
    "sub %0,%1 \n"
    // xmm5 = 0xff000000 in every dword: the opaque alpha byte.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"

    // 16 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1,1),%%xmm1 \n"
    "lea 0x10(%0),%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    // Replicate every Sobel byte 4 times, then force alpha to 255.
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm2 \n"
    "punpckhbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm1 \n"
    "punpckhwd %%xmm2,%%xmm2 \n"
    "por %%xmm5,%%xmm1 \n"
    "por %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklwd %%xmm0,%%xmm3 \n"
    "punpckhwd %%xmm0,%%xmm0 \n"
    "por %%xmm5,%%xmm3 \n"
    "por %%xmm5,%%xmm0 \n"
    // movdqa and lea do not modify flags, so jg tests this sub.
    "sub $0x10,%3 \n"
    "movdqa %%xmm1,(%2) \n"
    "movdqa %%xmm2,0x10(%2) \n"
    "movdqa %%xmm3,0x20(%2) \n"
    "movdqa %%xmm0,0x30(%2) \n"
    "lea 0x40(%2),%2 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SOBELROW_SSE2
4462
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
//
// Sobel is the unsigned saturating byte sum (paddusb) of the two inputs.
//
// src_sobelx, src_sobely: 16 bytes are read from each per iteration.
// dst_argb:   64 bytes (16 pixels) are written per iteration.
// width:      pixel count. The loop always runs at least once and handles
//             16 pixels per iteration until the counter is <= 0.
//
// All loads and stores use movdqa, so all three pointers must be aligned to
// 16 bytes.
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    // Turn src_sobely into an offset from src_sobelx.
    "sub %0,%1 \n"
    // xmm5 = all ones: supplies the 0xff alpha bytes.
    "pcmpeqb %%xmm5,%%xmm5 \n"

    // 16 pixel loop.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%1,1),%%xmm1 \n"
    "lea 0x10(%0),%0 \n"
    // xmm2 = Sobel = saturate(X + Y).
    "movdqa %%xmm0,%%xmm2 \n"
    "paddusb %%xmm1,%%xmm2 \n"
    // xmm3 / xmm0 = (X, 255) byte pairs: the R and A bytes.
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "punpckhbw %%xmm5,%%xmm0 \n"
    // xmm4 / xmm1 = (Y, Sobel) byte pairs: the B and G bytes.
    "movdqa %%xmm1,%%xmm4 \n"
    "punpcklbw %%xmm2,%%xmm4 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    // Weave the BG and RA pairs into whole pixels.
    "movdqa %%xmm4,%%xmm6 \n"
    "punpcklwd %%xmm3,%%xmm6 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "punpcklwd %%xmm0,%%xmm7 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    // movdqa and lea do not modify flags, so jg tests this sub.
    "sub $0x10,%3 \n"
    "movdqa %%xmm6,(%2) \n"
    "movdqa %%xmm4,0x10(%2) \n"
    "movdqa %%xmm7,0x20(%2) \n"
    "movdqa %%xmm1,0x30(%2) \n"
    "lea 0x40(%2),%2 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_SOBELXYROW_SSE2
4514
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
//
// row:             source bytes, 4 per pixel.
// cumsum:          output, 4 int32 values (16 bytes) per pixel.
// previous_cumsum: the cumulative sums of the row above, same layout; entry
//                  i is added to the running sum of this row to form
//                  cumsum entry i.
// width:           pixel count.
//
// xmm0 holds the running per-channel sum of this row. The 4 pixel loop is
// used only while at least 4 pixels remain and cumsum is 16 byte aligned;
// everything else goes through the unaligned 1 pixel loop.
// NOTE(review): only cumsum is tested for alignment, yet the 4 pixel loop
// also reads previous_cumsum with movdqa - confirm callers keep both buffers
// equally aligned.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    // Turn previous_cumsum into an offset from cumsum.
    "sub %1,%2 \n"
    "pxor %%xmm0,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "test $0xf,%1 \n"
    "jne 49f \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "40: \n"
    // Widen 16 source bytes into four vectors of 4 int32 (one per pixel).
    "movdqu (%0),%%xmm2 \n"
    "lea 0x10(%0),%0 \n"
    "movdqa %%xmm2,%%xmm4 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "punpckhwd %%xmm1,%%xmm3 \n"
    "punpckhbw %%xmm1,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"
    "punpckhwd %%xmm1,%%xmm5 \n"
    // Advance the running sum pixel by pixel and add the row above.
    "paddd %%xmm2,%%xmm0 \n"
    "movdqa (%1,%2,1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "movdqa 0x10(%1,%2,1),%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"
    "movdqa 0x20(%1,%2,1),%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"
    "movdqa 0x30(%1,%2,1),%%xmm5 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqa %%xmm2,(%1) \n"
    "movdqa %%xmm3,0x10(%1) \n"
    "movdqa %%xmm4,0x20(%1) \n"
    "movdqa %%xmm5,0x30(%1) \n"
    "lea 0x40(%1),%1 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

  "49: \n"
    // The counter is biased by -4 here; +3 leaves remaining pixels - 1.
    "add $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop.
    ".p2align 2 \n"
  "10: \n"
    "movd (%0),%%xmm2 \n"
    "lea 0x4(%0),%0 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu (%1,%2,1),%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"

  "19: \n"
  : "+r"(row),  // %0
    "+r"(cumsum),  // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4595
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Converts cumulative sums into box averages.
//
// For each output pixel i (4 int32 channel sums per pixel) this computes
//   sum = topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// where the "+ width" is an offset of width int32 elements, multiplies it by
// an approximate reciprocal of area (rcpss), converts back to integer with
// cvtps2dq and stores the 4 results as saturated bytes.
//
// topleft, botleft: cumulative sum rows. They are read with movdqa and with
//          SSE memory operands at both offset 0 and offset width, so all four
//          addresses must be 16 byte aligned.
// width:   offset between the two sampled columns, in int32 elements.
// area:    divisor for the average.
// dst:     output, 4 bytes per pixel; written with movdqu / movd, so no
//          alignment is required.
// count:   number of output pixels.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  asm volatile (
    // xmm4 = approximately 1.0f / area in all four lanes.
    "movd %5,%%xmm4 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "rcpss %%xmm4,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"

    // 4 pixel loop.
    ".p2align 2 \n"
  "40: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa 0x20(%0),%%xmm2 \n"
    "movdqa 0x30(%0),%%xmm3 \n"
    "psubd (%0,%4,4),%%xmm0 \n"
    "psubd 0x10(%0,%4,4),%%xmm1 \n"
    "psubd 0x20(%0,%4,4),%%xmm2 \n"
    "psubd 0x30(%0,%4,4),%%xmm3 \n"
    "lea 0x40(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"
    "psubd 0x10(%1),%%xmm1 \n"
    "psubd 0x20(%1),%%xmm2 \n"
    "psubd 0x30(%1),%%xmm3 \n"
    "paddd (%1,%4,4),%%xmm0 \n"
    "paddd 0x10(%1,%4,4),%%xmm1 \n"
    "paddd 0x20(%1,%4,4),%%xmm2 \n"
    "paddd 0x30(%1,%4,4),%%xmm3 \n"
    "lea 0x40(%1),%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

  "49: \n"
    // The counter is biased by -4 here; +3 leaves remaining pixels - 1.
    "add $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop.
    ".p2align 2 \n"
  "10: \n"
    "movdqa (%0),%%xmm0 \n"
    "psubd (%0,%4,4),%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "psubd (%1),%%xmm0 \n"
    "paddd (%1,%4,4),%%xmm0 \n"
    "lea 0x10(%1),%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
  "19: \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),  // %2
    "+rm"(count)  // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)  // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004685
#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used.  movd %%xmm0,%1
//
// src_argb:        base address of the source image.
// src_argb_stride: byte distance between source rows.
// dst_argb:        output row, 4 bytes per pixel.
// src_dudv:        4 floats are read: [0],[1] are the starting source
//                  coordinate and [2],[3] the per-pixel step.
// width:           number of destination pixels.
//
// Coordinates are truncated to integers (cvttps2dq), narrowed to signed
// 16 bit (packssdw) and turned into a byte offset with pmaddwd against the
// word pair (4, src_argb_stride): offset = x * 4 + y * src_argb_stride.
// Because pmaddwd multiplies signed 16 bit words, only the low 16 bits of
// the stride take part. On x86_64 the extracted offset is additionally
// masked with 0x0fffffff.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    "movq (%3),%%xmm2 \n"
    "movq 0x8(%3),%%xmm7 \n"
    // xmm5 low dword = (stride << 16) + 4, i.e. the word pair (4, stride).
    // %1 is free to be reused as a scratch offset register after this.
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"
    "movd %1,%%xmm5 \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    // Set up for 4 pixels per iteration: xmm2 = coordinates of pixels 0,1,
    // xmm3 = coordinates of pixels 2,3, xmm4 = 4 * step.
    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"

    // 4 pixel loop.
    ".p2align 4 \n"
  "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "cvttps2dq %%xmm3,%%xmm1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    // Extract the byte offsets of pixels 0 and 1 into %1 and %5.
#if defined(__x86_64__)
    "movd %%xmm0,%1 \n"
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"
    "shr $32,%5 \n"
    "pshufd $0xEE,%%xmm0,%%xmm0 \n"
#else
    "movd %%xmm0,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
#endif
    "movd (%0,%1,1),%%xmm1 \n"
    "movd (%0,%5,1),%%xmm6 \n"
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1,(%2) \n"
    // Extract the byte offsets of pixels 2 and 3 into %1 and %5.
#if defined(__x86_64__)
    "movd %%xmm0,%1 \n"
    "mov %1,%5 \n"
    "and $0x0fffffff,%1 \n"
    "shr $32,%5 \n"
#else
    "movd %%xmm0,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%5 \n"
#endif
    "movd (%0,%1,1),%%xmm0 \n"
    "movd (%0,%5,1),%%xmm6 \n"
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    // movq and lea do not modify flags, so jge tests this sub.
    "sub $0x4,%4 \n"
    "movq %%xmm0,0x08(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jge 40b \n"

  "49: \n"
    // The counter is biased by -4 here; +3 leaves remaining pixels - 1.
    "add $0x3,%4 \n"
    "jl 19f \n"

    // 1 pixel loop.
    ".p2align 4 \n"
  "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%1 \n"
#if defined(__x86_64__)
    "and $0x0fffffff,%1 \n"
#endif
    "movd (%0,%1,1),%%xmm0 \n"
    // movd and lea do not modify flags, so jge tests this sub.
    "sub $0x1,%4 \n"
    "movd %%xmm0,(%2) \n"
    "lea 0x4(%2),%2 \n"
    "jge 10b \n"
  "19: \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),  // %4
    "+r"(temp)  // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
4795
fbarchard@google.comb9114282013-05-30 23:42:27 +00004796// Bilinear filter 16x2 -> 16x1
4797void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4798 ptrdiff_t src_stride, int dst_width,
4799 int source_y_fraction) {
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004800 asm volatile (
4801 "sub %1,%0 \n"
4802 "shr %3 \n"
4803 "cmp $0x0,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004804 "je 100f \n"
4805 "cmp $0x20,%3 \n"
4806 "je 75f \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004807 "cmp $0x40,%3 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004808 "je 50f \n"
4809 "cmp $0x60,%3 \n"
4810 "je 25f \n"
4811
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004812 "movd %3,%%xmm0 \n"
4813 "neg %3 \n"
4814 "add $0x80,%3 \n"
4815 "movd %3,%%xmm5 \n"
4816 "punpcklbw %%xmm0,%%xmm5 \n"
4817 "punpcklwd %%xmm5,%%xmm5 \n"
4818 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004819
4820 // General purpose row blend.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004821 ".p2align 4 \n"
4822 "1: \n"
4823 "movdqa (%1),%%xmm0 \n"
4824 "movdqa (%1,%4,1),%%xmm2 \n"
4825 "movdqa %%xmm0,%%xmm1 \n"
4826 "punpcklbw %%xmm2,%%xmm0 \n"
4827 "punpckhbw %%xmm2,%%xmm1 \n"
4828 "pmaddubsw %%xmm5,%%xmm0 \n"
4829 "pmaddubsw %%xmm5,%%xmm1 \n"
4830 "psrlw $0x7,%%xmm0 \n"
4831 "psrlw $0x7,%%xmm1 \n"
4832 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00004833 "sub $0x10,%2 \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004834 "movdqa %%xmm0,(%1,%0,1) \n"
4835 "lea 0x10(%1),%1 \n"
4836 "jg 1b \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004837 "jmp 99f \n"
4838
4839 // Blend 25 / 75.
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004840 ".p2align 4 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004841 "25: \n"
4842 "movdqa (%1),%%xmm0 \n"
4843 "movdqa (%1,%4,1),%%xmm1 \n"
4844 "pavgb %%xmm1,%%xmm0 \n"
4845 "pavgb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00004846 "sub $0x10,%2 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004847 "movdqa %%xmm0,(%1,%0,1) \n"
4848 "lea 0x10(%1),%1 \n"
4849 "jg 25b \n"
4850 "jmp 99f \n"
4851
4852 // Blend 50 / 50.
4853 ".p2align 4 \n"
4854 "50: \n"
4855 "movdqa (%1),%%xmm0 \n"
4856 "movdqa (%1,%4,1),%%xmm1 \n"
4857 "pavgb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00004858 "sub $0x10,%2 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004859 "movdqa %%xmm0,(%1,%0,1) \n"
4860 "lea 0x10(%1),%1 \n"
4861 "jg 50b \n"
4862 "jmp 99f \n"
4863
4864 // Blend 75 / 25.
4865 ".p2align 4 \n"
4866 "75: \n"
4867 "movdqa (%1),%%xmm1 \n"
4868 "movdqa (%1,%4,1),%%xmm0 \n"
4869 "pavgb %%xmm1,%%xmm0 \n"
4870 "pavgb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00004871 "sub $0x10,%2 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004872 "movdqa %%xmm0,(%1,%0,1) \n"
4873 "lea 0x10(%1),%1 \n"
4874 "jg 75b \n"
4875 "jmp 99f \n"
4876
4877 // Blend 100 / 0 - Copy row unchanged.
4878 ".p2align 4 \n"
4879 "100: \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004880 "movdqa (%1),%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00004881 "sub $0x10,%2 \n"
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004882 "movdqa %%xmm0,(%1,%0,1) \n"
4883 "lea 0x10(%1),%1 \n"
fbarchard@google.comb5491752012-11-20 09:44:46 +00004884 "jg 100b \n"
4885
fbarchard@google.comb5491752012-11-20 09:44:46 +00004886 "99: \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00004887 : "+r"(dst_ptr), // %0
4888 "+r"(src_ptr), // %1
fbarchard@google.comb5491752012-11-20 09:44:46 +00004889 "+r"(dst_width), // %2
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004890 "+r"(source_y_fraction) // %3
4891 : "r"(static_cast<intptr_t>(src_stride)) // %4
4892 : "memory", "cc"
4893#if defined(__SSE2__)
4894 , "xmm0", "xmm1", "xmm2", "xmm5"
4895#endif
4896 );
4897}
4898
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
//
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr, 16 bytes per iteration, using SSE2 only.
//
// dst_ptr, src_ptr:  accessed with movdqa, so dst_ptr, src_ptr and
//                    src_ptr + src_stride must all be 16 byte aligned.
// src_stride:        byte offset from the first source row to the second.
// dst_width:         decremented by 16 for every 16 bytes stored. Each loop
//                    runs at least once and continues until the counter
//                    is <= 0.
// source_y_fraction: weight of the second row. It is halved first; halved
//                    values 0, 0x20, 0x40 and 0x60 take the copy / pavgb
//                    fast paths. Any other halved value f uses the general
//                    path: row0 + pmulhw(2 * (row1 - row0), w) in 16 bit
//                    lanes, where every word of w is (f << 8) | (128 - f).
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    // Turn dst_ptr into an offset from src_ptr.
    "sub %1,%0 \n"
    "shr %3 \n"
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x20,%3 \n"
    "je 75f \n"
    "cmp $0x40,%3 \n"
    "je 50f \n"
    "cmp $0x60,%3 \n"
    "je 25f \n"

    // xmm5 = weight words, xmm4 = 0 for byte to word unpacking.
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x80,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"

    // General purpose row blend.
    ".p2align 4 \n"
  "1: \n"
    "movdqa (%1),%%xmm0 \n"
    "movdqa (%1,%4,1),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm2 \n"
    "punpckhbw %%xmm4,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm0 \n"
    "punpckhbw %%xmm4,%%xmm1 \n"
    "psubw %%xmm0,%%xmm2 \n"
    "psubw %%xmm1,%%xmm3 \n"
    "paddw %%xmm2,%%xmm2 \n"
    "paddw %%xmm3,%%xmm3 \n"
    "pmulhw %%xmm5,%%xmm2 \n"
    "pmulhw %%xmm5,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // movdqa and lea do not modify flags, so jg tests this sub.
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 25 / 75.
    ".p2align 4 \n"
  "25: \n"
    "movdqa (%1),%%xmm0 \n"
    "movdqa (%1,%4,1),%%xmm1 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 25b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    ".p2align 4 \n"
  "50: \n"
    "movdqa (%1),%%xmm0 \n"
    "movdqa (%1,%4,1),%%xmm1 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 75 / 25.
    ".p2align 4 \n"
  "75: \n"
    "movdqa (%1),%%xmm1 \n"
    "movdqa (%1,%4,1),%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 75b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    ".p2align 4 \n"
  "100: \n"
    "movdqa (%1),%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 100b \n"

  "99: \n"
  : "+r"(dst_ptr),  // %0
    "+r"(src_ptr),  // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_INTERPOLATEROW_SSE2
fbarchard@google.comaf137b62013-02-05 22:42:56 +00005011
fbarchard@google.comb9114282013-05-30 23:42:27 +00005012// Bilinear filter 16x2 -> 16x1
5013void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5014 ptrdiff_t src_stride, int dst_width,
5015 int source_y_fraction) {
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005016 asm volatile (
5017 "sub %1,%0 \n"
5018 "shr %3 \n"
5019 "cmp $0x0,%3 \n"
5020 "je 100f \n"
5021 "cmp $0x20,%3 \n"
5022 "je 75f \n"
5023 "cmp $0x40,%3 \n"
5024 "je 50f \n"
5025 "cmp $0x60,%3 \n"
5026 "je 25f \n"
5027
5028 "movd %3,%%xmm0 \n"
5029 "neg %3 \n"
5030 "add $0x80,%3 \n"
5031 "movd %3,%%xmm5 \n"
5032 "punpcklbw %%xmm0,%%xmm5 \n"
5033 "punpcklwd %%xmm5,%%xmm5 \n"
5034 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5035
5036 // General purpose row blend.
5037 ".p2align 4 \n"
5038 "1: \n"
5039 "movdqu (%1),%%xmm0 \n"
5040 "movdqu (%1,%4,1),%%xmm2 \n"
5041 "movdqu %%xmm0,%%xmm1 \n"
5042 "punpcklbw %%xmm2,%%xmm0 \n"
5043 "punpckhbw %%xmm2,%%xmm1 \n"
5044 "pmaddubsw %%xmm5,%%xmm0 \n"
5045 "pmaddubsw %%xmm5,%%xmm1 \n"
5046 "psrlw $0x7,%%xmm0 \n"
5047 "psrlw $0x7,%%xmm1 \n"
5048 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005049 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005050 "movdqu %%xmm0,(%1,%0,1) \n"
5051 "lea 0x10(%1),%1 \n"
5052 "jg 1b \n"
5053 "jmp 99f \n"
5054
5055 // Blend 25 / 75.
5056 ".p2align 4 \n"
5057 "25: \n"
5058 "movdqu (%1),%%xmm0 \n"
5059 "movdqu (%1,%4,1),%%xmm1 \n"
5060 "pavgb %%xmm1,%%xmm0 \n"
5061 "pavgb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005062 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005063 "movdqu %%xmm0,(%1,%0,1) \n"
5064 "lea 0x10(%1),%1 \n"
5065 "jg 25b \n"
5066 "jmp 99f \n"
5067
5068 // Blend 50 / 50.
5069 ".p2align 4 \n"
5070 "50: \n"
5071 "movdqu (%1),%%xmm0 \n"
5072 "movdqu (%1,%4,1),%%xmm1 \n"
5073 "pavgb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005074 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005075 "movdqu %%xmm0,(%1,%0,1) \n"
5076 "lea 0x10(%1),%1 \n"
5077 "jg 50b \n"
5078 "jmp 99f \n"
5079
5080 // Blend 75 / 25.
5081 ".p2align 4 \n"
5082 "75: \n"
5083 "movdqu (%1),%%xmm1 \n"
5084 "movdqu (%1,%4,1),%%xmm0 \n"
5085 "pavgb %%xmm1,%%xmm0 \n"
5086 "pavgb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005087 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005088 "movdqu %%xmm0,(%1,%0,1) \n"
5089 "lea 0x10(%1),%1 \n"
5090 "jg 75b \n"
5091 "jmp 99f \n"
5092
5093 // Blend 100 / 0 - Copy row unchanged.
5094 ".p2align 4 \n"
5095 "100: \n"
5096 "movdqu (%1),%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005097 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005098 "movdqu %%xmm0,(%1,%0,1) \n"
5099 "lea 0x10(%1),%1 \n"
5100 "jg 100b \n"
5101
5102 "99: \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005103 : "+r"(dst_ptr), // %0
5104 "+r"(src_ptr), // %1
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005105 "+r"(dst_width), // %2
5106 "+r"(source_y_fraction) // %3
5107 : "r"(static_cast<intptr_t>(src_stride)) // %4
5108 : "memory", "cc"
5109#if defined(__SSE2__)
5110 , "xmm0", "xmm1", "xmm2", "xmm5"
5111#endif
5112 );
5113}
5114
fbarchard@google.com97c96262013-06-03 15:09:58 +00005115#ifdef HAS_INTERPOLATEROW_SSE2
fbarchard@google.comb9114282013-05-30 23:42:27 +00005116// Bilinear filter 16x2 -> 16x1
5117void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
5118 ptrdiff_t src_stride, int dst_width,
5119 int source_y_fraction) {
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005120 asm volatile (
5121 "sub %1,%0 \n"
5122 "shr %3 \n"
5123 "cmp $0x0,%3 \n"
5124 "je 100f \n"
5125 "cmp $0x20,%3 \n"
5126 "je 75f \n"
5127 "cmp $0x40,%3 \n"
5128 "je 50f \n"
5129 "cmp $0x60,%3 \n"
5130 "je 25f \n"
5131
5132 "movd %3,%%xmm0 \n"
5133 "neg %3 \n"
5134 "add $0x80,%3 \n"
5135 "movd %3,%%xmm5 \n"
5136 "punpcklbw %%xmm0,%%xmm5 \n"
5137 "punpcklwd %%xmm5,%%xmm5 \n"
5138 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5139 "pxor %%xmm4,%%xmm4 \n"
5140
5141 // General purpose row blend.
5142 ".p2align 4 \n"
5143 "1: \n"
5144 "movdqu (%1),%%xmm0 \n"
5145 "movdqu (%1,%4,1),%%xmm2 \n"
5146 "movdqu %%xmm0,%%xmm1 \n"
5147 "movdqu %%xmm2,%%xmm3 \n"
5148 "punpcklbw %%xmm4,%%xmm2 \n"
5149 "punpckhbw %%xmm4,%%xmm3 \n"
5150 "punpcklbw %%xmm4,%%xmm0 \n"
5151 "punpckhbw %%xmm4,%%xmm1 \n"
5152 "psubw %%xmm0,%%xmm2 \n"
5153 "psubw %%xmm1,%%xmm3 \n"
5154 "paddw %%xmm2,%%xmm2 \n"
5155 "paddw %%xmm3,%%xmm3 \n"
5156 "pmulhw %%xmm5,%%xmm2 \n"
5157 "pmulhw %%xmm5,%%xmm3 \n"
5158 "paddw %%xmm2,%%xmm0 \n"
5159 "paddw %%xmm3,%%xmm1 \n"
5160 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005161 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005162 "movdqu %%xmm0,(%1,%0,1) \n"
5163 "lea 0x10(%1),%1 \n"
5164 "jg 1b \n"
5165 "jmp 99f \n"
5166
5167 // Blend 25 / 75.
5168 ".p2align 4 \n"
5169 "25: \n"
5170 "movdqu (%1),%%xmm0 \n"
5171 "movdqu (%1,%4,1),%%xmm1 \n"
5172 "pavgb %%xmm1,%%xmm0 \n"
5173 "pavgb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005174 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005175 "movdqu %%xmm0,(%1,%0,1) \n"
5176 "lea 0x10(%1),%1 \n"
5177 "jg 25b \n"
5178 "jmp 99f \n"
5179
5180 // Blend 50 / 50.
5181 ".p2align 4 \n"
5182 "50: \n"
5183 "movdqu (%1),%%xmm0 \n"
5184 "movdqu (%1,%4,1),%%xmm1 \n"
5185 "pavgb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005186 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005187 "movdqu %%xmm0,(%1,%0,1) \n"
5188 "lea 0x10(%1),%1 \n"
5189 "jg 50b \n"
5190 "jmp 99f \n"
5191
5192 // Blend 75 / 25.
5193 ".p2align 4 \n"
5194 "75: \n"
5195 "movdqu (%1),%%xmm1 \n"
5196 "movdqu (%1,%4,1),%%xmm0 \n"
5197 "pavgb %%xmm1,%%xmm0 \n"
5198 "pavgb %%xmm1,%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005199 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005200 "movdqu %%xmm0,(%1,%0,1) \n"
5201 "lea 0x10(%1),%1 \n"
5202 "jg 75b \n"
5203 "jmp 99f \n"
5204
5205 // Blend 100 / 0 - Copy row unchanged.
5206 ".p2align 4 \n"
5207 "100: \n"
5208 "movdqu (%1),%%xmm0 \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005209 "sub $0x10,%2 \n"
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005210 "movdqu %%xmm0,(%1,%0,1) \n"
5211 "lea 0x10(%1),%1 \n"
5212 "jg 100b \n"
5213
5214 "99: \n"
fbarchard@google.comb9114282013-05-30 23:42:27 +00005215 : "+r"(dst_ptr), // %0
5216 "+r"(src_ptr), // %1
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005217 "+r"(dst_width), // %2
5218 "+r"(source_y_fraction) // %3
5219 : "r"(static_cast<intptr_t>(src_stride)) // %4
5220 : "memory", "cc"
5221#if defined(__SSE2__)
5222 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
5223#endif
5224 );
5225}
fbarchard@google.com97c96262013-06-03 15:09:58 +00005226#endif // HAS_INTERPOLATEROW_SSE2
fbarchard@google.comcd6056c2013-04-15 03:05:08 +00005227
fbarchard@google.come91bdac2012-10-09 21:09:33 +00005228void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
5229 uint8* dst_uv, int pix) {
5230 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00005231 "sub %0,%1 \n"
5232 ".p2align 4 \n"
5233 "1: \n"
5234 "movdqa (%0),%%xmm0 \n"
5235 "pavgb (%0,%3),%%xmm0 \n"
5236 "sub $0x10,%2 \n"
5237 "movdqa %%xmm0,(%0,%1) \n"
5238 "lea 0x10(%0),%0 \n"
5239 "jg 1b \n"
fbarchard@google.come91bdac2012-10-09 21:09:33 +00005240 : "+r"(src_uv), // %0
5241 "+r"(dst_uv), // %1
5242 "+r"(pix) // %2
5243 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
5244 : "memory", "cc"
5245#if defined(__SSE2__)
5246 , "xmm0"
5247#endif
5248 );
5249}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00005250
5251void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
5252 uint32 selector, int pix) {
5253 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00005254 "movd %3,%%xmm5 \n"
5255 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00005256 ".p2align 4 \n"
5257 "1: \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00005258 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00005259 "movdqa 0x10(%0),%%xmm1 \n"
5260 "lea 0x20(%0),%0 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00005261 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00005262 "pshufb %%xmm5,%%xmm1 \n"
fbarchard@google.coma3be4702013-03-22 05:20:02 +00005263 "punpckldq %%xmm1,%%xmm0 \n"
fbarchard@google.come8df16b2013-03-22 04:47:14 +00005264 "sub $0x8,%2 \n"
5265 "movq %%xmm0,(%1) \n"
5266 "lea 0x8(%1),%1 \n"
fbarchard@google.com10965432013-03-08 23:22:32 +00005267 "jg 1b \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00005268 : "+r"(src_argb), // %0
5269 "+r"(dst_bayer), // %1
5270 "+r"(pix) // %2
5271 : "g"(selector) // %3
5272 : "memory", "cc"
5273#if defined(__SSE2__)
fbarchard@google.come8df16b2013-03-22 04:47:14 +00005274 , "xmm0", "xmm1", "xmm5"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00005275#endif
5276 );
5277}
fbarchard@google.com9de88672012-10-12 06:23:33 +00005278
fbarchard@google.com10965432013-03-08 23:22:32 +00005279// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5280void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5281 const uint8* shuffler, int pix) {
5282 asm volatile (
5283 "movdqa (%3),%%xmm5 \n"
5284 ".p2align 4 \n"
5285 "1: \n"
5286 "movdqa (%0),%%xmm0 \n"
5287 "movdqa 0x10(%0),%%xmm1 \n"
5288 "lea 0x20(%0),%0 \n"
5289 "pshufb %%xmm5,%%xmm0 \n"
5290 "pshufb %%xmm5,%%xmm1 \n"
5291 "sub $0x8,%2 \n"
5292 "movdqa %%xmm0,(%1) \n"
5293 "movdqa %%xmm1,0x10(%1) \n"
5294 "lea 0x20(%1),%1 \n"
5295 "jg 1b \n"
5296 : "+r"(src_argb), // %0
5297 "+r"(dst_argb), // %1
5298 "+r"(pix) // %2
5299 : "r"(shuffler) // %3
5300 : "memory", "cc"
5301#if defined(__SSE2__)
5302 , "xmm0", "xmm1", "xmm5"
5303#endif
5304 );
5305}
5306
5307void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
5308 const uint8* shuffler, int pix) {
5309 asm volatile (
5310 "movdqa (%3),%%xmm5 \n"
5311 ".p2align 4 \n"
5312 "1: \n"
5313 "movdqu (%0),%%xmm0 \n"
5314 "movdqu 0x10(%0),%%xmm1 \n"
5315 "lea 0x20(%0),%0 \n"
5316 "pshufb %%xmm5,%%xmm0 \n"
5317 "pshufb %%xmm5,%%xmm1 \n"
5318 "sub $0x8,%2 \n"
5319 "movdqu %%xmm0,(%1) \n"
5320 "movdqu %%xmm1,0x10(%1) \n"
5321 "lea 0x20(%1),%1 \n"
5322 "jg 1b \n"
5323 : "+r"(src_argb), // %0
5324 "+r"(dst_argb), // %1
5325 "+r"(pix) // %2
5326 : "r"(shuffler) // %3
5327 : "memory", "cc"
5328#if defined(__SSE2__)
5329 , "xmm0", "xmm1", "xmm5"
5330#endif
5331 );
5332}
5333
fbarchard@google.com9de88672012-10-12 06:23:33 +00005334void I422ToYUY2Row_SSE2(const uint8* src_y,
5335 const uint8* src_u,
5336 const uint8* src_v,
5337 uint8* dst_frame, int width) {
5338 asm volatile (
fbarchard@google.com10965432013-03-08 23:22:32 +00005339 "sub %1,%2 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005340 ".p2align 4 \n"
5341 "1: \n"
5342 "movq (%1),%%xmm2 \n"
5343 "movq (%1,%2,1),%%xmm3 \n"
5344 "lea 0x8(%1),%1 \n"
5345 "punpcklbw %%xmm3,%%xmm2 \n"
fbarchard@google.com55c20a82013-04-02 22:03:49 +00005346 "movdqu (%0),%%xmm0 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005347 "lea 0x10(%0),%0 \n"
fbarchard@google.com55c20a82013-04-02 22:03:49 +00005348 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005349 "punpcklbw %%xmm2,%%xmm0 \n"
5350 "punpckhbw %%xmm2,%%xmm1 \n"
fbarchard@google.comf8e90172013-04-02 21:18:12 +00005351 "movdqu %%xmm0,(%3) \n"
5352 "movdqu %%xmm1,0x10(%3) \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005353 "lea 0x20(%3),%3 \n"
5354 "sub $0x10,%4 \n"
5355 "jg 1b \n"
5356 : "+r"(src_y), // %0
5357 "+r"(src_u), // %1
5358 "+r"(src_v), // %2
5359 "+r"(dst_frame), // %3
5360 "+rm"(width) // %4
5361 :
5362 : "memory", "cc"
5363#if defined(__SSE2__)
5364 , "xmm0", "xmm1", "xmm2", "xmm3"
5365#endif
5366 );
5367}
5368
5369void I422ToUYVYRow_SSE2(const uint8* src_y,
5370 const uint8* src_u,
5371 const uint8* src_v,
5372 uint8* dst_frame, int width) {
5373 asm volatile (
5374 "sub %1,%2 \n"
5375 ".p2align 4 \n"
5376 "1: \n"
5377 "movq (%1),%%xmm2 \n"
5378 "movq (%1,%2,1),%%xmm3 \n"
5379 "lea 0x8(%1),%1 \n"
5380 "punpcklbw %%xmm3,%%xmm2 \n"
fbarchard@google.comf8e90172013-04-02 21:18:12 +00005381 "movdqu (%0),%%xmm0 \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005382 "movdqa %%xmm2,%%xmm1 \n"
5383 "lea 0x10(%0),%0 \n"
5384 "punpcklbw %%xmm0,%%xmm1 \n"
5385 "punpckhbw %%xmm0,%%xmm2 \n"
fbarchard@google.comf8e90172013-04-02 21:18:12 +00005386 "movdqu %%xmm1,(%3) \n"
5387 "movdqu %%xmm2,0x10(%3) \n"
fbarchard@google.com9de88672012-10-12 06:23:33 +00005388 "lea 0x20(%3),%3 \n"
5389 "sub $0x10,%4 \n"
5390 "jg 1b \n"
5391 : "+r"(src_y), // %0
5392 "+r"(src_u), // %1
5393 "+r"(src_v), // %2
5394 "+r"(dst_frame), // %3
5395 "+rm"(width) // %4
5396 :
5397 : "memory", "cc"
5398#if defined(__SSE2__)
5399 , "xmm0", "xmm1", "xmm2", "xmm3"
5400#endif
5401 );
5402}
5403
fbarchard@google.com2d11d432012-02-16 02:50:39 +00005404#endif // defined(__x86_64__) || defined(__i386__)
5405
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00005406#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00005407} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00005408} // namespace libyuv
5409#endif