blob: 5e26005b7ff4d6dc98fbb573c3a1b180469e28e8 [file] [log] [blame]
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000012
frkoenig@google.comc82af4a2011-11-11 00:54:34 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
16namespace libyuv {
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000017extern "C" {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000018#endif
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +000019
fbarchard@google.com2d11d432012-02-16 02:50:39 +000020// This module is for GCC x86 and x64
fbarchard@google.comd2f44132012-04-04 21:53:27 +000021#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
fbarchard@google.com2d11d432012-02-16 02:50:39 +000022
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
// CONST is the storage qualifier placed in front of every constant table in
// this file: empty on Apple targets, "static const" everywhere else.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
30
fbarchard@google.com714050a2012-02-17 22:59:56 +000031#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000038CONST vec8 kARGBToU = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000039 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
fbarchard@google.com714050a2012-02-17 22:59:56 +000042CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
fbarchard@google.com4de0c432012-10-11 01:25:46 +000072// Constants for RGBA.
73CONST vec8 kRGBAToY = {
74 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
75};
76
77CONST vec8 kRGBAToU = {
78 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
79};
80
81CONST vec8 kRGBAToV = {
82 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
83};
84
fbarchard@google.com714050a2012-02-17 22:59:56 +000085CONST uvec8 kAddY16 = {
86 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
fbarchard@google.comb6149762011-11-07 21:58:52 +000087};
fbarchard@google.com2430e042011-11-11 21:57:06 +000088
fbarchard@google.com373cdbd2011-12-14 21:10:07 +000089CONST uvec8 kAddUV128 = {
fbarchard@google.comb6149762011-11-07 21:58:52 +000090 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
92};
fbarchard@google.com228bdc22011-11-15 21:58:26 +000093
fbarchard@google.comba1f5262012-01-12 19:22:41 +000094// Shuffle table for converting RGB24 to ARGB.
95CONST uvec8 kShuffleMaskRGB24ToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +000096 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
97};
98
99// Shuffle table for converting RAW to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000100CONST uvec8 kShuffleMaskRAWToARGB = {
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000101 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
102};
103
fbarchard@google.comb6149762011-11-07 21:58:52 +0000104// Shuffle table for converting ABGR to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000105CONST uvec8 kShuffleMaskABGRToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000106 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
107};
108
109// Shuffle table for converting BGRA to ARGB.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +0000110CONST uvec8 kShuffleMaskBGRAToARGB = {
fbarchard@google.comb6149762011-11-07 21:58:52 +0000111 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
112};
113
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000114// Shuffle table for converting RGBA to ARGB.
115CONST uvec8 kShuffleMaskRGBAToARGB = {
116 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
117};
118
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000119// Shuffle table for converting ARGB to RGBA.
120CONST uvec8 kShuffleMaskARGBToRGBA = {
121 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
122};
123
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000124// Shuffle table for converting ARGB to RGB24.
125CONST uvec8 kShuffleMaskARGBToRGB24 = {
126 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
127};
128
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000129// Shuffle table for converting ARGB to RAW.
130CONST uvec8 kShuffleMaskARGBToRAW = {
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +0000131 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000132};
133
fbarchard@google.com4de0c432012-10-11 01:25:46 +0000134// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
fbarchard@google.com8d37dd52012-10-11 00:07:30 +0000135CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
136 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
137};
138
139// Shuffle table for converting ARGB to RAW.
140CONST uvec8 kShuffleMaskARGBToRAW_0 = {
141 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
142};
143
fbarchard@google.comb6149762011-11-07 21:58:52 +0000144void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000145 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000146 "pcmpeqb %%xmm5,%%xmm5 \n"
147 "pslld $0x18,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000148 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000149 "1: \n"
150 "movq (%0),%%xmm0 \n"
151 "lea 0x8(%0),%0 \n"
152 "punpcklbw %%xmm0,%%xmm0 \n"
153 "movdqa %%xmm0,%%xmm1 \n"
154 "punpcklwd %%xmm0,%%xmm0 \n"
155 "punpckhwd %%xmm1,%%xmm1 \n"
156 "por %%xmm5,%%xmm0 \n"
157 "por %%xmm5,%%xmm1 \n"
158 "movdqa %%xmm0,(%1) \n"
159 "movdqa %%xmm1,0x10(%1) \n"
160 "lea 0x20(%1),%1 \n"
161 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000162 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000163 : "+r"(src_y), // %0
164 "+r"(dst_argb), // %1
165 "+r"(pix) // %2
166 :
167 : "memory", "cc"
168#if defined(__SSE2__)
169 , "xmm0", "xmm1", "xmm5"
170#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000171 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000172}
fbarchard@google.comb6149762011-11-07 21:58:52 +0000173
fbarchard@google.com00b69a22012-11-02 06:03:28 +0000174void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
175 int pix) {
176 asm volatile (
177 "pcmpeqb %%xmm5,%%xmm5 \n"
178 "pslld $0x18,%%xmm5 \n"
179 ".p2align 4 \n"
180 "1: \n"
181 "movq (%0),%%xmm0 \n"
182 "lea 0x8(%0),%0 \n"
183 "punpcklbw %%xmm0,%%xmm0 \n"
184 "movdqa %%xmm0,%%xmm1 \n"
185 "punpcklwd %%xmm0,%%xmm0 \n"
186 "punpckhwd %%xmm1,%%xmm1 \n"
187 "por %%xmm5,%%xmm0 \n"
188 "por %%xmm5,%%xmm1 \n"
189 "movdqu %%xmm0,(%1) \n"
190 "movdqu %%xmm1,0x10(%1) \n"
191 "lea 0x20(%1),%1 \n"
192 "sub $0x8,%2 \n"
193 "jg 1b \n"
194 : "+r"(src_y), // %0
195 "+r"(dst_argb), // %1
196 "+r"(pix) // %2
197 :
198 : "memory", "cc"
199#if defined(__SSE2__)
200 , "xmm0", "xmm1", "xmm5"
201#endif
202 );
203}
204
fbarchard@google.comb6149762011-11-07 21:58:52 +0000205void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000206 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000207 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000208 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000209 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000210 "1: \n"
211 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000212 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000213 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000214 "movdqa %%xmm0,(%0,%1,1) \n"
215 "lea 0x10(%0),%0 \n"
216 "jg 1b \n"
217
fbarchard@google.comb6149762011-11-07 21:58:52 +0000218 : "+r"(src_abgr), // %0
219 "+r"(dst_argb), // %1
220 "+r"(pix) // %2
221 : "m"(kShuffleMaskABGRToARGB) // %3
222 : "memory", "cc"
223#if defined(__SSE2__)
224 , "xmm0", "xmm5"
fbarchard@google.com585a1262011-10-28 23:51:08 +0000225#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000226 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000227}
228
229void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000230 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000231 "movdqa %3,%%xmm5 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000232 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000233 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000234 "1: \n"
235 "movdqa (%0),%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000236 "pshufb %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000237 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000238 "movdqa %%xmm0,(%0,%1,1) \n"
239 "lea 0x10(%0),%0 \n"
240 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000241 : "+r"(src_bgra), // %0
242 "+r"(dst_argb), // %1
243 "+r"(pix) // %2
244 : "m"(kShuffleMaskBGRAToARGB) // %3
245 : "memory", "cc"
246#if defined(__SSE2__)
247 , "xmm0", "xmm5"
248#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000249 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000250}
251
fbarchard@google.comb8eabfe2012-09-14 06:59:31 +0000252void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
253 asm volatile (
254 "movdqa %3,%%xmm5 \n"
255 "sub %0,%1 \n"
256 ".p2align 4 \n"
257 "1: \n"
258 "movdqa (%0),%%xmm0 \n"
259 "pshufb %%xmm5,%%xmm0 \n"
260 "sub $0x4,%2 \n"
261 "movdqa %%xmm0,(%0,%1,1) \n"
262 "lea 0x10(%0),%0 \n"
263 "jg 1b \n"
264
265 : "+r"(src_rgba), // %0
266 "+r"(dst_argb), // %1
267 "+r"(pix) // %2
268 : "m"(kShuffleMaskRGBAToARGB) // %3
269 : "memory", "cc"
270#if defined(__SSE2__)
271 , "xmm0", "xmm5"
272#endif
273 );
274}
275
fbarchard@google.coma2cc3412012-09-14 18:09:41 +0000276void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
277 asm volatile (
278 "movdqa %3,%%xmm5 \n"
279 "sub %0,%1 \n"
280 ".p2align 4 \n"
281 "1: \n"
282 "movdqa (%0),%%xmm0 \n"
283 "pshufb %%xmm5,%%xmm0 \n"
284 "sub $0x4,%2 \n"
285 "movdqa %%xmm0,(%0,%1,1) \n"
286 "lea 0x10(%0),%0 \n"
287 "jg 1b \n"
288
289 : "+r"(src_argb), // %0
290 "+r"(dst_rgba), // %1
291 "+r"(pix) // %2
292 : "m"(kShuffleMaskARGBToRGBA) // %3
293 : "memory", "cc"
294#if defined(__SSE2__)
295 , "xmm0", "xmm5"
296#endif
297 );
298}
299
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000300void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000301 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000302 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
303 "pslld $0x18,%%xmm5 \n"
304 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000305 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000306 "1: \n"
307 "movdqu (%0),%%xmm0 \n"
308 "movdqu 0x10(%0),%%xmm1 \n"
309 "movdqu 0x20(%0),%%xmm3 \n"
310 "lea 0x30(%0),%0 \n"
311 "movdqa %%xmm3,%%xmm2 \n"
312 "palignr $0x8,%%xmm1,%%xmm2 \n"
313 "pshufb %%xmm4,%%xmm2 \n"
314 "por %%xmm5,%%xmm2 \n"
315 "palignr $0xc,%%xmm0,%%xmm1 \n"
316 "pshufb %%xmm4,%%xmm0 \n"
317 "movdqa %%xmm2,0x20(%1) \n"
318 "por %%xmm5,%%xmm0 \n"
319 "pshufb %%xmm4,%%xmm1 \n"
320 "movdqa %%xmm0,(%1) \n"
321 "por %%xmm5,%%xmm1 \n"
322 "palignr $0x4,%%xmm3,%%xmm3 \n"
323 "pshufb %%xmm4,%%xmm3 \n"
324 "movdqa %%xmm1,0x10(%1) \n"
325 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000326 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000327 "movdqa %%xmm3,0x30(%1) \n"
328 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000329 "jg 1b \n"
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000330 : "+r"(src_rgb24), // %0
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000331 "+r"(dst_argb), // %1
332 "+r"(pix) // %2
fbarchard@google.comba1f5262012-01-12 19:22:41 +0000333 : "m"(kShuffleMaskRGB24ToARGB) // %3
fbarchard@google.comb6149762011-11-07 21:58:52 +0000334 : "memory", "cc"
335#if defined(__SSE2__)
336 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
337#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000338 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000339}
340
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000341void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000342 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000343 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
344 "pslld $0x18,%%xmm5 \n"
345 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000346 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000347 "1: \n"
348 "movdqu (%0),%%xmm0 \n"
349 "movdqu 0x10(%0),%%xmm1 \n"
350 "movdqu 0x20(%0),%%xmm3 \n"
351 "lea 0x30(%0),%0 \n"
352 "movdqa %%xmm3,%%xmm2 \n"
353 "palignr $0x8,%%xmm1,%%xmm2 \n"
354 "pshufb %%xmm4,%%xmm2 \n"
355 "por %%xmm5,%%xmm2 \n"
356 "palignr $0xc,%%xmm0,%%xmm1 \n"
357 "pshufb %%xmm4,%%xmm0 \n"
358 "movdqa %%xmm2,0x20(%1) \n"
359 "por %%xmm5,%%xmm0 \n"
360 "pshufb %%xmm4,%%xmm1 \n"
361 "movdqa %%xmm0,(%1) \n"
362 "por %%xmm5,%%xmm1 \n"
363 "palignr $0x4,%%xmm3,%%xmm3 \n"
364 "pshufb %%xmm4,%%xmm3 \n"
365 "movdqa %%xmm1,0x10(%1) \n"
366 "por %%xmm5,%%xmm3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000367 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000368 "movdqa %%xmm3,0x30(%1) \n"
369 "lea 0x40(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000370 "jg 1b \n"
fbarchard@google.com9394ed92011-10-31 21:36:47 +0000371 : "+r"(src_raw), // %0
372 "+r"(dst_argb), // %1
373 "+r"(pix) // %2
fbarchard@google.comb6149762011-11-07 21:58:52 +0000374 : "m"(kShuffleMaskRAWToARGB) // %3
375 : "memory", "cc"
376#if defined(__SSE2__)
377 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
378#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000379 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000380}
381
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000382void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000383 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000384 "mov $0x1080108,%%eax \n"
385 "movd %%eax,%%xmm5 \n"
386 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com6d6b7702012-06-04 15:29:15 +0000387 "mov $0x20802080,%%eax \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000388 "movd %%eax,%%xmm6 \n"
389 "pshufd $0x0,%%xmm6,%%xmm6 \n"
390 "pcmpeqb %%xmm3,%%xmm3 \n"
391 "psllw $0xb,%%xmm3 \n"
392 "pcmpeqb %%xmm4,%%xmm4 \n"
393 "psllw $0xa,%%xmm4 \n"
394 "psrlw $0x5,%%xmm4 \n"
395 "pcmpeqb %%xmm7,%%xmm7 \n"
396 "psllw $0x8,%%xmm7 \n"
397 "sub %0,%1 \n"
398 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000399 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000400 "1: \n"
401 "movdqu (%0),%%xmm0 \n"
402 "movdqa %%xmm0,%%xmm1 \n"
403 "movdqa %%xmm0,%%xmm2 \n"
404 "pand %%xmm3,%%xmm1 \n"
405 "psllw $0xb,%%xmm2 \n"
406 "pmulhuw %%xmm5,%%xmm1 \n"
407 "pmulhuw %%xmm5,%%xmm2 \n"
408 "psllw $0x8,%%xmm1 \n"
409 "por %%xmm2,%%xmm1 \n"
410 "pand %%xmm4,%%xmm0 \n"
411 "pmulhuw %%xmm6,%%xmm0 \n"
412 "por %%xmm7,%%xmm0 \n"
413 "movdqa %%xmm1,%%xmm2 \n"
414 "punpcklbw %%xmm0,%%xmm1 \n"
415 "punpckhbw %%xmm0,%%xmm2 \n"
416 "movdqa %%xmm1,(%1,%0,2) \n"
417 "movdqa %%xmm2,0x10(%1,%0,2) \n"
418 "lea 0x10(%0),%0 \n"
419 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000420 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000421 : "+r"(src), // %0
422 "+r"(dst), // %1
423 "+r"(pix) // %2
424 :
425 : "memory", "cc", "eax"
426#if defined(__SSE2__)
427 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
428#endif
429 );
430}
431
432void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000433 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000434 "mov $0x1080108,%%eax \n"
435 "movd %%eax,%%xmm5 \n"
436 "pshufd $0x0,%%xmm5,%%xmm5 \n"
437 "mov $0x42004200,%%eax \n"
438 "movd %%eax,%%xmm6 \n"
439 "pshufd $0x0,%%xmm6,%%xmm6 \n"
440 "pcmpeqb %%xmm3,%%xmm3 \n"
441 "psllw $0xb,%%xmm3 \n"
442 "movdqa %%xmm3,%%xmm4 \n"
443 "psrlw $0x6,%%xmm4 \n"
444 "pcmpeqb %%xmm7,%%xmm7 \n"
445 "psllw $0x8,%%xmm7 \n"
446 "sub %0,%1 \n"
447 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000448 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000449 "1: \n"
450 "movdqu (%0),%%xmm0 \n"
451 "movdqa %%xmm0,%%xmm1 \n"
452 "movdqa %%xmm0,%%xmm2 \n"
453 "psllw $0x1,%%xmm1 \n"
454 "psllw $0xb,%%xmm2 \n"
455 "pand %%xmm3,%%xmm1 \n"
456 "pmulhuw %%xmm5,%%xmm2 \n"
457 "pmulhuw %%xmm5,%%xmm1 \n"
458 "psllw $0x8,%%xmm1 \n"
459 "por %%xmm2,%%xmm1 \n"
460 "movdqa %%xmm0,%%xmm2 \n"
461 "pand %%xmm4,%%xmm0 \n"
462 "psraw $0x8,%%xmm2 \n"
463 "pmulhuw %%xmm6,%%xmm0 \n"
464 "pand %%xmm7,%%xmm2 \n"
465 "por %%xmm2,%%xmm0 \n"
466 "movdqa %%xmm1,%%xmm2 \n"
467 "punpcklbw %%xmm0,%%xmm1 \n"
468 "punpckhbw %%xmm0,%%xmm2 \n"
469 "movdqa %%xmm1,(%1,%0,2) \n"
470 "movdqa %%xmm2,0x10(%1,%0,2) \n"
471 "lea 0x10(%0),%0 \n"
472 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000473 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000474 : "+r"(src), // %0
475 "+r"(dst), // %1
476 "+r"(pix) // %2
477 :
478 : "memory", "cc", "eax"
479#if defined(__SSE2__)
480 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
481#endif
482 );
483}
484
485void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000486 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000487 "mov $0xf0f0f0f,%%eax \n"
488 "movd %%eax,%%xmm4 \n"
489 "pshufd $0x0,%%xmm4,%%xmm4 \n"
490 "movdqa %%xmm4,%%xmm5 \n"
491 "pslld $0x4,%%xmm5 \n"
492 "sub %0,%1 \n"
493 "sub %0,%1 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000494 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000495 "1: \n"
496 "movdqu (%0),%%xmm0 \n"
497 "movdqa %%xmm0,%%xmm2 \n"
498 "pand %%xmm4,%%xmm0 \n"
499 "pand %%xmm5,%%xmm2 \n"
500 "movdqa %%xmm0,%%xmm1 \n"
501 "movdqa %%xmm2,%%xmm3 \n"
502 "psllw $0x4,%%xmm1 \n"
503 "psrlw $0x4,%%xmm3 \n"
504 "por %%xmm1,%%xmm0 \n"
505 "por %%xmm3,%%xmm2 \n"
506 "movdqa %%xmm0,%%xmm1 \n"
507 "punpcklbw %%xmm2,%%xmm0 \n"
508 "punpckhbw %%xmm2,%%xmm1 \n"
509 "movdqa %%xmm0,(%1,%0,2) \n"
510 "movdqa %%xmm1,0x10(%1,%0,2) \n"
511 "lea 0x10(%0),%0 \n"
512 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000513 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000514 : "+r"(src), // %0
515 "+r"(dst), // %1
516 "+r"(pix) // %2
517 :
518 : "memory", "cc", "eax"
519#if defined(__SSE2__)
520 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
521#endif
522 );
523}
524
525void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000526 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000527 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000528 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000529 "1: \n"
530 "movdqa (%0),%%xmm0 \n"
531 "movdqa 0x10(%0),%%xmm1 \n"
532 "movdqa 0x20(%0),%%xmm2 \n"
533 "movdqa 0x30(%0),%%xmm3 \n"
534 "lea 0x40(%0),%0 \n"
535 "pshufb %%xmm6,%%xmm0 \n"
536 "pshufb %%xmm6,%%xmm1 \n"
537 "pshufb %%xmm6,%%xmm2 \n"
538 "pshufb %%xmm6,%%xmm3 \n"
539 "movdqa %%xmm1,%%xmm4 \n"
540 "psrldq $0x4,%%xmm1 \n"
541 "pslldq $0xc,%%xmm4 \n"
542 "movdqa %%xmm2,%%xmm5 \n"
543 "por %%xmm4,%%xmm0 \n"
544 "pslldq $0x8,%%xmm5 \n"
545 "movdqa %%xmm0,(%1) \n"
546 "por %%xmm5,%%xmm1 \n"
547 "psrldq $0x8,%%xmm2 \n"
548 "pslldq $0x4,%%xmm3 \n"
549 "por %%xmm3,%%xmm2 \n"
550 "movdqa %%xmm1,0x10(%1) \n"
551 "movdqa %%xmm2,0x20(%1) \n"
552 "lea 0x30(%1),%1 \n"
553 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000554 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000555 : "+r"(src), // %0
556 "+r"(dst), // %1
557 "+r"(pix) // %2
558 : "m"(kShuffleMaskARGBToRGB24) // %3
559 : "memory", "cc"
560#if defined(__SSE2__)
561 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
562#endif
563 );
564}
565
566void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000567 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000568 "movdqa %3,%%xmm6 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000569 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000570 "1: \n"
571 "movdqa (%0),%%xmm0 \n"
572 "movdqa 0x10(%0),%%xmm1 \n"
573 "movdqa 0x20(%0),%%xmm2 \n"
574 "movdqa 0x30(%0),%%xmm3 \n"
575 "lea 0x40(%0),%0 \n"
576 "pshufb %%xmm6,%%xmm0 \n"
577 "pshufb %%xmm6,%%xmm1 \n"
578 "pshufb %%xmm6,%%xmm2 \n"
579 "pshufb %%xmm6,%%xmm3 \n"
580 "movdqa %%xmm1,%%xmm4 \n"
581 "psrldq $0x4,%%xmm1 \n"
582 "pslldq $0xc,%%xmm4 \n"
583 "movdqa %%xmm2,%%xmm5 \n"
584 "por %%xmm4,%%xmm0 \n"
585 "pslldq $0x8,%%xmm5 \n"
586 "movdqa %%xmm0,(%1) \n"
587 "por %%xmm5,%%xmm1 \n"
588 "psrldq $0x8,%%xmm2 \n"
589 "pslldq $0x4,%%xmm3 \n"
590 "por %%xmm3,%%xmm2 \n"
591 "movdqa %%xmm1,0x10(%1) \n"
592 "movdqa %%xmm2,0x20(%1) \n"
593 "lea 0x30(%1),%1 \n"
594 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000595 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000596 : "+r"(src), // %0
597 "+r"(dst), // %1
598 "+r"(pix) // %2
599 : "m"(kShuffleMaskARGBToRAW) // %3
600 : "memory", "cc"
601#if defined(__SSE2__)
602 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
603#endif
604 );
605}
606
fbarchard@google.comdbcabea2012-10-29 21:20:25 +0000607void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000608 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000609 "pcmpeqb %%xmm3,%%xmm3 \n"
610 "psrld $0x1b,%%xmm3 \n"
611 "pcmpeqb %%xmm4,%%xmm4 \n"
612 "psrld $0x1a,%%xmm4 \n"
613 "pslld $0x5,%%xmm4 \n"
614 "pcmpeqb %%xmm5,%%xmm5 \n"
615 "pslld $0xb,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000616 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000617 "1: \n"
618 "movdqa (%0),%%xmm0 \n"
619 "movdqa %%xmm0,%%xmm1 \n"
620 "movdqa %%xmm0,%%xmm2 \n"
621 "pslld $0x8,%%xmm0 \n"
622 "psrld $0x3,%%xmm1 \n"
623 "psrld $0x5,%%xmm2 \n"
624 "psrad $0x10,%%xmm0 \n"
625 "pand %%xmm3,%%xmm1 \n"
626 "pand %%xmm4,%%xmm2 \n"
627 "pand %%xmm5,%%xmm0 \n"
628 "por %%xmm2,%%xmm1 \n"
629 "por %%xmm1,%%xmm0 \n"
630 "packssdw %%xmm0,%%xmm0 \n"
631 "lea 0x10(%0),%0 \n"
632 "movq %%xmm0,(%1) \n"
633 "lea 0x8(%1),%1 \n"
634 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000635 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000636 : "+r"(src), // %0
637 "+r"(dst), // %1
638 "+r"(pix) // %2
639 :
640 : "memory", "cc"
641#if defined(__SSE2__)
642 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
643#endif
644 );
645}
646
647void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000648 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000649 "pcmpeqb %%xmm4,%%xmm4 \n"
650 "psrld $0x1b,%%xmm4 \n"
651 "movdqa %%xmm4,%%xmm5 \n"
652 "pslld $0x5,%%xmm5 \n"
653 "movdqa %%xmm4,%%xmm6 \n"
654 "pslld $0xa,%%xmm6 \n"
655 "pcmpeqb %%xmm7,%%xmm7 \n"
656 "pslld $0xf,%%xmm7 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000657 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000658 "1: \n"
659 "movdqa (%0),%%xmm0 \n"
660 "movdqa %%xmm0,%%xmm1 \n"
661 "movdqa %%xmm0,%%xmm2 \n"
662 "movdqa %%xmm0,%%xmm3 \n"
663 "psrad $0x10,%%xmm0 \n"
664 "psrld $0x3,%%xmm1 \n"
665 "psrld $0x6,%%xmm2 \n"
666 "psrld $0x9,%%xmm3 \n"
667 "pand %%xmm7,%%xmm0 \n"
668 "pand %%xmm4,%%xmm1 \n"
669 "pand %%xmm5,%%xmm2 \n"
670 "pand %%xmm6,%%xmm3 \n"
671 "por %%xmm1,%%xmm0 \n"
672 "por %%xmm3,%%xmm2 \n"
673 "por %%xmm2,%%xmm0 \n"
674 "packssdw %%xmm0,%%xmm0 \n"
675 "lea 0x10(%0),%0 \n"
676 "movq %%xmm0,(%1) \n"
677 "lea 0x8(%1),%1 \n"
678 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000679 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000680 : "+r"(src), // %0
681 "+r"(dst), // %1
682 "+r"(pix) // %2
683 :
684 : "memory", "cc"
685#if defined(__SSE2__)
686 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
687#endif
688 );
689}
690
691void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000692 asm volatile (
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000693 "pcmpeqb %%xmm4,%%xmm4 \n"
694 "psllw $0xc,%%xmm4 \n"
695 "movdqa %%xmm4,%%xmm3 \n"
696 "psrlw $0x8,%%xmm3 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000697 ".p2align 4 \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000698 "1: \n"
699 "movdqa (%0),%%xmm0 \n"
700 "movdqa %%xmm0,%%xmm1 \n"
701 "pand %%xmm3,%%xmm0 \n"
702 "pand %%xmm4,%%xmm1 \n"
703 "psrlq $0x4,%%xmm0 \n"
704 "psrlq $0x8,%%xmm1 \n"
705 "por %%xmm1,%%xmm0 \n"
706 "packuswb %%xmm0,%%xmm0 \n"
707 "lea 0x10(%0),%0 \n"
708 "movq %%xmm0,(%1) \n"
709 "lea 0x8(%1),%1 \n"
710 "sub $0x4,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000711 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000712 : "+r"(src), // %0
713 "+r"(dst), // %1
714 "+r"(pix) // %2
715 :
716 : "memory", "cc"
717#if defined(__SSE2__)
718 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
719#endif
720 );
721}
722
fbarchard@google.comb6149762011-11-07 21:58:52 +0000723void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000724 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000725 "movdqa %4,%%xmm5 \n"
726 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000727 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000728 "1: \n"
729 "movdqa (%0),%%xmm0 \n"
730 "movdqa 0x10(%0),%%xmm1 \n"
731 "movdqa 0x20(%0),%%xmm2 \n"
732 "movdqa 0x30(%0),%%xmm3 \n"
733 "pmaddubsw %%xmm4,%%xmm0 \n"
734 "pmaddubsw %%xmm4,%%xmm1 \n"
735 "pmaddubsw %%xmm4,%%xmm2 \n"
736 "pmaddubsw %%xmm4,%%xmm3 \n"
737 "lea 0x40(%0),%0 \n"
738 "phaddw %%xmm1,%%xmm0 \n"
739 "phaddw %%xmm3,%%xmm2 \n"
740 "psrlw $0x7,%%xmm0 \n"
741 "psrlw $0x7,%%xmm2 \n"
742 "packuswb %%xmm2,%%xmm0 \n"
743 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000744 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000745 "movdqa %%xmm0,(%1) \n"
746 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000747 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000748 : "+r"(src_argb), // %0
749 "+r"(dst_y), // %1
750 "+r"(pix) // %2
751 : "m"(kARGBToY), // %3
752 "m"(kAddY16) // %4
753 : "memory", "cc"
754#if defined(__SSE2__)
755 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
756#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000757 );
fbarchard@google.com585a1262011-10-28 23:51:08 +0000758}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000759
760void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000761 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000762 "movdqa %4,%%xmm5 \n"
763 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000764 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000765 "1: \n"
766 "movdqu (%0),%%xmm0 \n"
767 "movdqu 0x10(%0),%%xmm1 \n"
768 "movdqu 0x20(%0),%%xmm2 \n"
769 "movdqu 0x30(%0),%%xmm3 \n"
770 "pmaddubsw %%xmm4,%%xmm0 \n"
771 "pmaddubsw %%xmm4,%%xmm1 \n"
772 "pmaddubsw %%xmm4,%%xmm2 \n"
773 "pmaddubsw %%xmm4,%%xmm3 \n"
774 "lea 0x40(%0),%0 \n"
775 "phaddw %%xmm1,%%xmm0 \n"
776 "phaddw %%xmm3,%%xmm2 \n"
777 "psrlw $0x7,%%xmm0 \n"
778 "psrlw $0x7,%%xmm2 \n"
779 "packuswb %%xmm2,%%xmm0 \n"
780 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000781 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000782 "movdqu %%xmm0,(%1) \n"
783 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000784 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000785 : "+r"(src_argb), // %0
786 "+r"(dst_y), // %1
787 "+r"(pix) // %2
788 : "m"(kARGBToY), // %3
789 "m"(kAddY16) // %4
790 : "memory", "cc"
791#if defined(__SSE2__)
792 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
793#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000794 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000795}
fbarchard@google.com585a1262011-10-28 23:51:08 +0000796
fbarchard@google.com714050a2012-02-17 22:59:56 +0000797// TODO(fbarchard): pass xmm constants to single block of assembly.
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000798// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
799// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
800// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
fbarchard@google.com714050a2012-02-17 22:59:56 +0000801// and considered unsafe.
fbarchard@google.comb6149762011-11-07 21:58:52 +0000802void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
803 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000804 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000805 "movdqa %0,%%xmm4 \n"
806 "movdqa %1,%%xmm3 \n"
807 "movdqa %2,%%xmm5 \n"
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000808 :
fbarchard@google.comf2d84dd2012-05-14 20:23:35 +0000809 : "m"(kARGBToU), // %0
810 "m"(kARGBToV), // %1
811 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000812 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000813 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000814 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000815 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000816 "1: \n"
817 "movdqa (%0),%%xmm0 \n"
818 "movdqa 0x10(%0),%%xmm1 \n"
819 "movdqa 0x20(%0),%%xmm2 \n"
820 "movdqa 0x30(%0),%%xmm6 \n"
821 "pavgb (%0,%4,1),%%xmm0 \n"
822 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
823 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
824 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
825 "lea 0x40(%0),%0 \n"
826 "movdqa %%xmm0,%%xmm7 \n"
827 "shufps $0x88,%%xmm1,%%xmm0 \n"
828 "shufps $0xdd,%%xmm1,%%xmm7 \n"
829 "pavgb %%xmm7,%%xmm0 \n"
830 "movdqa %%xmm2,%%xmm7 \n"
831 "shufps $0x88,%%xmm6,%%xmm2 \n"
832 "shufps $0xdd,%%xmm6,%%xmm7 \n"
833 "pavgb %%xmm7,%%xmm2 \n"
834 "movdqa %%xmm0,%%xmm1 \n"
835 "movdqa %%xmm2,%%xmm6 \n"
836 "pmaddubsw %%xmm4,%%xmm0 \n"
837 "pmaddubsw %%xmm4,%%xmm2 \n"
838 "pmaddubsw %%xmm3,%%xmm1 \n"
839 "pmaddubsw %%xmm3,%%xmm6 \n"
840 "phaddw %%xmm2,%%xmm0 \n"
841 "phaddw %%xmm6,%%xmm1 \n"
842 "psraw $0x8,%%xmm0 \n"
843 "psraw $0x8,%%xmm1 \n"
844 "packsswb %%xmm1,%%xmm0 \n"
845 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000846 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000847 "movlps %%xmm0,(%1) \n"
848 "movhps %%xmm0,(%1,%2,1) \n"
849 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000850 "jg 1b \n"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000851 : "+r"(src_argb0), // %0
852 "+r"(dst_u), // %1
853 "+r"(dst_v), // %2
854 "+rm"(width) // %3
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000855 : "r"(static_cast<intptr_t>(src_stride_argb))
fbarchard@google.comb6149762011-11-07 21:58:52 +0000856 : "memory", "cc"
857#if defined(__SSE2__)
fbarchard@google.comd93d4482011-11-10 18:26:20 +0000858 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000859#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000860 );
fbarchard@google.comb6149762011-11-07 21:58:52 +0000861}
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000862
863void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
864 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000865 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000866 "movdqa %0,%%xmm4 \n"
867 "movdqa %1,%%xmm3 \n"
868 "movdqa %2,%%xmm5 \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000869 :
870 : "m"(kARGBToU), // %0
871 "m"(kARGBToV), // %1
872 "m"(kAddUV128) // %2
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000873 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000874 asm volatile (
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000875 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000876 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000877 "1: \n"
878 "movdqu (%0),%%xmm0 \n"
879 "movdqu 0x10(%0),%%xmm1 \n"
880 "movdqu 0x20(%0),%%xmm2 \n"
881 "movdqu 0x30(%0),%%xmm6 \n"
882 "movdqu (%0,%4,1),%%xmm7 \n"
883 "pavgb %%xmm7,%%xmm0 \n"
884 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
885 "pavgb %%xmm7,%%xmm1 \n"
886 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
887 "pavgb %%xmm7,%%xmm2 \n"
888 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
889 "pavgb %%xmm7,%%xmm6 \n"
890 "lea 0x40(%0),%0 \n"
891 "movdqa %%xmm0,%%xmm7 \n"
892 "shufps $0x88,%%xmm1,%%xmm0 \n"
893 "shufps $0xdd,%%xmm1,%%xmm7 \n"
894 "pavgb %%xmm7,%%xmm0 \n"
895 "movdqa %%xmm2,%%xmm7 \n"
896 "shufps $0x88,%%xmm6,%%xmm2 \n"
897 "shufps $0xdd,%%xmm6,%%xmm7 \n"
898 "pavgb %%xmm7,%%xmm2 \n"
899 "movdqa %%xmm0,%%xmm1 \n"
900 "movdqa %%xmm2,%%xmm6 \n"
901 "pmaddubsw %%xmm4,%%xmm0 \n"
902 "pmaddubsw %%xmm4,%%xmm2 \n"
903 "pmaddubsw %%xmm3,%%xmm1 \n"
904 "pmaddubsw %%xmm3,%%xmm6 \n"
905 "phaddw %%xmm2,%%xmm0 \n"
906 "phaddw %%xmm6,%%xmm1 \n"
907 "psraw $0x8,%%xmm0 \n"
908 "psraw $0x8,%%xmm1 \n"
909 "packsswb %%xmm1,%%xmm0 \n"
910 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000911 "sub $0x10,%3 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +0000912 "movlps %%xmm0,(%1) \n"
913 "movhps %%xmm0,(%1,%2,1) \n"
914 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000915 "jg 1b \n"
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000916 : "+r"(src_argb0), // %0
917 "+r"(dst_u), // %1
918 "+r"(dst_v), // %2
919 "+rm"(width) // %3
920 : "r"(static_cast<intptr_t>(src_stride_argb))
921 : "memory", "cc"
922#if defined(__SSE2__)
923 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
924#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +0000925 );
fbarchard@google.comb5b27d12012-01-28 08:44:35 +0000926}
fbarchard@google.com714050a2012-02-17 22:59:56 +0000927
fbarchard@google.com714050a2012-02-17 22:59:56 +0000928void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000929 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000930 "movdqa %4,%%xmm5 \n"
931 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000932 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000933 "1: \n"
934 "movdqa (%0),%%xmm0 \n"
935 "movdqa 0x10(%0),%%xmm1 \n"
936 "movdqa 0x20(%0),%%xmm2 \n"
937 "movdqa 0x30(%0),%%xmm3 \n"
938 "pmaddubsw %%xmm4,%%xmm0 \n"
939 "pmaddubsw %%xmm4,%%xmm1 \n"
940 "pmaddubsw %%xmm4,%%xmm2 \n"
941 "pmaddubsw %%xmm4,%%xmm3 \n"
942 "lea 0x40(%0),%0 \n"
943 "phaddw %%xmm1,%%xmm0 \n"
944 "phaddw %%xmm3,%%xmm2 \n"
945 "psrlw $0x7,%%xmm0 \n"
946 "psrlw $0x7,%%xmm2 \n"
947 "packuswb %%xmm2,%%xmm0 \n"
948 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000949 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000950 "movdqa %%xmm0,(%1) \n"
951 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000952 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000953 : "+r"(src_bgra), // %0
954 "+r"(dst_y), // %1
955 "+r"(pix) // %2
956 : "m"(kBGRAToY), // %3
957 "m"(kAddY16) // %4
958 : "memory", "cc"
959#if defined(__SSE2__)
960 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +0000961#endif
fbarchard@google.com714050a2012-02-17 22:59:56 +0000962 );
963}
964
965void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +0000966 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +0000967 "movdqa %4,%%xmm5 \n"
968 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000969 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000970 "1: \n"
971 "movdqu (%0),%%xmm0 \n"
972 "movdqu 0x10(%0),%%xmm1 \n"
973 "movdqu 0x20(%0),%%xmm2 \n"
974 "movdqu 0x30(%0),%%xmm3 \n"
975 "pmaddubsw %%xmm4,%%xmm0 \n"
976 "pmaddubsw %%xmm4,%%xmm1 \n"
977 "pmaddubsw %%xmm4,%%xmm2 \n"
978 "pmaddubsw %%xmm4,%%xmm3 \n"
979 "lea 0x40(%0),%0 \n"
980 "phaddw %%xmm1,%%xmm0 \n"
981 "phaddw %%xmm3,%%xmm2 \n"
982 "psrlw $0x7,%%xmm0 \n"
983 "psrlw $0x7,%%xmm2 \n"
984 "packuswb %%xmm2,%%xmm0 \n"
985 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000986 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000987 "movdqu %%xmm0,(%1) \n"
988 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +0000989 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +0000990 : "+r"(src_bgra), // %0
991 "+r"(dst_y), // %1
992 "+r"(pix) // %2
993 : "m"(kBGRAToY), // %3
994 "m"(kAddY16) // %4
995 : "memory", "cc"
996#if defined(__SSE2__)
997 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
998#endif
999 );
1000}
1001
1002void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1003 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001004 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001005 "movdqa %0,%%xmm4 \n"
1006 "movdqa %1,%%xmm3 \n"
1007 "movdqa %2,%%xmm5 \n"
1008 :
1009 : "m"(kBGRAToU), // %0
1010 "m"(kBGRAToV), // %1
1011 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001012 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001013 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001014 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001015 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001016 "1: \n"
1017 "movdqa (%0),%%xmm0 \n"
1018 "movdqa 0x10(%0),%%xmm1 \n"
1019 "movdqa 0x20(%0),%%xmm2 \n"
1020 "movdqa 0x30(%0),%%xmm6 \n"
1021 "pavgb (%0,%4,1),%%xmm0 \n"
1022 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1023 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1024 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1025 "lea 0x40(%0),%0 \n"
1026 "movdqa %%xmm0,%%xmm7 \n"
1027 "shufps $0x88,%%xmm1,%%xmm0 \n"
1028 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1029 "pavgb %%xmm7,%%xmm0 \n"
1030 "movdqa %%xmm2,%%xmm7 \n"
1031 "shufps $0x88,%%xmm6,%%xmm2 \n"
1032 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1033 "pavgb %%xmm7,%%xmm2 \n"
1034 "movdqa %%xmm0,%%xmm1 \n"
1035 "movdqa %%xmm2,%%xmm6 \n"
1036 "pmaddubsw %%xmm4,%%xmm0 \n"
1037 "pmaddubsw %%xmm4,%%xmm2 \n"
1038 "pmaddubsw %%xmm3,%%xmm1 \n"
1039 "pmaddubsw %%xmm3,%%xmm6 \n"
1040 "phaddw %%xmm2,%%xmm0 \n"
1041 "phaddw %%xmm6,%%xmm1 \n"
1042 "psraw $0x8,%%xmm0 \n"
1043 "psraw $0x8,%%xmm1 \n"
1044 "packsswb %%xmm1,%%xmm0 \n"
1045 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001046 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001047 "movlps %%xmm0,(%1) \n"
1048 "movhps %%xmm0,(%1,%2,1) \n"
1049 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001050 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001051 : "+r"(src_bgra0), // %0
1052 "+r"(dst_u), // %1
1053 "+r"(dst_v), // %2
1054 "+rm"(width) // %3
1055 : "r"(static_cast<intptr_t>(src_stride_bgra))
1056 : "memory", "cc"
1057#if defined(__SSE2__)
1058 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1059#endif
1060 );
1061}
1062
1063void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1064 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001065 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001066 "movdqa %0,%%xmm4 \n"
1067 "movdqa %1,%%xmm3 \n"
1068 "movdqa %2,%%xmm5 \n"
1069 :
1070 : "m"(kBGRAToU), // %0
1071 "m"(kBGRAToV), // %1
1072 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001073 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001074 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001075 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001076 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001077 "1: \n"
1078 "movdqu (%0),%%xmm0 \n"
1079 "movdqu 0x10(%0),%%xmm1 \n"
1080 "movdqu 0x20(%0),%%xmm2 \n"
1081 "movdqu 0x30(%0),%%xmm6 \n"
1082 "movdqu (%0,%4,1),%%xmm7 \n"
1083 "pavgb %%xmm7,%%xmm0 \n"
1084 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1085 "pavgb %%xmm7,%%xmm1 \n"
1086 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1087 "pavgb %%xmm7,%%xmm2 \n"
1088 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1089 "pavgb %%xmm7,%%xmm6 \n"
1090 "lea 0x40(%0),%0 \n"
1091 "movdqa %%xmm0,%%xmm7 \n"
1092 "shufps $0x88,%%xmm1,%%xmm0 \n"
1093 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1094 "pavgb %%xmm7,%%xmm0 \n"
1095 "movdqa %%xmm2,%%xmm7 \n"
1096 "shufps $0x88,%%xmm6,%%xmm2 \n"
1097 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1098 "pavgb %%xmm7,%%xmm2 \n"
1099 "movdqa %%xmm0,%%xmm1 \n"
1100 "movdqa %%xmm2,%%xmm6 \n"
1101 "pmaddubsw %%xmm4,%%xmm0 \n"
1102 "pmaddubsw %%xmm4,%%xmm2 \n"
1103 "pmaddubsw %%xmm3,%%xmm1 \n"
1104 "pmaddubsw %%xmm3,%%xmm6 \n"
1105 "phaddw %%xmm2,%%xmm0 \n"
1106 "phaddw %%xmm6,%%xmm1 \n"
1107 "psraw $0x8,%%xmm0 \n"
1108 "psraw $0x8,%%xmm1 \n"
1109 "packsswb %%xmm1,%%xmm0 \n"
1110 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001111 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001112 "movlps %%xmm0,(%1) \n"
1113 "movhps %%xmm0,(%1,%2,1) \n"
1114 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001115 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001116 : "+r"(src_bgra0), // %0
1117 "+r"(dst_u), // %1
1118 "+r"(dst_v), // %2
1119 "+rm"(width) // %3
1120 : "r"(static_cast<intptr_t>(src_stride_bgra))
1121 : "memory", "cc"
1122#if defined(__SSE2__)
1123 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1124#endif
1125 );
1126}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001127
1128void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001129 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001130 "movdqa %4,%%xmm5 \n"
1131 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001132 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001133 "1: \n"
1134 "movdqa (%0),%%xmm0 \n"
1135 "movdqa 0x10(%0),%%xmm1 \n"
1136 "movdqa 0x20(%0),%%xmm2 \n"
1137 "movdqa 0x30(%0),%%xmm3 \n"
1138 "pmaddubsw %%xmm4,%%xmm0 \n"
1139 "pmaddubsw %%xmm4,%%xmm1 \n"
1140 "pmaddubsw %%xmm4,%%xmm2 \n"
1141 "pmaddubsw %%xmm4,%%xmm3 \n"
1142 "lea 0x40(%0),%0 \n"
1143 "phaddw %%xmm1,%%xmm0 \n"
1144 "phaddw %%xmm3,%%xmm2 \n"
1145 "psrlw $0x7,%%xmm0 \n"
1146 "psrlw $0x7,%%xmm2 \n"
1147 "packuswb %%xmm2,%%xmm0 \n"
1148 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001149 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001150 "movdqa %%xmm0,(%1) \n"
1151 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001152 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001153 : "+r"(src_abgr), // %0
1154 "+r"(dst_y), // %1
1155 "+r"(pix) // %2
1156 : "m"(kABGRToY), // %3
1157 "m"(kAddY16) // %4
1158 : "memory", "cc"
1159#if defined(__SSE2__)
1160 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1161#endif
1162 );
1163}
1164
1165void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001166 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001167 "movdqa %4,%%xmm5 \n"
1168 "movdqa %3,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001169 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001170 "1: \n"
1171 "movdqu (%0),%%xmm0 \n"
1172 "movdqu 0x10(%0),%%xmm1 \n"
1173 "movdqu 0x20(%0),%%xmm2 \n"
1174 "movdqu 0x30(%0),%%xmm3 \n"
1175 "pmaddubsw %%xmm4,%%xmm0 \n"
1176 "pmaddubsw %%xmm4,%%xmm1 \n"
1177 "pmaddubsw %%xmm4,%%xmm2 \n"
1178 "pmaddubsw %%xmm4,%%xmm3 \n"
1179 "lea 0x40(%0),%0 \n"
1180 "phaddw %%xmm1,%%xmm0 \n"
1181 "phaddw %%xmm3,%%xmm2 \n"
1182 "psrlw $0x7,%%xmm0 \n"
1183 "psrlw $0x7,%%xmm2 \n"
1184 "packuswb %%xmm2,%%xmm0 \n"
1185 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001186 "sub $0x10,%2 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001187 "movdqu %%xmm0,(%1) \n"
1188 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001189 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001190 : "+r"(src_abgr), // %0
1191 "+r"(dst_y), // %1
1192 "+r"(pix) // %2
1193 : "m"(kABGRToY), // %3
1194 "m"(kAddY16) // %4
1195 : "memory", "cc"
1196#if defined(__SSE2__)
1197 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1198#endif
1199 );
1200}
1201
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001202void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1203 asm volatile (
1204 "movdqa %4,%%xmm5 \n"
1205 "movdqa %3,%%xmm4 \n"
1206 ".p2align 4 \n"
1207 "1: \n"
1208 "movdqa (%0),%%xmm0 \n"
1209 "movdqa 0x10(%0),%%xmm1 \n"
1210 "movdqa 0x20(%0),%%xmm2 \n"
1211 "movdqa 0x30(%0),%%xmm3 \n"
1212 "pmaddubsw %%xmm4,%%xmm0 \n"
1213 "pmaddubsw %%xmm4,%%xmm1 \n"
1214 "pmaddubsw %%xmm4,%%xmm2 \n"
1215 "pmaddubsw %%xmm4,%%xmm3 \n"
1216 "lea 0x40(%0),%0 \n"
1217 "phaddw %%xmm1,%%xmm0 \n"
1218 "phaddw %%xmm3,%%xmm2 \n"
1219 "psrlw $0x7,%%xmm0 \n"
1220 "psrlw $0x7,%%xmm2 \n"
1221 "packuswb %%xmm2,%%xmm0 \n"
1222 "paddb %%xmm5,%%xmm0 \n"
1223 "sub $0x10,%2 \n"
1224 "movdqa %%xmm0,(%1) \n"
1225 "lea 0x10(%1),%1 \n"
1226 "jg 1b \n"
1227 : "+r"(src_rgba), // %0
1228 "+r"(dst_y), // %1
1229 "+r"(pix) // %2
1230 : "m"(kRGBAToY), // %3
1231 "m"(kAddY16) // %4
1232 : "memory", "cc"
1233#if defined(__SSE2__)
1234 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1235#endif
1236 );
1237}
1238
1239void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1240 asm volatile (
1241 "movdqa %4,%%xmm5 \n"
1242 "movdqa %3,%%xmm4 \n"
1243 ".p2align 4 \n"
1244 "1: \n"
1245 "movdqu (%0),%%xmm0 \n"
1246 "movdqu 0x10(%0),%%xmm1 \n"
1247 "movdqu 0x20(%0),%%xmm2 \n"
1248 "movdqu 0x30(%0),%%xmm3 \n"
1249 "pmaddubsw %%xmm4,%%xmm0 \n"
1250 "pmaddubsw %%xmm4,%%xmm1 \n"
1251 "pmaddubsw %%xmm4,%%xmm2 \n"
1252 "pmaddubsw %%xmm4,%%xmm3 \n"
1253 "lea 0x40(%0),%0 \n"
1254 "phaddw %%xmm1,%%xmm0 \n"
1255 "phaddw %%xmm3,%%xmm2 \n"
1256 "psrlw $0x7,%%xmm0 \n"
1257 "psrlw $0x7,%%xmm2 \n"
1258 "packuswb %%xmm2,%%xmm0 \n"
1259 "paddb %%xmm5,%%xmm0 \n"
1260 "sub $0x10,%2 \n"
1261 "movdqu %%xmm0,(%1) \n"
1262 "lea 0x10(%1),%1 \n"
1263 "jg 1b \n"
1264 : "+r"(src_rgba), // %0
1265 "+r"(dst_y), // %1
1266 "+r"(pix) // %2
1267 : "m"(kRGBAToY), // %3
1268 "m"(kAddY16) // %4
1269 : "memory", "cc"
1270#if defined(__SSE2__)
1271 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1272#endif
1273 );
1274}
1275
fbarchard@google.com714050a2012-02-17 22:59:56 +00001276void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1277 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001278 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001279 "movdqa %0,%%xmm4 \n"
1280 "movdqa %1,%%xmm3 \n"
1281 "movdqa %2,%%xmm5 \n"
1282 :
1283 : "m"(kABGRToU), // %0
1284 "m"(kABGRToV), // %1
1285 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001286 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001287 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001288 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001289 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001290 "1: \n"
1291 "movdqa (%0),%%xmm0 \n"
1292 "movdqa 0x10(%0),%%xmm1 \n"
1293 "movdqa 0x20(%0),%%xmm2 \n"
1294 "movdqa 0x30(%0),%%xmm6 \n"
1295 "pavgb (%0,%4,1),%%xmm0 \n"
1296 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1297 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1298 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1299 "lea 0x40(%0),%0 \n"
1300 "movdqa %%xmm0,%%xmm7 \n"
1301 "shufps $0x88,%%xmm1,%%xmm0 \n"
1302 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1303 "pavgb %%xmm7,%%xmm0 \n"
1304 "movdqa %%xmm2,%%xmm7 \n"
1305 "shufps $0x88,%%xmm6,%%xmm2 \n"
1306 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1307 "pavgb %%xmm7,%%xmm2 \n"
1308 "movdqa %%xmm0,%%xmm1 \n"
1309 "movdqa %%xmm2,%%xmm6 \n"
1310 "pmaddubsw %%xmm4,%%xmm0 \n"
1311 "pmaddubsw %%xmm4,%%xmm2 \n"
1312 "pmaddubsw %%xmm3,%%xmm1 \n"
1313 "pmaddubsw %%xmm3,%%xmm6 \n"
1314 "phaddw %%xmm2,%%xmm0 \n"
1315 "phaddw %%xmm6,%%xmm1 \n"
1316 "psraw $0x8,%%xmm0 \n"
1317 "psraw $0x8,%%xmm1 \n"
1318 "packsswb %%xmm1,%%xmm0 \n"
1319 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001320 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001321 "movlps %%xmm0,(%1) \n"
1322 "movhps %%xmm0,(%1,%2,1) \n"
1323 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001324 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001325 : "+r"(src_abgr0), // %0
1326 "+r"(dst_u), // %1
1327 "+r"(dst_v), // %2
1328 "+rm"(width) // %3
1329 : "r"(static_cast<intptr_t>(src_stride_abgr))
1330 : "memory", "cc"
1331#if defined(__SSE2__)
1332 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1333#endif
1334 );
1335}
1336
1337void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1338 uint8* dst_u, uint8* dst_v, int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001339 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001340 "movdqa %0,%%xmm4 \n"
1341 "movdqa %1,%%xmm3 \n"
1342 "movdqa %2,%%xmm5 \n"
1343 :
1344 : "m"(kABGRToU), // %0
1345 "m"(kABGRToV), // %1
1346 "m"(kAddUV128) // %2
fbarchard@google.com714050a2012-02-17 22:59:56 +00001347 );
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001348 asm volatile (
fbarchard@google.com714050a2012-02-17 22:59:56 +00001349 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001350 ".p2align 4 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001351 "1: \n"
1352 "movdqu (%0),%%xmm0 \n"
1353 "movdqu 0x10(%0),%%xmm1 \n"
1354 "movdqu 0x20(%0),%%xmm2 \n"
1355 "movdqu 0x30(%0),%%xmm6 \n"
1356 "movdqu (%0,%4,1),%%xmm7 \n"
1357 "pavgb %%xmm7,%%xmm0 \n"
1358 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1359 "pavgb %%xmm7,%%xmm1 \n"
1360 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1361 "pavgb %%xmm7,%%xmm2 \n"
1362 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1363 "pavgb %%xmm7,%%xmm6 \n"
1364 "lea 0x40(%0),%0 \n"
1365 "movdqa %%xmm0,%%xmm7 \n"
1366 "shufps $0x88,%%xmm1,%%xmm0 \n"
1367 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1368 "pavgb %%xmm7,%%xmm0 \n"
1369 "movdqa %%xmm2,%%xmm7 \n"
1370 "shufps $0x88,%%xmm6,%%xmm2 \n"
1371 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1372 "pavgb %%xmm7,%%xmm2 \n"
1373 "movdqa %%xmm0,%%xmm1 \n"
1374 "movdqa %%xmm2,%%xmm6 \n"
1375 "pmaddubsw %%xmm4,%%xmm0 \n"
1376 "pmaddubsw %%xmm4,%%xmm2 \n"
1377 "pmaddubsw %%xmm3,%%xmm1 \n"
1378 "pmaddubsw %%xmm3,%%xmm6 \n"
1379 "phaddw %%xmm2,%%xmm0 \n"
1380 "phaddw %%xmm6,%%xmm1 \n"
1381 "psraw $0x8,%%xmm0 \n"
1382 "psraw $0x8,%%xmm1 \n"
1383 "packsswb %%xmm1,%%xmm0 \n"
1384 "paddb %%xmm5,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001385 "sub $0x10,%3 \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001386 "movlps %%xmm0,(%1) \n"
1387 "movhps %%xmm0,(%1,%2,1) \n"
1388 "lea 0x8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00001389 "jg 1b \n"
fbarchard@google.com714050a2012-02-17 22:59:56 +00001390 : "+r"(src_abgr0), // %0
1391 "+r"(dst_u), // %1
1392 "+r"(dst_v), // %2
1393 "+rm"(width) // %3
1394 : "r"(static_cast<intptr_t>(src_stride_abgr))
1395 : "memory", "cc"
1396#if defined(__SSE2__)
1397 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1398#endif
1399 );
1400}
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001401
1402void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1403 uint8* dst_u, uint8* dst_v, int width) {
1404 asm volatile (
1405 "movdqa %0,%%xmm4 \n"
1406 "movdqa %1,%%xmm3 \n"
1407 "movdqa %2,%%xmm5 \n"
1408 :
1409 : "m"(kRGBAToU), // %0
1410 "m"(kRGBAToV), // %1
1411 "m"(kAddUV128) // %2
1412 );
1413 asm volatile (
1414 "sub %1,%2 \n"
1415 ".p2align 4 \n"
1416 "1: \n"
1417 "movdqa (%0),%%xmm0 \n"
1418 "movdqa 0x10(%0),%%xmm1 \n"
1419 "movdqa 0x20(%0),%%xmm2 \n"
1420 "movdqa 0x30(%0),%%xmm6 \n"
1421 "pavgb (%0,%4,1),%%xmm0 \n"
1422 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1423 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1424 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1425 "lea 0x40(%0),%0 \n"
1426 "movdqa %%xmm0,%%xmm7 \n"
1427 "shufps $0x88,%%xmm1,%%xmm0 \n"
1428 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1429 "pavgb %%xmm7,%%xmm0 \n"
1430 "movdqa %%xmm2,%%xmm7 \n"
1431 "shufps $0x88,%%xmm6,%%xmm2 \n"
1432 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1433 "pavgb %%xmm7,%%xmm2 \n"
1434 "movdqa %%xmm0,%%xmm1 \n"
1435 "movdqa %%xmm2,%%xmm6 \n"
1436 "pmaddubsw %%xmm4,%%xmm0 \n"
1437 "pmaddubsw %%xmm4,%%xmm2 \n"
1438 "pmaddubsw %%xmm3,%%xmm1 \n"
1439 "pmaddubsw %%xmm3,%%xmm6 \n"
1440 "phaddw %%xmm2,%%xmm0 \n"
1441 "phaddw %%xmm6,%%xmm1 \n"
1442 "psraw $0x8,%%xmm0 \n"
1443 "psraw $0x8,%%xmm1 \n"
1444 "packsswb %%xmm1,%%xmm0 \n"
1445 "paddb %%xmm5,%%xmm0 \n"
1446 "sub $0x10,%3 \n"
1447 "movlps %%xmm0,(%1) \n"
1448 "movhps %%xmm0,(%1,%2,1) \n"
1449 "lea 0x8(%1),%1 \n"
1450 "jg 1b \n"
1451 : "+r"(src_rgba0), // %0
1452 "+r"(dst_u), // %1
1453 "+r"(dst_v), // %2
1454 "+rm"(width) // %3
1455 : "r"(static_cast<intptr_t>(src_stride_rgba))
1456 : "memory", "cc"
1457#if defined(__SSE2__)
1458 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1459#endif
1460 );
1461}
1462
1463void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1464 uint8* dst_u, uint8* dst_v, int width) {
1465 asm volatile (
1466 "movdqa %0,%%xmm4 \n"
1467 "movdqa %1,%%xmm3 \n"
1468 "movdqa %2,%%xmm5 \n"
1469 :
1470 : "m"(kRGBAToU), // %0
1471 "m"(kRGBAToV), // %1
1472 "m"(kAddUV128) // %2
1473 );
1474 asm volatile (
1475 "sub %1,%2 \n"
1476 ".p2align 4 \n"
1477 "1: \n"
1478 "movdqu (%0),%%xmm0 \n"
1479 "movdqu 0x10(%0),%%xmm1 \n"
1480 "movdqu 0x20(%0),%%xmm2 \n"
1481 "movdqu 0x30(%0),%%xmm6 \n"
1482 "movdqu (%0,%4,1),%%xmm7 \n"
1483 "pavgb %%xmm7,%%xmm0 \n"
1484 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1485 "pavgb %%xmm7,%%xmm1 \n"
1486 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1487 "pavgb %%xmm7,%%xmm2 \n"
1488 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1489 "pavgb %%xmm7,%%xmm6 \n"
1490 "lea 0x40(%0),%0 \n"
1491 "movdqa %%xmm0,%%xmm7 \n"
1492 "shufps $0x88,%%xmm1,%%xmm0 \n"
1493 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1494 "pavgb %%xmm7,%%xmm0 \n"
1495 "movdqa %%xmm2,%%xmm7 \n"
1496 "shufps $0x88,%%xmm6,%%xmm2 \n"
1497 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1498 "pavgb %%xmm7,%%xmm2 \n"
1499 "movdqa %%xmm0,%%xmm1 \n"
1500 "movdqa %%xmm2,%%xmm6 \n"
1501 "pmaddubsw %%xmm4,%%xmm0 \n"
1502 "pmaddubsw %%xmm4,%%xmm2 \n"
1503 "pmaddubsw %%xmm3,%%xmm1 \n"
1504 "pmaddubsw %%xmm3,%%xmm6 \n"
1505 "phaddw %%xmm2,%%xmm0 \n"
1506 "phaddw %%xmm6,%%xmm1 \n"
1507 "psraw $0x8,%%xmm0 \n"
1508 "psraw $0x8,%%xmm1 \n"
1509 "packsswb %%xmm1,%%xmm0 \n"
1510 "paddb %%xmm5,%%xmm0 \n"
1511 "sub $0x10,%3 \n"
1512 "movlps %%xmm0,(%1) \n"
1513 "movhps %%xmm0,(%1,%2,1) \n"
1514 "lea 0x8(%1),%1 \n"
1515 "jg 1b \n"
1516 : "+r"(src_rgba0), // %0
1517 "+r"(dst_u), // %1
1518 "+r"(dst_v), // %2
1519 "+rm"(width) // %3
1520 : "r"(static_cast<intptr_t>(src_stride_rgba))
1521 : "memory", "cc"
1522#if defined(__SSE2__)
1523 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1524#endif
1525 );
1526}
fbarchard@google.com714050a2012-02-17 22:59:56 +00001527#endif // HAS_ARGBTOYROW_SSSE3
fbarchard@google.comb6149762011-11-07 21:58:52 +00001528
fbarchard@google.come214fe32012-06-04 23:47:11 +00001529#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients in fixed point with 6 fractional bits (scaled by
// 64). Every expansion is parenthesized so the macros stay correct when used
// inside a larger expression (e.g. -BG or 2 * BG).
#define UB 127 /* min(127, static_cast<int>(2.018 * 64)); clamped to int8 max */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V coefficients for a channel, each multiplied by 128.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
fbarchard@google.com228bdc22011-11-15 21:58:26 +00001544
// Constant table for the YUV to RGB row functions. The assembly addresses it
// through one base pointer plus the hard-coded byte offsets noted on each
// member (see YUVTORGB and YVUTORGB), so the member order and sizes must not
// change without updating those offsets.
struct {
  vec8 kUVToB;  // 0    blue coefficients for U,V interleaved input
  vec8 kUVToG;  // 16   green coefficients for U,V interleaved input
  vec8 kUVToR;  // 32   red coefficients for U,V interleaved input
  vec16 kUVBiasB;  // 48   bias subtracted from the blue sum
  vec16 kUVBiasG;  // 64   bias subtracted from the green sum
  vec16 kUVBiasR;  // 80   bias subtracted from the red sum
  vec16 kYSub16;  // 96   subtracted from Y before scaling
  vec16 kYToRgb;  // 112  Y scale factor
  vec8 kVUToB;  // 128  blue coefficients for V,U interleaved input
  vec8 kVUToG;  // 144  green coefficients for V,U interleaved input
  vec8 kVUToR;  // 160  red coefficients for V,U interleaved input
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG },
  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
};

1570
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001571
// Read 8 UV from 444.
// Loads 8 U bytes and 8 V bytes, advances u_buf by 8 and interleaves them
// into xmm0 as U,V pairs. V is addressed as u_buf + v_buf, so the enclosing
// asm must first turn v_buf into an offset from u_buf. Uses xmm1 as scratch.
#define READYUV444 \
    "movq (%[u_buf]),%%xmm0 \n" \
    "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x8(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001578
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 U bytes and 4 V bytes, advances u_buf by 4, interleaves them into
// U,V pairs and duplicates each pair (punpcklwd) so xmm0 holds 8 pairs.
// V is addressed as u_buf + v_buf, so v_buf must hold an offset from u_buf.
// Uses xmm1 as scratch.
#define READYUV422 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x4(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
fbarchard@google.comb6149762011-11-07 21:58:52 +00001586
// Read 2 UV from 411, upsample to 8 UV.
// Interleaves U and V into pairs, then duplicates each of the first two pairs
// four times (punpcklwd followed by punpckldq) so xmm0 holds 8 pairs.
// V is addressed as u_buf + v_buf, so v_buf must hold an offset from u_buf.
// Uses xmm1 as scratch.
// NOTE(review): movd loads 4 bytes from each plane while only 2 are consumed
// and u_buf advances by 2, so each pass reads 2 bytes beyond the data it
// uses - confirm callers tolerate this at the end of a row.
#define READYUV411 \
    "movd (%[u_buf]),%%xmm0 \n" \
    "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
    "lea 0x2(%[u_buf]),%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n" \
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001595
// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 bytes (4 already interleaved pairs) from uv_buf, advances uv_buf by
// 8 and duplicates each pair (punpcklwd) so xmm0 holds 8 pairs.
#define READNV12 \
    "movq (%[uv_buf]),%%xmm0 \n" \
    "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001601
// Convert 8 pixels: 8 UV and 8 Y.
// Input: xmm0 = 8 interleaved U,V pairs, xmm4 = zero (used to widen Y to 16
// bits), %[kYuvConstants] = base address of the kYuvConstants table.
// Reads 8 Y bytes from y_buf and advances y_buf by 8.
// Output: the low 8 bytes of xmm0, xmm1 and xmm2 hold the results computed
// with the kUVToB, kUVToG and kUVToR coefficients respectively. Uses xmm3 as
// scratch. The numeric offsets index the members of kYuvConstants.
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n" \
1626
// Convert 8 pixels: 8 VU and 8 Y.
// Same steps as YUVTORGB but for V,U ordered pairs: the multiplies use the
// kVUToB, kVUToG and kVUToR coefficients at offsets 128, 144 and 160 of
// kYuvConstants. Input: xmm0 = 8 interleaved V,U pairs, xmm4 = zero.
// Reads 8 Y bytes from y_buf and advances y_buf by 8. Output: the low 8 bytes
// of xmm0, xmm1 and xmm2. Uses xmm3 as scratch.
#define YVUTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
    "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
    "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
    "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
    "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
    "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
    "movq (%[y_buf]),%%xmm3 \n" \
    "lea 0x8(%[y_buf]),%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
    "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n" \
fbarchard@google.come214fe32012-06-04 23:47:11 +00001651
// Converts a row of I444 (one U and one V byte per Y byte) to 4 byte pixels
// written to argb_buf, 8 pixels per loop pass. The stores use movdqa, so
// argb_buf must be 16 byte aligned. The loop body runs at least once and
// repeats while width stays above zero after subtracting 8.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* argb_buf,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf, as READYUV444 expects.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones, supplies the 0xff fourth byte of every pixel.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    // xmm4 = zero, used by YUVTORGB to widen Y to 16 bits.
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    READYUV444
    YUVTORGB
    // Interleave xmm0/xmm1 and xmm2/xmm5 bytes, then the resulting words,
    // giving 8 four byte pixels in xmm0 and xmm1.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,(%[argb_buf]) \n"
    "movdqa %%xmm1,0x10(%[argb_buf]) \n"
    "lea 0x20(%[argb_buf]),%[argb_buf] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1687
// Converts one row of planar Y, U and V samples to 3-byte pixels, 8 pixels
// (24 output bytes) per loop iteration.
// The YUVTORGB results are first woven into 4-byte pixels, then repacked by
// two pshufb masks and a palignr, and written with an 8-byte movq followed
// by a 16-byte movdqu, so rgb24_buf needs no particular alignment.
// The loop body runs before width is tested and width drops by 8 per pass.
// READYUV422, kYuvConstants and the shuffle masks are defined outside this
// view.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001688void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1689 const uint8* u_buf,
1690 const uint8* v_buf,
1691 uint8* rgb24_buf,
1692 int width) {
1693// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
// On __APPLE__ the two masks are therefore loaded by a separate asm
// statement instead of being operands of the main one.
// NOTE(review): this relies on xmm5/xmm6 keeping their values between the
// two asm statements; the first statement declares no outputs or clobbers,
// so nothing tells the compiler about that dependency - confirm it is safe.
1694#ifdef __APPLE__
1695 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001696 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1697 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1698 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1699 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001700#endif
1701
1702 asm volatile (
1703#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001704 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1705 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001706#endif
// v_buf becomes its byte offset from u_buf; xmm4 = zero.
1707 "sub %[u_buf],%[v_buf] \n"
1708 "pxor %%xmm4,%%xmm4 \n"
1709 ".p2align 4 \n"
1710 "1: \n"
1711 READYUV422
1712 YUVTORGB
// Weave into 4-byte pixels { xmm0[i], xmm1[i], xmm2[i], xmm2[i] }.
// The fourth byte is only padding here; presumably the masks discard it.
1713 "punpcklbw %%xmm1,%%xmm0 \n"
1714 "punpcklbw %%xmm2,%%xmm2 \n"
1715 "movdqa %%xmm0,%%xmm1 \n"
1716 "punpcklwd %%xmm2,%%xmm0 \n"
1717 "punpckhwd %%xmm2,%%xmm1 \n"
// Shuffle each half, then palignr makes xmm1 = { xmm0[12..15], xmm1[0..11] }.
1718 "pshufb %%xmm5,%%xmm0 \n"
1719 "pshufb %%xmm6,%%xmm1 \n"
1720 "palignr $0xc,%%xmm0,%%xmm1 \n"
// Output bytes 0-7 come from xmm0, bytes 8-23 from xmm1.
1721 "movq %%xmm0,(%[rgb24_buf]) \n"
1722 "movdqu %%xmm1,0x8(%[rgb24_buf]) \n"
1723 "lea 0x18(%[rgb24_buf]),%[rgb24_buf] \n"
1724 "sub $0x8,%[width] \n"
1725 "jg 1b \n"
1726 : [y_buf]"+r"(y_buf), // %[y_buf]
1727 [u_buf]"+r"(u_buf), // %[u_buf]
1728 [v_buf]"+r"(v_buf), // %[v_buf]
1729 [rgb24_buf]"+r"(rgb24_buf), // %[rgb24_buf]
1730 [width]"+rm"(width) // %[width]
1731 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1732#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001733 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1734 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001735#endif
1736 : "memory", "cc"
1737#if defined(__SSE2__)
1738 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1739#endif
1740 );
1741}
1742
// Converts one row of planar Y, U and V samples to 3-byte pixels, 8 pixels
// (24 output bytes) per loop iteration. The instruction sequence matches
// I422ToRGB24Row_SSSE3; only the two pshufb masks differ
// (kShuffleMaskARGBToRAW_0 / kShuffleMaskARGBToRAW, defined outside this
// view), so presumably only the byte order within each pixel changes.
// Output is written with movq + movdqu, so raw_buf needs no particular
// alignment. The loop body runs before width is tested and width drops by 8
// per pass.
1743void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1744 const uint8* u_buf,
1745 const uint8* v_buf,
1746 uint8* raw_buf,
1747 int width) {
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001748// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
// On __APPLE__ the two masks are therefore loaded by a separate asm
// statement instead of being operands of the main one.
// NOTE(review): this relies on xmm5/xmm6 keeping their values between the
// two asm statements; the first statement declares no outputs or clobbers,
// so nothing tells the compiler about that dependency - confirm it is safe.
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001749#ifdef __APPLE__
1750 asm volatile (
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001751 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1752 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1753 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1754 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001755#endif
1756
1757 asm volatile (
1758#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001759 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1760 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001761#endif
// v_buf becomes its byte offset from u_buf; xmm4 = zero.
1762 "sub %[u_buf],%[v_buf] \n"
1763 "pxor %%xmm4,%%xmm4 \n"
1764 ".p2align 4 \n"
1765 "1: \n"
1766 READYUV422
1767 YUVTORGB
// Weave into 4-byte pixels { xmm0[i], xmm1[i], xmm2[i], xmm2[i] }.
1768 "punpcklbw %%xmm1,%%xmm0 \n"
1769 "punpcklbw %%xmm2,%%xmm2 \n"
1770 "movdqa %%xmm0,%%xmm1 \n"
1771 "punpcklwd %%xmm2,%%xmm0 \n"
1772 "punpckhwd %%xmm2,%%xmm1 \n"
// Shuffle each half, then palignr makes xmm1 = { xmm0[12..15], xmm1[0..11] }.
1773 "pshufb %%xmm5,%%xmm0 \n"
1774 "pshufb %%xmm6,%%xmm1 \n"
1775 "palignr $0xc,%%xmm0,%%xmm1 \n"
// Output bytes 0-7 come from xmm0, bytes 8-23 from xmm1.
1776 "movq %%xmm0,(%[raw_buf]) \n"
1777 "movdqu %%xmm1,0x8(%[raw_buf]) \n"
1778 "lea 0x18(%[raw_buf]),%[raw_buf] \n"
1779 "sub $0x8,%[width] \n"
1780 "jg 1b \n"
1781 : [y_buf]"+r"(y_buf), // %[y_buf]
1782 [u_buf]"+r"(u_buf), // %[u_buf]
1783 [v_buf]"+r"(v_buf), // %[v_buf]
1784 [raw_buf]"+r"(raw_buf), // %[raw_buf]
1785 [width]"+rm"(width) // %[width]
1786 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
1787#ifndef __APPLE__
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001788 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1789 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001790#endif
1791 : "memory", "cc"
1792#if defined(__SSE2__)
fbarchard@google.com4de0c432012-10-11 01:25:46 +00001793 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00001794#endif
1795 );
1796}
1797
// Converts one row of planar Y, U and V samples to 4-byte pixels, 8 pixels
// (32 output bytes) per loop iteration. Each pixel is stored as the bytes
// { xmm0[i], xmm1[i], xmm2[i], 0xff } taken from the YUVTORGB results.
// argb_buf is written with movdqa, so it must be 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass,
// so width is effectively rounded up to a multiple of 8.
// READYUV422 and kYuvConstants are defined outside this view.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001798void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00001799 const uint8* u_buf,
1800 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00001801 uint8* argb_buf,
fbarchard@google.comdbcabea2012-10-29 21:20:25 +00001802 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00001803 asm volatile (
// v_buf becomes its byte offset from u_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001804 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001805 "pcmpeqb %%xmm5,%%xmm5 \n"
1806 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00001807 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00001808 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001809 READYUV422
1810 YUVTORGB
// Weave the three channels and xmm5 into 4-byte pixels:
// xmm0 ends up with pixels 0-3, xmm1 with pixels 4-7.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001811 "punpcklbw %%xmm1,%%xmm0 \n"
1812 "punpcklbw %%xmm5,%%xmm2 \n"
1813 "movdqa %%xmm0,%%xmm1 \n"
1814 "punpcklwd %%xmm2,%%xmm0 \n"
1815 "punpckhwd %%xmm2,%%xmm1 \n"
// Two aligned 16-byte stores, then advance the destination by 32 bytes.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001816 "movdqa %%xmm0,(%[argb_buf]) \n"
1817 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1818 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
// 8 pixels consumed; repeat while width is still positive.
1819 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001820 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001821 : [y_buf]"+r"(y_buf), // %[y_buf]
1822 [u_buf]"+r"(u_buf), // %[u_buf]
1823 [v_buf]"+r"(v_buf), // %[v_buf]
1824 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1825 [width]"+rm"(width) // %[width]
1826 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001827 : "memory", "cc"
1828#if defined(__SSE2__)
1829 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1830#endif
1831 );
1832}
1833
// Converts one row of planar Y, U and V samples to 4-byte pixels, 8 pixels
// (32 output bytes) per loop iteration. Each pixel is stored as the bytes
// { xmm0[i], xmm1[i], xmm2[i], 0xff } taken from the YUVTORGB results.
// argb_buf is written with movdqa, so it must be 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass,
// so width is effectively rounded up to a multiple of 8.
// READYUV411 and kYuvConstants are defined outside this view.
1834void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1835 const uint8* u_buf,
1836 const uint8* v_buf,
1837 uint8* argb_buf,
1838 int width) {
1839 asm volatile (
// v_buf becomes its byte offset from u_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001840 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001841 "pcmpeqb %%xmm5,%%xmm5 \n"
1842 "pxor %%xmm4,%%xmm4 \n"
1843 ".p2align 4 \n"
1844 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001845 READYUV411
1846 YUVTORGB
// Weave the three channels and xmm5 into 4-byte pixels:
// xmm0 ends up with pixels 0-3, xmm1 with pixels 4-7.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001847 "punpcklbw %%xmm1,%%xmm0 \n"
1848 "punpcklbw %%xmm5,%%xmm2 \n"
1849 "movdqa %%xmm0,%%xmm1 \n"
1850 "punpcklwd %%xmm2,%%xmm0 \n"
1851 "punpckhwd %%xmm2,%%xmm1 \n"
// Two aligned 16-byte stores, then advance the destination by 32 bytes.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001852 "movdqa %%xmm0,(%[argb_buf]) \n"
1853 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1854 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
// 8 pixels consumed; repeat while width is still positive.
1855 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001856 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001857 : [y_buf]"+r"(y_buf), // %[y_buf]
1858 [u_buf]"+r"(u_buf), // %[u_buf]
1859 [v_buf]"+r"(v_buf), // %[v_buf]
1860 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1861 [width]"+rm"(width) // %[width]
1862 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1863 : "memory", "cc"
1864#if defined(__SSE2__)
1865 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1866#endif
1867 );
1868}
1869
// Converts one row of Y samples plus a single chroma buffer (uv_buf) to
// 4-byte pixels, 8 pixels (32 output bytes) per loop iteration. Each pixel
// is stored as the bytes { xmm0[i], xmm1[i], xmm2[i], 0xff } taken from the
// YUVTORGB results.
// argb_buf is written with movdqa, so it must be 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass,
// so width is effectively rounded up to a multiple of 8.
// READNV12 and kYuvConstants are defined outside this view.
1870void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1871 const uint8* uv_buf,
1872 uint8* argb_buf,
1873 int width) {
1874 asm volatile (
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
1875 "pcmpeqb %%xmm5,%%xmm5 \n"
1876 "pxor %%xmm4,%%xmm4 \n"
1877 ".p2align 4 \n"
1878 "1: \n"
1879 READNV12
1880 YUVTORGB
// Weave the three channels and xmm5 into 4-byte pixels:
// xmm0 ends up with pixels 0-3, xmm1 with pixels 4-7.
1881 "punpcklbw %%xmm1,%%xmm0 \n"
1882 "punpcklbw %%xmm5,%%xmm2 \n"
1883 "movdqa %%xmm0,%%xmm1 \n"
1884 "punpcklwd %%xmm2,%%xmm0 \n"
1885 "punpckhwd %%xmm2,%%xmm1 \n"
// Two aligned 16-byte stores, then advance the destination by 32 bytes.
1886 "movdqa %%xmm0,(%[argb_buf]) \n"
1887 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1888 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
// 8 pixels consumed; repeat while width is still positive.
1889 "sub $0x8,%[width] \n"
1890 "jg 1b \n"
1891 : [y_buf]"+r"(y_buf), // %[y_buf]
1892 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1893 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1894 [width]"+rm"(width) // %[width]
1895 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1896 : "memory", "cc"
1897#if defined(__SSE2__)
1898 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1899#endif
1900 );
1901}
1902
// Converts one row of Y samples plus a single chroma buffer (vu_buf) to
// 4-byte pixels, 8 pixels (32 output bytes) per loop iteration. Each pixel
// is stored as the bytes { xmm0[i], xmm1[i], xmm2[i], 0xff }.
// Differs from NV12ToARGBRow_SSSE3 only in using YVUTORGB, which reads its
// coefficient vectors at byte offsets 128/144/160 from kYuvConstants.
// vu_buf is bound to the operand name [uv_buf], presumably so that READNV12
// (defined outside this view) can be reused unchanged.
// argb_buf is written with movdqa, so it must be 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass.
1903void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1904 const uint8* vu_buf,
1905 uint8* argb_buf,
1906 int width) {
1907 asm volatile (
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
1908 "pcmpeqb %%xmm5,%%xmm5 \n"
1909 "pxor %%xmm4,%%xmm4 \n"
1910 ".p2align 4 \n"
1911 "1: \n"
1912 READNV12
1913 YVUTORGB
// Weave the three channels and xmm5 into 4-byte pixels:
// xmm0 ends up with pixels 0-3, xmm1 with pixels 4-7.
1914 "punpcklbw %%xmm1,%%xmm0 \n"
1915 "punpcklbw %%xmm5,%%xmm2 \n"
1916 "movdqa %%xmm0,%%xmm1 \n"
1917 "punpcklwd %%xmm2,%%xmm0 \n"
1918 "punpckhwd %%xmm2,%%xmm1 \n"
// Two aligned 16-byte stores, then advance the destination by 32 bytes.
1919 "movdqa %%xmm0,(%[argb_buf]) \n"
1920 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1921 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
// 8 pixels consumed; repeat while width is still positive.
1922 "sub $0x8,%[width] \n"
1923 "jg 1b \n"
1924 : [y_buf]"+r"(y_buf), // %[y_buf]
1925 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1926 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1927 [width]"+rm"(width) // %[width]
1928 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001929 : "memory", "cc"
1930#if defined(__SSE2__)
1931 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1932#endif
1933 );
1934}
1935
// Same instruction sequence as I444ToARGBRow_SSSE3 except that the two
// 16-byte stores use movdqu, so argb_buf may have any alignment.
// Converts 8 pixels (32 output bytes) per loop iteration; each pixel is
// stored as the bytes { xmm0[i], xmm1[i], xmm2[i], 0xff }.
// The loop body runs before width is tested and width drops by 8 per pass,
// so width is effectively rounded up to a multiple of 8.
// READYUV444 and kYuvConstants are defined outside this view.
1936void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1937 const uint8* u_buf,
1938 const uint8* v_buf,
1939 uint8* argb_buf,
1940 int width) {
1941 asm volatile (
// v_buf becomes its byte offset from u_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001942 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001943 "pcmpeqb %%xmm5,%%xmm5 \n"
1944 "pxor %%xmm4,%%xmm4 \n"
1945 ".p2align 4 \n"
1946 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001947 READYUV444
1948 YUVTORGB
// Weave the three channels and xmm5 into 4-byte pixels:
// xmm0 ends up with pixels 0-3, xmm1 with pixels 4-7.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001949 "punpcklbw %%xmm1,%%xmm0 \n"
1950 "punpcklbw %%xmm5,%%xmm2 \n"
1951 "movdqa %%xmm0,%%xmm1 \n"
1952 "punpcklwd %%xmm2,%%xmm0 \n"
1953 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores, then advance the destination by 32 bytes.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001954 "movdqu %%xmm0,(%[argb_buf]) \n"
1955 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1956 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
// 8 pixels consumed; repeat while width is still positive.
1957 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001958 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001959 : [y_buf]"+r"(y_buf), // %[y_buf]
1960 [u_buf]"+r"(u_buf), // %[u_buf]
1961 [v_buf]"+r"(v_buf), // %[v_buf]
1962 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1963 [width]"+rm"(width) // %[width]
1964 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00001965 : "memory", "cc"
1966#if defined(__SSE2__)
1967 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1968#endif
1969 );
1970}
1971
// Same instruction sequence as I422ToARGBRow_SSSE3 except that the two
// 16-byte stores use movdqu, so argb_buf may have any alignment.
// Converts 8 pixels (32 output bytes) per loop iteration; each pixel is
// stored as the bytes { xmm0[i], xmm1[i], xmm2[i], 0xff }.
// The loop body runs before width is tested and width drops by 8 per pass,
// so width is effectively rounded up to a multiple of 8.
// READYUV422 and kYuvConstants are defined outside this view.
1972void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1973 const uint8* u_buf,
1974 const uint8* v_buf,
1975 uint8* argb_buf,
1976 int width) {
1977 asm volatile (
// v_buf becomes its byte offset from u_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001978 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001979 "pcmpeqb %%xmm5,%%xmm5 \n"
1980 "pxor %%xmm4,%%xmm4 \n"
1981 ".p2align 4 \n"
1982 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00001983 READYUV422
1984 YUVTORGB
// Weave the three channels and xmm5 into 4-byte pixels:
// xmm0 ends up with pixels 0-3, xmm1 with pixels 4-7.
fbarchard@google.come214fe32012-06-04 23:47:11 +00001985 "punpcklbw %%xmm1,%%xmm0 \n"
1986 "punpcklbw %%xmm5,%%xmm2 \n"
1987 "movdqa %%xmm0,%%xmm1 \n"
1988 "punpcklwd %%xmm2,%%xmm0 \n"
1989 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores, then advance the destination by 32 bytes.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001990 "movdqu %%xmm0,(%[argb_buf]) \n"
1991 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1992 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
// 8 pixels consumed; repeat while width is still positive.
1993 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00001994 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00001995 : [y_buf]"+r"(y_buf), // %[y_buf]
1996 [u_buf]"+r"(u_buf), // %[u_buf]
1997 [v_buf]"+r"(v_buf), // %[v_buf]
1998 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1999 [width]"+rm"(width) // %[width]
2000 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002001 : "memory", "cc"
2002#if defined(__SSE2__)
2003 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2004#endif
2005 );
2006}
2007
// Same instruction sequence as I411ToARGBRow_SSSE3 except that the two
// 16-byte stores use movdqu, so argb_buf may have any alignment.
// Converts 8 pixels (32 output bytes) per loop iteration; each pixel is
// stored as the bytes { xmm0[i], xmm1[i], xmm2[i], 0xff }.
// The loop body runs before width is tested and width drops by 8 per pass,
// so width is effectively rounded up to a multiple of 8.
// READYUV411 and kYuvConstants are defined outside this view.
2008void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2009 const uint8* u_buf,
2010 const uint8* v_buf,
2011 uint8* argb_buf,
2012 int width) {
2013 asm volatile (
// v_buf becomes its byte offset from u_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002014 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002015 "pcmpeqb %%xmm5,%%xmm5 \n"
2016 "pxor %%xmm4,%%xmm4 \n"
2017 ".p2align 4 \n"
2018 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002019 READYUV411
2020 YUVTORGB
// Weave the three channels and xmm5 into 4-byte pixels:
// xmm0 ends up with pixels 0-3, xmm1 with pixels 4-7.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002021 "punpcklbw %%xmm1,%%xmm0 \n"
2022 "punpcklbw %%xmm5,%%xmm2 \n"
2023 "movdqa %%xmm0,%%xmm1 \n"
2024 "punpcklwd %%xmm2,%%xmm0 \n"
2025 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores, then advance the destination by 32 bytes.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002026 "movdqu %%xmm0,(%[argb_buf]) \n"
2027 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2028 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
// 8 pixels consumed; repeat while width is still positive.
2029 "sub $0x8,%[width] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002030 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002031 : [y_buf]"+r"(y_buf), // %[y_buf]
2032 [u_buf]"+r"(u_buf), // %[u_buf]
2033 [v_buf]"+r"(v_buf), // %[v_buf]
2034 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2035 [width]"+rm"(width) // %[width]
2036 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2037 : "memory", "cc"
2038#if defined(__SSE2__)
2039 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2040#endif
2041 );
2042}
2043
// Same instruction sequence as NV12ToARGBRow_SSSE3 except that the two
// 16-byte stores use movdqu, so argb_buf may have any alignment.
// Converts 8 pixels (32 output bytes) per loop iteration; each pixel is
// stored as the bytes { xmm0[i], xmm1[i], xmm2[i], 0xff }.
// The loop body runs before width is tested and width drops by 8 per pass,
// so width is effectively rounded up to a multiple of 8.
// READNV12 and kYuvConstants are defined outside this view.
2044void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2045 const uint8* uv_buf,
2046 uint8* argb_buf,
2047 int width) {
2048 asm volatile (
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
2049 "pcmpeqb %%xmm5,%%xmm5 \n"
2050 "pxor %%xmm4,%%xmm4 \n"
2051 ".p2align 4 \n"
2052 "1: \n"
2053 READNV12
2054 YUVTORGB
// Weave the three channels and xmm5 into 4-byte pixels:
// xmm0 ends up with pixels 0-3, xmm1 with pixels 4-7.
2055 "punpcklbw %%xmm1,%%xmm0 \n"
2056 "punpcklbw %%xmm5,%%xmm2 \n"
2057 "movdqa %%xmm0,%%xmm1 \n"
2058 "punpcklwd %%xmm2,%%xmm0 \n"
2059 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores, then advance the destination by 32 bytes.
2060 "movdqu %%xmm0,(%[argb_buf]) \n"
2061 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2062 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
// 8 pixels consumed; repeat while width is still positive.
2063 "sub $0x8,%[width] \n"
2064 "jg 1b \n"
2065 : [y_buf]"+r"(y_buf), // %[y_buf]
2066 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2067 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2068 [width]"+rm"(width) // %[width]
2069 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2070 : "memory", "cc"
2071#if defined(__SSE2__)
2072 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2073#endif
2074 );
2075}
2076
// Same instruction sequence as NV21ToARGBRow_SSSE3 except that the two
// 16-byte stores use movdqu, so argb_buf may have any alignment.
// Converts 8 pixels (32 output bytes) per loop iteration; each pixel is
// stored as the bytes { xmm0[i], xmm1[i], xmm2[i], 0xff }.
// vu_buf is bound to the operand name [uv_buf], presumably so that READNV12
// (defined outside this view) can be reused unchanged; YVUTORGB is used in
// place of YUVTORGB.
// The loop body runs before width is tested and width drops by 8 per pass.
2077void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2078 const uint8* vu_buf,
2079 uint8* argb_buf,
2080 int width) {
2081 asm volatile (
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
2082 "pcmpeqb %%xmm5,%%xmm5 \n"
2083 "pxor %%xmm4,%%xmm4 \n"
2084 ".p2align 4 \n"
2085 "1: \n"
2086 READNV12
2087 YVUTORGB
// Weave the three channels and xmm5 into 4-byte pixels:
// xmm0 ends up with pixels 0-3, xmm1 with pixels 4-7.
2088 "punpcklbw %%xmm1,%%xmm0 \n"
2089 "punpcklbw %%xmm5,%%xmm2 \n"
2090 "movdqa %%xmm0,%%xmm1 \n"
2091 "punpcklwd %%xmm2,%%xmm0 \n"
2092 "punpckhwd %%xmm2,%%xmm1 \n"
// Two unaligned 16-byte stores, then advance the destination by 32 bytes.
2093 "movdqu %%xmm0,(%[argb_buf]) \n"
2094 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2095 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
// 8 pixels consumed; repeat while width is still positive.
2096 "sub $0x8,%[width] \n"
2097 "jg 1b \n"
2098 : [y_buf]"+r"(y_buf), // %[y_buf]
2099 [uv_buf]"+r"(vu_buf), // %[uv_buf]
2100 [argb_buf]"+r"(argb_buf), // %[argb_buf]
2101 [width]"+rm"(width) // %[width]
2102 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come214fe32012-06-04 23:47:11 +00002103 : "memory", "cc"
2104#if defined(__SSE2__)
2105 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2106#endif
2107 );
2108}
2109
// Converts one row of planar Y, U and V samples to 4-byte pixels, 8 pixels
// (32 output bytes) per loop iteration. Each pixel is stored as the bytes
// { 0xff, xmm2[i], xmm1[i], xmm0[i] } taken from the YUVTORGB results,
// i.e. the reverse byte order of I422ToARGBRow_SSSE3.
// bgra_buf (operand name [argb_buf]) is written with movdqa, so it must be
// 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass.
// READYUV422 and kYuvConstants are defined outside this view.
2110void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2111 const uint8* u_buf,
2112 const uint8* v_buf,
2113 uint8* bgra_buf,
2114 int width) {
2115 asm volatile (
// v_buf becomes its byte offset from u_buf; xmm4 = zero.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002116 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.come214fe32012-06-04 23:47:11 +00002117 "pcmpeqb %%xmm5,%%xmm5 \n"
2118 "pxor %%xmm4,%%xmm4 \n"
2119 ".p2align 4 \n"
2120 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002121 READYUV422
2122 YUVTORGB
// xmm5 is rebuilt as all ones on every pass because the punpcklbw below
// uses it as a destination and overwrites it.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002123 "pcmpeqb %%xmm5,%%xmm5 \n"
// Weave into { 0xff, xmm2, xmm1, xmm0 } pixels: xmm5 = pixels 0-3,
// xmm0 = pixels 4-7.
2124 "punpcklbw %%xmm0,%%xmm1 \n"
2125 "punpcklbw %%xmm2,%%xmm5 \n"
2126 "movdqa %%xmm5,%%xmm0 \n"
2127 "punpcklwd %%xmm1,%%xmm5 \n"
2128 "punpckhwd %%xmm1,%%xmm0 \n"
// Two aligned 16-byte stores, then advance the destination by 32 bytes.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002129 "movdqa %%xmm5,(%[argb_buf]) \n"
2130 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2131 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2132 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002133 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002134 : [y_buf]"+r"(y_buf), // %[y_buf]
2135 [u_buf]"+r"(u_buf), // %[u_buf]
2136 [v_buf]"+r"(v_buf), // %[v_buf]
2137 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2138 [width]"+rm"(width) // %[width]
2139 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002140 : "memory", "cc"
2141#if defined(__SSE2__)
2142 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2143#endif
2144 );
2145}
2146
// Converts one row of planar Y, U and V samples to 4-byte pixels, 8 pixels
// (32 output bytes) per loop iteration. Each pixel is stored as the bytes
// { xmm2[i], xmm1[i], xmm0[i], 0xff } taken from the YUVTORGB results,
// i.e. I422ToARGBRow_SSSE3 with the first and third bytes swapped.
// abgr_buf (operand name [argb_buf]) is written with movdqa, so it must be
// 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass.
// READYUV422 and kYuvConstants are defined outside this view.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002147void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002148 const uint8* u_buf,
2149 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002150 uint8* abgr_buf,
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002151 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002152 asm volatile (
// v_buf becomes its byte offset from u_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002153 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002154 "pcmpeqb %%xmm5,%%xmm5 \n"
2155 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002156 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002157 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002158 READYUV422
2159 YUVTORGB
// Weave into { xmm2, xmm1, xmm0, 0xff } pixels: xmm2 = pixels 0-3,
// xmm1 = pixels 4-7.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002160 "punpcklbw %%xmm1,%%xmm2 \n"
2161 "punpcklbw %%xmm5,%%xmm0 \n"
2162 "movdqa %%xmm2,%%xmm1 \n"
2163 "punpcklwd %%xmm0,%%xmm2 \n"
2164 "punpckhwd %%xmm0,%%xmm1 \n"
// Two aligned 16-byte stores, then advance the destination by 32 bytes.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002165 "movdqa %%xmm2,(%[argb_buf]) \n"
2166 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
2167 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2168 "sub $0x8,%[width] \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002169 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002170 : [y_buf]"+r"(y_buf), // %[y_buf]
2171 [u_buf]"+r"(u_buf), // %[u_buf]
2172 [v_buf]"+r"(v_buf), // %[v_buf]
2173 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2174 [width]"+rm"(width) // %[width]
2175 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002176 : "memory", "cc"
2177#if defined(__SSE2__)
2178 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2179#endif
2180 );
2181}
2182
// Converts one row of planar Y, U and V samples to 4-byte pixels, 8 pixels
// (32 output bytes) per loop iteration. Each pixel is stored as the bytes
// { 0xff, xmm0[i], xmm1[i], xmm2[i] } taken from the YUVTORGB results.
// rgba_buf (operand name [argb_buf]) is written with movdqa, so it must be
// 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass.
// READYUV422 and kYuvConstants are defined outside this view.
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002183void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2184 const uint8* u_buf,
2185 const uint8* v_buf,
2186 uint8* rgba_buf,
2187 int width) {
2188 asm volatile (
// v_buf becomes its byte offset from u_buf; xmm4 = zero.
2189 "sub %[u_buf],%[v_buf] \n"
2190 "pcmpeqb %%xmm5,%%xmm5 \n"
2191 "pxor %%xmm4,%%xmm4 \n"
2192 ".p2align 4 \n"
2193 "1: \n"
2194 READYUV422
2195 YUVTORGB
// xmm5 is rebuilt as all ones on every pass because the punpcklbw below
// uses it as a destination and overwrites it.
2196 "pcmpeqb %%xmm5,%%xmm5 \n"
// Weave into { 0xff, xmm0, xmm1, xmm2 } pixels: xmm5 = pixels 0-3,
// xmm0 = pixels 4-7.
2197 "punpcklbw %%xmm2,%%xmm1 \n"
2198 "punpcklbw %%xmm0,%%xmm5 \n"
2199 "movdqa %%xmm5,%%xmm0 \n"
2200 "punpcklwd %%xmm1,%%xmm5 \n"
2201 "punpckhwd %%xmm1,%%xmm0 \n"
// Two aligned 16-byte stores, then advance the destination by 32 bytes.
2202 "movdqa %%xmm5,(%[argb_buf]) \n"
2203 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
2204 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2205 "sub $0x8,%[width] \n"
2206 "jg 1b \n"
2207 : [y_buf]"+r"(y_buf), // %[y_buf]
2208 [u_buf]"+r"(u_buf), // %[u_buf]
2209 [v_buf]"+r"(v_buf), // %[v_buf]
2210 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2211 [width]"+rm"(width) // %[width]
2212 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2213 : "memory", "cc"
2214#if defined(__SSE2__)
2215 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2216#endif
2217 );
2218}
2219
// Same instruction sequence as I422ToBGRARow_SSSE3 except that the two
// 16-byte stores use movdqu, so bgra_buf may have any alignment.
// Converts 8 pixels (32 output bytes) per loop iteration; each pixel is
// stored as the bytes { 0xff, xmm2[i], xmm1[i], xmm0[i] } taken from the
// YUVTORGB results.
// The loop body runs before width is tested and width drops by 8 per pass.
// READYUV422 and kYuvConstants are defined outside this view.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002220void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002221 const uint8* u_buf,
2222 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002223 uint8* bgra_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002224 int width) {
2225 asm volatile (
// v_buf becomes its byte offset from u_buf; xmm4 = zero.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002226 "sub %[u_buf],%[v_buf] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002227 "pcmpeqb %%xmm5,%%xmm5 \n"
2228 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002229 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002230 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002231 READYUV422
2232 YUVTORGB
// xmm5 is rebuilt as all ones on every pass because the punpcklbw below
// uses it as a destination and overwrites it.
fbarchard@google.com952a5072012-03-30 18:10:50 +00002233 "pcmpeqb %%xmm5,%%xmm5 \n"
// Weave into { 0xff, xmm2, xmm1, xmm0 } pixels: xmm5 = pixels 0-3,
// xmm0 = pixels 4-7.
2234 "punpcklbw %%xmm0,%%xmm1 \n"
2235 "punpcklbw %%xmm2,%%xmm5 \n"
2236 "movdqa %%xmm5,%%xmm0 \n"
2237 "punpcklwd %%xmm1,%%xmm5 \n"
2238 "punpckhwd %%xmm1,%%xmm0 \n"
// Two unaligned 16-byte stores, then advance the destination by 32 bytes.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002239 "movdqu %%xmm5,(%[argb_buf]) \n"
2240 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
2241 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2242 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002243 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002244 : [y_buf]"+r"(y_buf), // %[y_buf]
2245 [u_buf]"+r"(u_buf), // %[u_buf]
2246 [v_buf]"+r"(v_buf), // %[v_buf]
2247 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
2248 [width]"+rm"(width) // %[width]
2249 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002250 : "memory", "cc"
2251#if defined(__SSE2__)
2252 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2253#endif
2254 );
2255}
2256
// Same instruction sequence as I422ToABGRRow_SSSE3 except that the two
// 16-byte stores use movdqu, so abgr_buf may have any alignment.
// Converts 8 pixels (32 output bytes) per loop iteration; each pixel is
// stored as the bytes { xmm2[i], xmm1[i], xmm0[i], 0xff } taken from the
// YUVTORGB results.
// The loop body runs before width is tested and width drops by 8 per pass.
// READYUV422 and kYuvConstants are defined outside this view.
fbarchard@google.come214fe32012-06-04 23:47:11 +00002257void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002258 const uint8* u_buf,
2259 const uint8* v_buf,
fbarchard@google.come214fe32012-06-04 23:47:11 +00002260 uint8* abgr_buf,
fbarchard@google.com952a5072012-03-30 18:10:50 +00002261 int width) {
2262 asm volatile (
// v_buf becomes its byte offset from u_buf.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002263 "sub %[u_buf],%[v_buf] \n"
// xmm5 = all-ones bytes (fourth byte of every pixel); xmm4 = zero.
fbarchard@google.com952a5072012-03-30 18:10:50 +00002264 "pcmpeqb %%xmm5,%%xmm5 \n"
2265 "pxor %%xmm4,%%xmm4 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002266 ".p2align 4 \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002267 "1: \n"
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002268 READYUV422
2269 YUVTORGB
// Weave into { xmm2, xmm1, xmm0, 0xff } pixels: xmm2 = pixels 0-3,
// xmm1 = pixels 4-7.
fbarchard@google.com952a5072012-03-30 18:10:50 +00002270 "punpcklbw %%xmm1,%%xmm2 \n"
2271 "punpcklbw %%xmm5,%%xmm0 \n"
2272 "movdqa %%xmm2,%%xmm1 \n"
2273 "punpcklwd %%xmm0,%%xmm2 \n"
2274 "punpckhwd %%xmm0,%%xmm1 \n"
// Two unaligned 16-byte stores, then advance the destination by 32 bytes.
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002275 "movdqu %%xmm2,(%[argb_buf]) \n"
2276 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
2277 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2278 "sub $0x8,%[width] \n"
fbarchard@google.com952a5072012-03-30 18:10:50 +00002279 "jg 1b \n"
fbarchard@google.com2d9fe082012-06-05 22:11:34 +00002280 : [y_buf]"+r"(y_buf), // %[y_buf]
2281 [u_buf]"+r"(u_buf), // %[u_buf]
2282 [v_buf]"+r"(v_buf), // %[v_buf]
2283 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
2284 [width]"+rm"(width) // %[width]
2285 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
fbarchard@google.com952a5072012-03-30 18:10:50 +00002286 : "memory", "cc"
2287#if defined(__SSE2__)
2288 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2289#endif
2290 );
2291}
fbarchard@google.come91bdac2012-10-09 21:09:33 +00002292
// Unaligned-destination variant of I422ToRGBARow_SSSE3.
// Converts one row of planar Y, U and V samples to 4-byte pixels, 8 pixels
// (32 output bytes) per loop iteration. Each pixel is stored as the bytes
// { 0xff, xmm0[i], xmm1[i], xmm2[i] } taken from the YUVTORGB results.
// The two 16-byte stores use movdqu so rgba_buf (operand name [argb_buf])
// may have any alignment; movdqa would fault on a destination that is not
// 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass.
// READYUV422 and kYuvConstants are defined outside this view.
2293void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2294 const uint8* u_buf,
2295 const uint8* v_buf,
2296 uint8* rgba_buf,
2297 int width) {
2298 asm volatile (
// v_buf becomes its byte offset from u_buf; xmm4 = zero.
2299 "sub %[u_buf],%[v_buf] \n"
2300 "pcmpeqb %%xmm5,%%xmm5 \n"
2301 "pxor %%xmm4,%%xmm4 \n"
2302 ".p2align 4 \n"
2303 "1: \n"
2304 READYUV422
2305 YUVTORGB
// xmm5 is rebuilt as all ones on every pass because the punpcklbw below
// uses it as a destination and overwrites it.
2306 "pcmpeqb %%xmm5,%%xmm5 \n"
// Weave into { 0xff, xmm0, xmm1, xmm2 } pixels: xmm5 = pixels 0-3,
// xmm0 = pixels 4-7.
2307 "punpcklbw %%xmm2,%%xmm1 \n"
2308 "punpcklbw %%xmm0,%%xmm5 \n"
2309 "movdqa %%xmm5,%%xmm0 \n"
2310 "punpcklwd %%xmm1,%%xmm5 \n"
2311 "punpckhwd %%xmm1,%%xmm0 \n"
// Two unaligned 16-byte stores, then advance the destination by 32 bytes.
2312 "movdqu %%xmm5,(%[argb_buf]) \n"
2313 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
2314 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
2315 "sub $0x8,%[width] \n"
2316 "jg 1b \n"
2317 : [y_buf]"+r"(y_buf), // %[y_buf]
2318 [u_buf]"+r"(u_buf), // %[u_buf]
2319 [v_buf]"+r"(v_buf), // %[v_buf]
2320 [argb_buf]"+r"(rgba_buf), // %[argb_buf]
2321 [width]"+rm"(width) // %[width]
2322 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2323 : "memory", "cc"
2324#if defined(__SSE2__)
2325 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2326#endif
2327 );
2328}
2329
fbarchard@google.come214fe32012-06-04 23:47:11 +00002330#endif // HAS_I422TOARGBROW_SSSE3
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002331
2332#ifdef HAS_YTOARGBROW_SSE2
// Expands 8 Y bytes per loop iteration into 8 four-byte pixels whose first
// three bytes all equal the scaled luma value and whose fourth byte is 0xff.
// Scaling is ((y - 16) * 74) >> 6 with the subtraction saturating at zero
// and the result clamped to 0..255 by packuswb; 74 / 64 (about 1.156) is the
// fixed-point stand-in for the 1.164 factor named in the step comment below.
// rgb_buf is written with movdqa, so it must be 16-byte aligned.
// The loop body runs before width is tested and width drops by 8 per pass,
// so width is effectively rounded up to a multiple of 8.
// xmm5 is written by this asm and is therefore listed in the clobbers.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002333void YToARGBRow_SSE2(const uint8* y_buf,
2334 uint8* rgb_buf,
2335 int width) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002336 asm volatile (
// xmm5 = zero, used to widen the Y bytes to 16 bits.
fbarchard@google.com30859f72012-11-02 09:51:29 +00002337 "pxor %%xmm5,%%xmm5 \n"
// xmm4 = 0xff000000 in every dword: mask that sets the fourth byte.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002338 "pcmpeqb %%xmm4,%%xmm4 \n"
2339 "pslld $0x18,%%xmm4 \n"
// xmm3 = 16 in every 16-bit lane.
fbarchard@google.com30859f72012-11-02 09:51:29 +00002340 "mov $0x00100010,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002341 "movd %%eax,%%xmm3 \n"
2342 "pshufd $0x0,%%xmm3,%%xmm3 \n"
// xmm2 = 74 (0x4a) in every 16-bit lane.
fbarchard@google.com30859f72012-11-02 09:51:29 +00002343 "mov $0x004a004a,%%eax \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002344 "movd %%eax,%%xmm2 \n"
2345 "pshufd $0x0,%%xmm2,%%xmm2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002346 ".p2align 4 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002347 "1: \n"
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002348 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002349 "movq (%0),%%xmm0 \n"
2350 "lea 0x8(%0),%0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002351 "punpcklbw %%xmm5,%%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002352 "psubusw %%xmm3,%%xmm0 \n"
fbarchard@google.com30859f72012-11-02 09:51:29 +00002353 "pmullw %%xmm2,%%xmm0 \n"
2354 "psrlw $6, %%xmm0 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002355 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002356
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002357 // Step 2: Weave into ARGB
// Each value is replicated to 4 bytes (xmm0 = pixels 0-3, xmm1 = pixels
// 4-7), then the fourth byte is forced to 0xff by OR-ing with xmm4.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002358 "punpcklbw %%xmm0,%%xmm0 \n"
2359 "movdqa %%xmm0,%%xmm1 \n"
2360 "punpcklwd %%xmm0,%%xmm0 \n"
2361 "punpckhwd %%xmm1,%%xmm1 \n"
2362 "por %%xmm4,%%xmm0 \n"
2363 "por %%xmm4,%%xmm1 \n"
2364 "movdqa %%xmm0,(%1) \n"
2365 "movdqa %%xmm1,16(%1) \n"
2366 "lea 32(%1),%1 \n"
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002367
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002368 "sub $0x8,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002369 "jg 1b \n"
fbarchard@google.com3faa0f12011-10-20 06:04:16 +00002370 : "+r"(y_buf), // %0
2371 "+r"(rgb_buf), // %1
fbarchard@google.comb6149762011-11-07 21:58:52 +00002372 "+rm"(width) // %2
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002373 :
2374 : "memory", "cc", "eax"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002375#if defined(__SSE2__)
fbarchard@google.com8b9759c2011-12-14 04:17:39 +00002376 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
fbarchard@google.comb6149762011-11-07 21:58:52 +00002377#endif
fbarchard@google.com228bdc22011-11-15 21:58:26 +00002378 );
mikhal@webrtc.org120d5e72011-10-07 17:57:17 +00002379}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002380#endif // HAS_YTOARGBROW_SSE2
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00002381
fbarchard@google.com42831e02012-01-21 02:54:17 +00002382#ifdef HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002383// Shuffle table for reversing the bytes.
// Used as the pshufb control in MirrorRow_SSSE3: output byte i takes input
// byte 15 - i, reversing all 16 bytes of a register.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002384CONST uvec8 kShuffleMirror = {
fbarchard@google.com12d04832011-11-21 23:54:38 +00002385 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2386};
2387
// Writes the bytes of src to dst in reverse order, 16 bytes per loop
// iteration: the source is read from its end towards its start while dst is
// written front to back.
// Both the load and the store use movdqa, so the load address
// (src + width - 16, stepping down by 16) and dst must be 16-byte aligned.
// The loop body runs before the remaining width is tested and the count
// drops by 16 per pass.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002388void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
// Pointer-sized copy of width so it can serve as an index register below.
fbarchard@google.com12d04832011-11-21 23:54:38 +00002389 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002390 asm volatile (
// xmm5 = byte-reversal shuffle control (kShuffleMirror).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002391 "movdqa %3,%%xmm5 \n"
// Bias src by -16 so that (src + remaining width) addresses the last
// unprocessed 16-byte group.
2392 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002393 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002394 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002395 "movdqa (%0,%2),%%xmm0 \n"
2396 "pshufb %%xmm5,%%xmm0 \n"
// The flags set by this sub are the ones jg tests; the movdqa and lea in
// between do not modify them.
2397 "sub $0x10,%2 \n"
2398 "movdqa %%xmm0,(%1) \n"
2399 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002400 "jg 1b \n"
fbarchard@google.com12d04832011-11-21 23:54:38 +00002401 : "+r"(src), // %0
2402 "+r"(dst), // %1
2403 "+r"(temp_width) // %2
fbarchard@google.com42831e02012-01-21 02:54:17 +00002404 : "m"(kShuffleMirror) // %3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002405 : "memory", "cc"
2406#if defined(__SSE2__)
2407 , "xmm0", "xmm5"
2408#endif
2409 );
2410}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002411#endif // HAS_MIRRORROW_SSSE3
fbarchard@google.com12d04832011-11-21 23:54:38 +00002412
fbarchard@google.com42831e02012-01-21 02:54:17 +00002413#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of src to dst in reverse order, 16 bytes per loop
// iteration, using only SSE2 instructions (no pshufb).
// Loads and stores use movdqu, so neither pointer needs any alignment.
// The loop body runs before the remaining width is tested and the count
// drops by 16 per pass.
fbarchard@google.com42831e02012-01-21 02:54:17 +00002414void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
// Pointer-sized copy of width so it can serve as an index register below.
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002415 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002416 asm volatile (
// Bias src by -16 so that (src + remaining width) addresses the last
// unprocessed 16-byte group.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002417 "lea -0x10(%0),%0 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002418 ".p2align 4 \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002419 "1: \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002420 "movdqu (%0,%2),%%xmm0 \n"
// Swap the two bytes inside every 16-bit word.
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002421 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002422 "psllw $0x8,%%xmm0 \n"
2423 "psrlw $0x8,%%xmm1 \n"
2424 "por %%xmm1,%%xmm0 \n"
// Reverse the four words in each 64-bit half (0x1b), then swap the two
// halves (0x4e); together with the byte swap this reverses all 16 bytes.
2425 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2426 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2427 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
// The flags set by this sub are the ones jg tests; the movdqu and lea in
// between do not modify them.
2428 "sub $0x10,%2 \n"
2429 "movdqu %%xmm0,(%1) \n"
2430 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002431 "jg 1b \n"
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002432 : "+r"(src), // %0
2433 "+r"(dst), // %1
2434 "+r"(temp_width) // %2
2435 :
2436 : "memory", "cc"
2437#if defined(__SSE2__)
2438 , "xmm0", "xmm1"
2439#endif
2440 );
2441}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002442#endif // HAS_MIRRORROW_SSE2
fbarchard@google.com373cdbd2011-12-14 21:10:07 +00002443
fbarchard@google.com16a96642012-03-02 22:38:09 +00002444#ifdef HAS_MIRRORROW_UV_SSSE3
2445// Shuffle table for reversing the bytes of UV channels.
2446CONST uvec8 kShuffleMirrorUV = {
2447 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2448};
// Splits an interleaved two-channel row into two planes while reversing it.
// Each iteration reads 16 source bytes (8 byte pairs), starting from
// src + 2 * width - 16 and walking backwards, and writes 8 bytes to dst_u
// (from the even-indexed source bytes) and 8 bytes to dst_v (from the
// odd-indexed source bytes), both in reversed order.
// The source is loaded with movdqa, so the source addresses touched must be
// 16-byte aligned; the 8-byte stores (movlpd/movhpd) have no such requirement.
// NOTE(review): the loop body runs once before the count is tested and steps
// by 8, so width is presumably a positive multiple of 8 - confirm at callers.
2449void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2450 int width) {
2451 intptr_t temp_width = static_cast<intptr_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002452 asm volatile (
fbarchard@google.com16a96642012-03-02 22:38:09 +00002453 "movdqa %4,%%xmm1 \n"
2454 "lea -16(%0,%3,2),%0 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2455 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002456 ".p2align 4 \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002457 "1: \n"
2458 "movdqa (%0),%%xmm0 \n"
2459 "lea -16(%0),%0 \n"
2460 "pshufb %%xmm1,%%xmm0 \n"
2461 "sub $8,%3 \n"
// Low 8 bytes go to dst_u, high 8 bytes go to dst_v (= dst_u + offset).
2462 "movlpd %%xmm0,(%1) \n"
2463 "movhpd %%xmm0,(%1,%2) \n"
2464 "lea 8(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002465 "jg 1b \n"
fbarchard@google.com16a96642012-03-02 22:38:09 +00002466 : "+r"(src), // %0
2467 "+r"(dst_u), // %1
2468 "+r"(dst_v), // %2
2469 "+r"(temp_width) // %3
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002470 : "m"(kShuffleMirrorUV) // %4
fbarchard@google.com16a96642012-03-02 22:38:09 +00002471 : "memory", "cc"
2472#if defined(__SSE2__)
2473 , "xmm0", "xmm1"
2474#endif
2475 );
2476}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002477#endif // HAS_MIRRORROW_UV_SSSE3
fbarchard@google.com16a96642012-03-02 22:38:09 +00002478
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002479#ifdef HAS_ARGBMIRRORROW_SSSE3
2480// Shuffle table for reversing the bytes.
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002481CONST uvec8 kARGBShuffleMirror = {
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002482 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2483};
2484
// Reverses a row of 4-byte pixels: pixel order is mirrored, bytes within a
// pixel keep their order. Processes 4 pixels (16 bytes) per iteration,
// reading the source from its end backwards (src - 16 + 4 * remaining width)
// and writing dst forwards. width is counted in pixels.
// Uses movdqa for both the load and the store, so the addresses touched must
// be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested and steps
// by 4, so width is presumably a positive multiple of 4 - confirm at callers.
2485void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2486 intptr_t temp_width = static_cast<intptr_t>(width);
2487 asm volatile (
2488 "movdqa %3,%%xmm5 \n"
2489 "lea -0x10(%0),%0 \n"
2490 ".p2align 4 \n"
2491 "1: \n"
2492 "movdqa (%0,%2,4),%%xmm0 \n"
2493 "pshufb %%xmm5,%%xmm0 \n"
2494 "sub $0x4,%2 \n"
2495 "movdqa %%xmm0,(%1) \n"
2496 "lea 0x10(%1),%1 \n"
2497 "jg 1b \n"
2498 : "+r"(src), // %0
2499 "+r"(dst), // %1
2500 "+r"(temp_width) // %2
fbarchard@google.com4d2cfd32012-06-25 20:09:31 +00002501 : "m"(kARGBShuffleMirror) // %3
fbarchard@google.com27d42c72012-06-22 23:57:26 +00002502 : "memory", "cc"
2503#if defined(__SSE2__)
2504 , "xmm0", "xmm5"
2505#endif
2506 );
2507}
2508#endif // HAS_ARGBMIRRORROW_SSSE3
2509
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002510#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of byte pairs: even-indexed bytes of src_uv go to
// dst_u, odd-indexed bytes go to dst_v. Each iteration consumes 32 source
// bytes and writes 16 bytes to each destination; pix is decremented by 16 per
// iteration.
// All loads and stores use movdqa, so src_uv, dst_u and dst_v must be 16-byte
// aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
2511void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002512 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask selecting the even bytes).
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002513 "pcmpeqb %%xmm5,%%xmm5 \n"
2514 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2515 "sub %1,%2 \n"
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002516 ".p2align 4 \n"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002517 "1: \n"
2518 "movdqa (%0),%%xmm0 \n"
2519 "movdqa 0x10(%0),%%xmm1 \n"
2520 "lea 0x20(%0),%0 \n"
2521 "movdqa %%xmm0,%%xmm2 \n"
2522 "movdqa %%xmm1,%%xmm3 \n"
// Even bytes: mask and pack 2 x 8 words down to 16 bytes.
2523 "pand %%xmm5,%%xmm0 \n"
2524 "pand %%xmm5,%%xmm1 \n"
2525 "packuswb %%xmm1,%%xmm0 \n"
// Odd bytes: shift into the low byte of each word, then pack.
2526 "psrlw $0x8,%%xmm2 \n"
2527 "psrlw $0x8,%%xmm3 \n"
2528 "packuswb %%xmm3,%%xmm2 \n"
2529 "movdqa %%xmm0,(%1) \n"
2530 "movdqa %%xmm2,(%1,%2) \n"
2531 "lea 0x10(%1),%1 \n"
2532 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002533 "jg 1b \n"
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002534 : "+r"(src_uv), // %0
2535 "+r"(dst_u), // %1
2536 "+r"(dst_v), // %2
2537 "+r"(pix) // %3
2538 :
2539 : "memory", "cc"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002540#if defined(__SSE2__)
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002541 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002542#endif
2543 );
2544}
fbarchard@google.comdb694ed2012-10-17 21:54:04 +00002545
// De-interleaves a row of byte pairs: even-indexed bytes of src_uv go to
// dst_u, odd-indexed bytes go to dst_v. Each iteration consumes 32 source
// bytes and writes 16 bytes to each destination; pix is decremented by 16 per
// iteration.
// Same instruction sequence as SplitUV_SSE2 but memory accesses use movdqu,
// so the pointers need not be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
2546void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2547 int pix) {
2548 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask selecting the even bytes).
2549 "pcmpeqb %%xmm5,%%xmm5 \n"
2550 "psrlw $0x8,%%xmm5 \n"
// Turn dst_v into an offset from dst_u so only one pointer is advanced.
2551 "sub %1,%2 \n"
2552 ".p2align 4 \n"
2553 "1: \n"
2554 "movdqu (%0),%%xmm0 \n"
2555 "movdqu 0x10(%0),%%xmm1 \n"
2556 "lea 0x20(%0),%0 \n"
2557 "movdqa %%xmm0,%%xmm2 \n"
2558 "movdqa %%xmm1,%%xmm3 \n"
2559 "pand %%xmm5,%%xmm0 \n"
2560 "pand %%xmm5,%%xmm1 \n"
2561 "packuswb %%xmm1,%%xmm0 \n"
2562 "psrlw $0x8,%%xmm2 \n"
2563 "psrlw $0x8,%%xmm3 \n"
2564 "packuswb %%xmm3,%%xmm2 \n"
2565 "movdqu %%xmm0,(%1) \n"
2566 "movdqu %%xmm2,(%1,%2) \n"
2567 "lea 0x10(%1),%1 \n"
2568 "sub $0x10,%3 \n"
2569 "jg 1b \n"
2570 : "+r"(src_uv), // %0
2571 "+r"(dst_u), // %1
2572 "+r"(dst_v), // %2
2573 "+r"(pix) // %3
2574 :
2575 : "memory", "cc"
2576#if defined(__SSE2__)
2577 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2578#endif
2579 );
2580}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002581#endif // HAS_SPLITUV_SSE2
fbarchard@google.com2d11d432012-02-16 02:50:39 +00002582
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002583#ifdef HAS_MERGEUV_SSE2
// Interleaves two byte planes into one row: dst_uv receives
// src_u[0], src_v[0], src_u[1], src_v[1], ...
// Each iteration reads 16 bytes from each source and writes 32 bytes to
// dst_uv; width is decremented by 16 per iteration.
// All loads and stores use movdqa, so src_u, src_v and dst_uv must be 16-byte
// aligned.
// NOTE(review): the loop body runs once before the count is tested, so width
// is presumably a positive multiple of 16 - confirm at callers.
2584void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2585 int width) {
2586 asm volatile (
// Turn src_v into an offset from src_u so only one source pointer advances.
2587 "sub %0,%1 \n"
2588 ".p2align 4 \n"
2589 "1: \n"
2590 "movdqa (%0),%%xmm0 \n"
2591 "movdqa (%0,%1,1),%%xmm1 \n"
2592 "lea 0x10(%0),%0 \n"
2593 "movdqa %%xmm0,%%xmm2 \n"
// Interleave the low 8 and the high 8 bytes of the two sources.
2594 "punpcklbw %%xmm1,%%xmm0 \n"
2595 "punpckhbw %%xmm1,%%xmm2 \n"
2596 "movdqa %%xmm0,(%2) \n"
2597 "movdqa %%xmm2,0x10(%2) \n"
2598 "lea 0x20(%2),%2 \n"
2599 "sub $0x10,%3 \n"
2600 "jg 1b \n"
2601 : "+r"(src_u), // %0
2602 "+r"(src_v), // %1
2603 "+r"(dst_uv), // %2
2604 "+r"(width) // %3
2605 :
2606 : "memory", "cc"
2607#if defined(__SSE2__)
2608 , "xmm0", "xmm1", "xmm2"
2609#endif
2610 );
2611}
fbarchard@google.come0d86482012-10-27 19:07:55 +00002612
// Interleaves two byte planes into one row: dst_uv receives
// src_u[0], src_v[0], src_u[1], src_v[1], ...
// Each iteration reads 16 bytes from each source and writes 32 bytes to
// dst_uv; width is decremented by 16 per iteration.
// Same instruction sequence as MergeUV_SSE2 but memory accesses use movdqu,
// so the pointers need not be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested, so width
// is presumably a positive multiple of 16 - confirm at callers.
2613void MergeUV_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
2614 uint8* dst_uv, int width) {
2615 asm volatile (
// Turn src_v into an offset from src_u so only one source pointer advances.
2616 "sub %0,%1 \n"
2617 ".p2align 4 \n"
2618 "1: \n"
2619 "movdqu (%0),%%xmm0 \n"
2620 "movdqu (%0,%1,1),%%xmm1 \n"
2621 "lea 0x10(%0),%0 \n"
2622 "movdqa %%xmm0,%%xmm2 \n"
2623 "punpcklbw %%xmm1,%%xmm0 \n"
2624 "punpckhbw %%xmm1,%%xmm2 \n"
2625 "movdqu %%xmm0,(%2) \n"
2626 "movdqu %%xmm2,0x10(%2) \n"
2627 "lea 0x20(%2),%2 \n"
2628 "sub $0x10,%3 \n"
2629 "jg 1b \n"
2630 : "+r"(src_u), // %0
2631 "+r"(src_v), // %1
2632 "+r"(dst_uv), // %2
2633 "+r"(width) // %3
2634 :
2635 : "memory", "cc"
2636#if defined(__SSE2__)
2637 , "xmm0", "xmm1", "xmm2"
2638#endif
2639 );
2640}
fbarchard@google.com1dafd442012-10-26 08:27:36 +00002641#endif // HAS_MERGEUV_SSE2
2642
fbarchard@google.com19932f82012-02-16 22:19:14 +00002643#ifdef HAS_COPYROW_SSE2
// Copies count bytes from src to dst, 32 bytes per iteration.
// All loads and stores use movdqa, so src and dst must be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested and steps
// by 32, so count is presumably a positive multiple of 32 - confirm at callers.
2644void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002645 asm volatile (
// Turn dst into an offset from src so a single pointer drives both streams.
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002646 "sub %0,%1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00002647 ".p2align 4 \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002648 "1: \n"
2649 "movdqa (%0),%%xmm0 \n"
2650 "movdqa 0x10(%0),%%xmm1 \n"
2651 "movdqa %%xmm0,(%0,%1) \n"
2652 "movdqa %%xmm1,0x10(%0,%1) \n"
2653 "lea 0x20(%0),%0 \n"
2654 "sub $0x20,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002655 "jg 1b \n"
fbarchard@google.com19932f82012-02-16 22:19:14 +00002656 : "+r"(src), // %0
2657 "+r"(dst), // %1
2658 "+r"(count) // %2
2659 :
2660 : "memory", "cc"
2661#if defined(__SSE2__)
2662 , "xmm0", "xmm1"
2663#endif
2664 );
2665}
2666#endif // HAS_COPYROW_SSE2
2667
2668#ifdef HAS_COPYROW_X86
// Copies width / 4 32-bit words from src to dst using "rep movsl".
// width is given in bytes and is divided by 4 (shr) before the copy, so any
// remainder of width modulo 4 is not copied.
// src, dst and the count are bound to the registers rep movsl implicitly
// uses (constraints "S", "D" and "c").
2669void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2670 size_t width_tmp = static_cast<size_t>(width);
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002671 asm volatile (
fbarchard@google.com19932f82012-02-16 22:19:14 +00002672 "shr $0x2,%2 \n"
2673 "rep movsl \n"
2674 : "+S"(src), // %0
2675 "+D"(dst), // %1
2676 "+c"(width_tmp) // %2
2677 :
2678 : "memory", "cc"
2679 );
2680}
fbarchard@google.com4c416e82012-06-05 15:48:07 +00002681#endif // HAS_COPYROW_X86
fbarchard@google.com19932f82012-02-16 22:19:14 +00002682
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00002683#ifdef HAS_SETROW_X86
// Fills dst with the 32-bit value v32 using "rep stosl".
// width is given in bytes and is divided by 4 (shr) to get the number of
// 32-bit stores, so any remainder of width modulo 4 is not written.
// NOTE(review): to act as a byte fill, v32 presumably holds the same byte
// replicated four times - confirm at callers.
2684void SetRow8_X86(uint8* dst, uint32 v32, int width) {
2685 size_t width_tmp = static_cast<size_t>(width);
2686 asm volatile (
2687 "shr $0x2,%1 \n"
2688 "rep stosl \n"
2689 : "+D"(dst), // %0
2690 "+c"(width_tmp) // %1
2691 : "a"(v32) // %2
2692 : "memory", "cc");
2693}
2694
2695void SetRows32_X86(uint8* dst, uint32 v32, int width,
2696 int dst_stride, int height) {
2697 for (int y = 0; y < height; ++y) {
2698 size_t width_tmp = static_cast<size_t>(width);
2699 uint32* d = reinterpret_cast<uint32*>(dst);
2700 asm volatile (
2701 "rep stosl \n"
2702 : "+D"(d), // %0
2703 "+c"(width_tmp) // %1
2704 : "a"(v32) // %2
2705 : "memory", "cc");
2706 dst += dst_stride;
2707 }
2708}
2709#endif // HAS_SETROW_X86
2710
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00002711#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the even-indexed bytes of src_yuy2 (the Y samples in YUY2 byte
// order) into dst_y. Each iteration consumes 32 source bytes and writes 16
// bytes; pix is decremented by 16 per iteration.
// All loads and stores use movdqa, so src_yuy2 and dst_y must be 16-byte
// aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002712void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002713 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask selecting the even bytes).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002714 "pcmpeqb %%xmm5,%%xmm5 \n"
2715 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002716 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002717 "1: \n"
2718 "movdqa (%0),%%xmm0 \n"
2719 "movdqa 0x10(%0),%%xmm1 \n"
2720 "lea 0x20(%0),%0 \n"
2721 "pand %%xmm5,%%xmm0 \n"
2722 "pand %%xmm5,%%xmm1 \n"
2723 "packuswb %%xmm1,%%xmm0 \n"
2724 "movdqa %%xmm0,(%1) \n"
2725 "lea 0x10(%1),%1 \n"
2726 "sub $0x10,%2 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002727 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002728 : "+r"(src_yuy2), // %0
2729 "+r"(dst_y), // %1
2730 "+r"(pix) // %2
2731 :
2732 : "memory", "cc"
2733#if defined(__SSE2__)
2734 , "xmm0", "xmm1", "xmm5"
2735#endif
2736 );
2737}
2738
// Extracts U and V planes from two YUY2 rows, averaging them vertically.
// Each iteration loads 32 bytes from src_yuy2 and 32 bytes from
// src_yuy2 + stride_yuy2, averages the rows with pavgb, keeps the odd-indexed
// bytes (the interleaved chroma samples), and splits them into 8 bytes
// written to dst_u and 8 bytes written to dst_v. pix is decremented by 16 per
// iteration.
// Source loads use movdqa, so both source rows must be 16-byte aligned; the
// 8-byte stores use movq.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
2739void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002740 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002741 asm volatile (
// xmm5 = 0x00ff in every 16-bit word; dst_v becomes an offset from dst_u.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002742 "pcmpeqb %%xmm5,%%xmm5 \n"
2743 "psrlw $0x8,%%xmm5 \n"
2744 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002745 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002746 "1: \n"
2747 "movdqa (%0),%%xmm0 \n"
2748 "movdqa 0x10(%0),%%xmm1 \n"
2749 "movdqa (%0,%4,1),%%xmm2 \n"
2750 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2751 "lea 0x20(%0),%0 \n"
2752 "pavgb %%xmm2,%%xmm0 \n"
2753 "pavgb %%xmm3,%%xmm1 \n"
// Keep the odd bytes (interleaved chroma) and pack them to 16 bytes.
2754 "psrlw $0x8,%%xmm0 \n"
2755 "psrlw $0x8,%%xmm1 \n"
2756 "packuswb %%xmm1,%%xmm0 \n"
// Split the packed chroma: even bytes -> xmm0 (dst_u), odd bytes -> xmm1
// (dst_v).
2757 "movdqa %%xmm0,%%xmm1 \n"
2758 "pand %%xmm5,%%xmm0 \n"
2759 "packuswb %%xmm0,%%xmm0 \n"
2760 "psrlw $0x8,%%xmm1 \n"
2761 "packuswb %%xmm1,%%xmm1 \n"
2762 "movq %%xmm0,(%1) \n"
2763 "movq %%xmm1,(%1,%2) \n"
2764 "lea 0x8(%1),%1 \n"
2765 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002766 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002767 : "+r"(src_yuy2), // %0
2768 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002769 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002770 "+r"(pix) // %3
2771 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2772 : "memory", "cc"
2773#if defined(__SSE2__)
2774 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2775#endif
2776 );
2777}
2778
// Extracts U and V planes from a single YUY2 row (no vertical averaging).
// Each iteration loads 32 source bytes, keeps the odd-indexed bytes (the
// interleaved chroma samples), and splits them into 8 bytes written to dst_u
// and 8 bytes written to dst_v. pix is decremented by 16 per iteration.
// Source loads use movdqa, so src_yuy2 must be 16-byte aligned; the 8-byte
// stores use movq.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002779void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2780 uint8* dst_u, uint8* dst_v, int pix) {
2781 asm volatile (
// xmm5 = 0x00ff in every 16-bit word; dst_v becomes an offset from dst_u.
2782 "pcmpeqb %%xmm5,%%xmm5 \n"
2783 "psrlw $0x8,%%xmm5 \n"
2784 "sub %1,%2 \n"
2785 ".p2align 4 \n"
2786 "1: \n"
2787 "movdqa (%0),%%xmm0 \n"
2788 "movdqa 0x10(%0),%%xmm1 \n"
2789 "lea 0x20(%0),%0 \n"
2790 "psrlw $0x8,%%xmm0 \n"
2791 "psrlw $0x8,%%xmm1 \n"
2792 "packuswb %%xmm1,%%xmm0 \n"
2793 "movdqa %%xmm0,%%xmm1 \n"
2794 "pand %%xmm5,%%xmm0 \n"
2795 "packuswb %%xmm0,%%xmm0 \n"
2796 "psrlw $0x8,%%xmm1 \n"
2797 "packuswb %%xmm1,%%xmm1 \n"
2798 "movq %%xmm0,(%1) \n"
2799 "movq %%xmm1,(%1,%2) \n"
2800 "lea 0x8(%1),%1 \n"
2801 "sub $0x10,%3 \n"
2802 "jg 1b \n"
2803 : "+r"(src_yuy2), // %0
2804 "+r"(dst_u), // %1
2805 "+r"(dst_v), // %2
2806 "+r"(pix) // %3
2807 :
2808 : "memory", "cc"
2809#if defined(__SSE2__)
2810 , "xmm0", "xmm1", "xmm5"
2811#endif
2812 );
2813}
fbarchard@google.comf3fb7b62012-03-29 23:19:34 +00002814
// Extracts the even-indexed bytes of src_yuy2 (the Y samples in YUY2 byte
// order) into dst_y. Each iteration consumes 32 source bytes and writes 16
// bytes; pix is decremented by 16 per iteration.
// Memory accesses use movdqu, so the pointers need not be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002815void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2816 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002817 asm volatile (
// xmm5 = 0x00ff in every 16-bit word (mask selecting the even bytes).
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002818 "pcmpeqb %%xmm5,%%xmm5 \n"
2819 "psrlw $0x8,%%xmm5 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002820 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002821 "1: \n"
2822 "movdqu (%0),%%xmm0 \n"
2823 "movdqu 0x10(%0),%%xmm1 \n"
2824 "lea 0x20(%0),%0 \n"
2825 "pand %%xmm5,%%xmm0 \n"
2826 "pand %%xmm5,%%xmm1 \n"
2827 "packuswb %%xmm1,%%xmm0 \n"
// The flags set by this sub are consumed by jg below; the movdqu and lea in
// between do not modify flags.
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002828 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002829 "movdqu %%xmm0,(%1) \n"
2830 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002831 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002832 : "+r"(src_yuy2), // %0
2833 "+r"(dst_y), // %1
2834 "+r"(pix) // %2
2835 :
2836 : "memory", "cc"
2837#if defined(__SSE2__)
2838 , "xmm0", "xmm1", "xmm5"
2839#endif
2840 );
2841}
2842
// Extracts U and V planes from two YUY2 rows, averaging them vertically.
// Each iteration loads 32 bytes from src_yuy2 and 32 bytes from
// src_yuy2 + stride_yuy2, averages the rows with pavgb, keeps the odd-indexed
// bytes (the interleaved chroma samples), and splits them into 8 bytes
// written to dst_u and 8 bytes written to dst_v. pix is decremented by 16 per
// iteration.
// Same instruction sequence as YUY2ToUVRow_SSE2 but source loads use movdqu,
// so the source rows need not be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
2843void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2844 int stride_yuy2,
fbarchard@google.comd4164fb2012-08-30 20:42:01 +00002845 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002846 asm volatile (
// xmm5 = 0x00ff in every 16-bit word; dst_v becomes an offset from dst_u.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002847 "pcmpeqb %%xmm5,%%xmm5 \n"
2848 "psrlw $0x8,%%xmm5 \n"
2849 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002850 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002851 "1: \n"
2852 "movdqu (%0),%%xmm0 \n"
2853 "movdqu 0x10(%0),%%xmm1 \n"
2854 "movdqu (%0,%4,1),%%xmm2 \n"
2855 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2856 "lea 0x20(%0),%0 \n"
2857 "pavgb %%xmm2,%%xmm0 \n"
2858 "pavgb %%xmm3,%%xmm1 \n"
2859 "psrlw $0x8,%%xmm0 \n"
2860 "psrlw $0x8,%%xmm1 \n"
2861 "packuswb %%xmm1,%%xmm0 \n"
2862 "movdqa %%xmm0,%%xmm1 \n"
2863 "pand %%xmm5,%%xmm0 \n"
2864 "packuswb %%xmm0,%%xmm0 \n"
2865 "psrlw $0x8,%%xmm1 \n"
2866 "packuswb %%xmm1,%%xmm1 \n"
2867 "movq %%xmm0,(%1) \n"
2868 "movq %%xmm1,(%1,%2) \n"
2869 "lea 0x8(%1),%1 \n"
2870 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002871 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002872 : "+r"(src_yuy2), // %0
2873 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002874 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002875 "+r"(pix) // %3
2876 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2877 : "memory", "cc"
2878#if defined(__SSE2__)
2879 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2880#endif
fbarchard@google.comf1b60632012-02-17 19:27:20 +00002881 );
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002882}
2883
// Extracts U and V planes from a single YUY2 row (no vertical averaging).
// Each iteration loads 32 source bytes, keeps the odd-indexed bytes (the
// interleaved chroma samples), and splits them into 8 bytes written to dst_u
// and 8 bytes written to dst_v. pix is decremented by 16 per iteration.
// Same instruction sequence as YUY2ToUV422Row_SSE2 but source loads use
// movdqu, so src_yuy2 need not be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002884void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2885 uint8* dst_u, uint8* dst_v, int pix) {
2886 asm volatile (
// xmm5 = 0x00ff in every 16-bit word; dst_v becomes an offset from dst_u.
2887 "pcmpeqb %%xmm5,%%xmm5 \n"
2888 "psrlw $0x8,%%xmm5 \n"
2889 "sub %1,%2 \n"
2890 ".p2align 4 \n"
2891 "1: \n"
2892 "movdqu (%0),%%xmm0 \n"
2893 "movdqu 0x10(%0),%%xmm1 \n"
2894 "lea 0x20(%0),%0 \n"
2895 "psrlw $0x8,%%xmm0 \n"
2896 "psrlw $0x8,%%xmm1 \n"
2897 "packuswb %%xmm1,%%xmm0 \n"
2898 "movdqa %%xmm0,%%xmm1 \n"
2899 "pand %%xmm5,%%xmm0 \n"
2900 "packuswb %%xmm0,%%xmm0 \n"
2901 "psrlw $0x8,%%xmm1 \n"
2902 "packuswb %%xmm1,%%xmm1 \n"
2903 "movq %%xmm0,(%1) \n"
2904 "movq %%xmm1,(%1,%2) \n"
2905 "lea 0x8(%1),%1 \n"
2906 "sub $0x10,%3 \n"
2907 "jg 1b \n"
2908 : "+r"(src_yuy2), // %0
2909 "+r"(dst_u), // %1
2910 "+r"(dst_v), // %2
2911 "+r"(pix) // %3
2912 :
2913 : "memory", "cc"
2914#if defined(__SSE2__)
2915 , "xmm0", "xmm1", "xmm5"
2916#endif
2917 );
2918}
2919
// Extracts the odd-indexed bytes of src_uyvy (the Y samples in UYVY byte
// order) into dst_y. Each iteration consumes 32 source bytes and writes 16
// bytes; pix is decremented by 16 per iteration.
// All loads and stores use movdqa, so src_uyvy and dst_y must be 16-byte
// aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002920void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002921 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002922 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002923 "1: \n"
2924 "movdqa (%0),%%xmm0 \n"
2925 "movdqa 0x10(%0),%%xmm1 \n"
2926 "lea 0x20(%0),%0 \n"
// Shift the odd byte of each word into the low byte, then pack to 16 bytes.
2927 "psrlw $0x8,%%xmm0 \n"
2928 "psrlw $0x8,%%xmm1 \n"
2929 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002930 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002931 "movdqa %%xmm0,(%1) \n"
2932 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002933 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002934 : "+r"(src_uyvy), // %0
2935 "+r"(dst_y), // %1
2936 "+r"(pix) // %2
2937 :
2938 : "memory", "cc"
2939#if defined(__SSE2__)
2940 , "xmm0", "xmm1"
2941#endif
2942 );
2943}
2944
// Extracts U and V planes from two UYVY rows, averaging them vertically.
// Each iteration loads 32 bytes from src_uyvy and 32 bytes from
// src_uyvy + stride_uyvy, averages the rows with pavgb, keeps the
// even-indexed bytes (the interleaved chroma samples), and splits them into
// 8 bytes written to dst_u and 8 bytes written to dst_v. pix is decremented
// by 16 per iteration.
// Source loads use movdqa, so both source rows must be 16-byte aligned; the
// 8-byte stores use movq.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
2945void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00002946 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00002947 asm volatile (
// xmm5 = 0x00ff in every 16-bit word; dst_v becomes an offset from dst_u.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002948 "pcmpeqb %%xmm5,%%xmm5 \n"
2949 "psrlw $0x8,%%xmm5 \n"
2950 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00002951 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002952 "1: \n"
2953 "movdqa (%0),%%xmm0 \n"
2954 "movdqa 0x10(%0),%%xmm1 \n"
2955 "movdqa (%0,%4,1),%%xmm2 \n"
2956 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2957 "lea 0x20(%0),%0 \n"
2958 "pavgb %%xmm2,%%xmm0 \n"
2959 "pavgb %%xmm3,%%xmm1 \n"
// Keep the even bytes (interleaved chroma) and pack them to 16 bytes.
2960 "pand %%xmm5,%%xmm0 \n"
2961 "pand %%xmm5,%%xmm1 \n"
2962 "packuswb %%xmm1,%%xmm0 \n"
// Split the packed chroma: even bytes -> xmm0 (dst_u), odd bytes -> xmm1
// (dst_v).
2963 "movdqa %%xmm0,%%xmm1 \n"
2964 "pand %%xmm5,%%xmm0 \n"
2965 "packuswb %%xmm0,%%xmm0 \n"
2966 "psrlw $0x8,%%xmm1 \n"
2967 "packuswb %%xmm1,%%xmm1 \n"
2968 "movq %%xmm0,(%1) \n"
2969 "movq %%xmm1,(%1,%2) \n"
2970 "lea 0x8(%1),%1 \n"
2971 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00002972 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002973 : "+r"(src_uyvy), // %0
2974 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00002975 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00002976 "+r"(pix) // %3
2977 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2978 : "memory", "cc"
2979#if defined(__SSE2__)
2980 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2981#endif
2982 );
2983}
2984
// Extracts U and V planes from a single UYVY row (no vertical averaging).
// Each iteration loads 32 source bytes, keeps the even-indexed bytes (the
// interleaved chroma samples), and splits them into 8 bytes written to dst_u
// and 8 bytes written to dst_v. pix is decremented by 16 per iteration.
// Source loads use movdqa, so src_uyvy must be 16-byte aligned; the 8-byte
// stores use movq.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
fbarchard@google.comc704f782012-08-30 19:53:48 +00002985void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2986 uint8* dst_u, uint8* dst_v, int pix) {
2987 asm volatile (
// xmm5 = 0x00ff in every 16-bit word; dst_v becomes an offset from dst_u.
2988 "pcmpeqb %%xmm5,%%xmm5 \n"
2989 "psrlw $0x8,%%xmm5 \n"
2990 "sub %1,%2 \n"
2991 ".p2align 4 \n"
2992 "1: \n"
2993 "movdqa (%0),%%xmm0 \n"
2994 "movdqa 0x10(%0),%%xmm1 \n"
2995 "lea 0x20(%0),%0 \n"
2996 "pand %%xmm5,%%xmm0 \n"
2997 "pand %%xmm5,%%xmm1 \n"
2998 "packuswb %%xmm1,%%xmm0 \n"
2999 "movdqa %%xmm0,%%xmm1 \n"
3000 "pand %%xmm5,%%xmm0 \n"
3001 "packuswb %%xmm0,%%xmm0 \n"
3002 "psrlw $0x8,%%xmm1 \n"
3003 "packuswb %%xmm1,%%xmm1 \n"
3004 "movq %%xmm0,(%1) \n"
3005 "movq %%xmm1,(%1,%2) \n"
3006 "lea 0x8(%1),%1 \n"
3007 "sub $0x10,%3 \n"
3008 "jg 1b \n"
3009 : "+r"(src_uyvy), // %0
3010 "+r"(dst_u), // %1
3011 "+r"(dst_v), // %2
3012 "+r"(pix) // %3
3013 :
3014 : "memory", "cc"
3015#if defined(__SSE2__)
3016 , "xmm0", "xmm1", "xmm5"
3017#endif
3018 );
3019}
3020
// Extracts the odd-indexed bytes of src_uyvy (the Y samples in UYVY byte
// order) into dst_y. Each iteration consumes 32 source bytes and writes 16
// bytes; pix is decremented by 16 per iteration.
// Memory accesses use movdqu, so the pointers need not be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003021void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3022 uint8* dst_y, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003023 asm volatile (
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003024 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003025 "1: \n"
3026 "movdqu (%0),%%xmm0 \n"
3027 "movdqu 0x10(%0),%%xmm1 \n"
3028 "lea 0x20(%0),%0 \n"
3029 "psrlw $0x8,%%xmm0 \n"
3030 "psrlw $0x8,%%xmm1 \n"
3031 "packuswb %%xmm1,%%xmm0 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003032 "sub $0x10,%2 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003033 "movdqu %%xmm0,(%1) \n"
3034 "lea 0x10(%1),%1 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003035 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003036 : "+r"(src_uyvy), // %0
3037 "+r"(dst_y), // %1
3038 "+r"(pix) // %2
3039 :
3040 : "memory", "cc"
3041#if defined(__SSE2__)
3042 , "xmm0", "xmm1"
3043#endif
3044 );
3045}
3046
// Extracts U and V planes from two UYVY rows, averaging them vertically.
// Each iteration loads 32 bytes from src_uyvy and 32 bytes from
// src_uyvy + stride_uyvy, averages the rows with pavgb, keeps the
// even-indexed bytes (the interleaved chroma samples), and splits them into
// 8 bytes written to dst_u and 8 bytes written to dst_v. pix is decremented
// by 16 per iteration.
// Same instruction sequence as UYVYToUVRow_SSE2 but source loads use movdqu,
// so the source rows need not be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
3047void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
fbarchard@google.comc704f782012-08-30 19:53:48 +00003048 uint8* dst_u, uint8* dst_v, int pix) {
fbarchard@google.come14b2ab2012-03-26 16:15:15 +00003049 asm volatile (
// xmm5 = 0x00ff in every 16-bit word; dst_v becomes an offset from dst_u.
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003050 "pcmpeqb %%xmm5,%%xmm5 \n"
3051 "psrlw $0x8,%%xmm5 \n"
3052 "sub %1,%2 \n"
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003053 ".p2align 4 \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003054 "1: \n"
3055 "movdqu (%0),%%xmm0 \n"
3056 "movdqu 0x10(%0),%%xmm1 \n"
3057 "movdqu (%0,%4,1),%%xmm2 \n"
3058 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
3059 "lea 0x20(%0),%0 \n"
3060 "pavgb %%xmm2,%%xmm0 \n"
3061 "pavgb %%xmm3,%%xmm1 \n"
3062 "pand %%xmm5,%%xmm0 \n"
3063 "pand %%xmm5,%%xmm1 \n"
3064 "packuswb %%xmm1,%%xmm0 \n"
3065 "movdqa %%xmm0,%%xmm1 \n"
3066 "pand %%xmm5,%%xmm0 \n"
3067 "packuswb %%xmm0,%%xmm0 \n"
3068 "psrlw $0x8,%%xmm1 \n"
3069 "packuswb %%xmm1,%%xmm1 \n"
3070 "movq %%xmm0,(%1) \n"
3071 "movq %%xmm1,(%1,%2) \n"
3072 "lea 0x8(%1),%1 \n"
3073 "sub $0x10,%3 \n"
fbarchard@google.com18184fd2012-03-12 18:53:19 +00003074 "jg 1b \n"
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003075 : "+r"(src_uyvy), // %0
3076 "+r"(dst_u), // %1
fbarchard@google.comc704f782012-08-30 19:53:48 +00003077 "+r"(dst_v), // %2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003078 "+r"(pix) // %3
3079 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
3080 : "memory", "cc"
3081#if defined(__SSE2__)
3082 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3083#endif
3084 );
3085}
fbarchard@google.comc704f782012-08-30 19:53:48 +00003086
// Extracts U and V planes from a single UYVY row (no vertical averaging).
// Each iteration loads 32 source bytes, keeps the even-indexed bytes (the
// interleaved chroma samples), and splits them into 8 bytes written to dst_u
// and 8 bytes written to dst_v. pix is decremented by 16 per iteration.
// Same instruction sequence as UYVYToUV422Row_SSE2 but source loads use
// movdqu, so src_uyvy need not be 16-byte aligned.
// NOTE(review): the loop body runs once before the count is tested, so pix is
// presumably a positive multiple of 16 - confirm at callers.
3087void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3088 uint8* dst_u, uint8* dst_v, int pix) {
3089 asm volatile (
// xmm5 = 0x00ff in every 16-bit word; dst_v becomes an offset from dst_u.
3090 "pcmpeqb %%xmm5,%%xmm5 \n"
3091 "psrlw $0x8,%%xmm5 \n"
3092 "sub %1,%2 \n"
3093 ".p2align 4 \n"
3094 "1: \n"
3095 "movdqu (%0),%%xmm0 \n"
3096 "movdqu 0x10(%0),%%xmm1 \n"
3097 "lea 0x20(%0),%0 \n"
3098 "pand %%xmm5,%%xmm0 \n"
3099 "pand %%xmm5,%%xmm1 \n"
3100 "packuswb %%xmm1,%%xmm0 \n"
3101 "movdqa %%xmm0,%%xmm1 \n"
3102 "pand %%xmm5,%%xmm0 \n"
3103 "packuswb %%xmm0,%%xmm0 \n"
3104 "psrlw $0x8,%%xmm1 \n"
3105 "packuswb %%xmm1,%%xmm1 \n"
3106 "movq %%xmm0,(%1) \n"
3107 "movq %%xmm1,(%1,%2) \n"
3108 "lea 0x8(%1),%1 \n"
3109 "sub $0x10,%3 \n"
3110 "jg 1b \n"
3111 : "+r"(src_uyvy), // %0
3112 "+r"(dst_u), // %1
3113 "+r"(dst_v), // %2
3114 "+r"(pix) // %3
3115 :
3116 : "memory", "cc"
3117#if defined(__SSE2__)
3118 , "xmm0", "xmm1", "xmm5"
3119#endif
3120 );
3121}
fbarchard@google.comb95dbf22012-02-11 01:18:30 +00003122#endif // HAS_YUY2TOYROW_SSE2
fbarchard@google.come5f3fd42012-02-06 22:40:32 +00003123
fbarchard@google.comba3aeed2012-03-07 18:24:51 +00003124#ifdef HAS_ARGBBLENDROW_SSE2
fbarchard@google.com96af8702012-04-06 18:22:27 +00003125// Blends src_argb0 over src_argb1 into dst_argb, 4 pixels at a time in the
// main loop and 1 pixel at a time for the head and tail.
// Per 4-byte pixel, with a = byte 3 of the src_argb0 pixel, the code computes
//   dst = (src_argb0 | 0xff000000) + ((src_argb1 * (256 - a)) >> 8)
// per byte with unsigned saturation, so byte 3 of every output pixel is 0xff.
// width is counted in pixels; nothing is written when width <= 0.
// Structure:
//   10: single pixels until dst_argb is 16-byte aligned (or width runs out),
//   41: groups of 4 pixels using unaligned loads and an aligned store,
//   91: remaining single pixels.
// NOTE(review): the formula matches blending a premultiplied-alpha foreground
// over a background - confirm the intended pixel format with callers.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003126void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3127 uint8* dst_argb, int width) {
fbarchard@google.comc757f302012-04-03 00:49:16 +00003128 asm volatile (
// Constants: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
// xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword.
3129 "pcmpeqb %%xmm7,%%xmm7 \n"
3130 "psrlw $0xf,%%xmm7 \n"
3131 "pcmpeqb %%xmm6,%%xmm6 \n"
3132 "psrlw $0x8,%%xmm6 \n"
3133 "pcmpeqb %%xmm5,%%xmm5 \n"
3134 "psllw $0x8,%%xmm5 \n"
3135 "pcmpeqb %%xmm4,%%xmm4 \n"
3136 "pslld $0x18,%%xmm4 \n"
// width is kept biased by -1 from here on: exactly one pixel jumps straight
// to the single-pixel loop, zero or negative width exits.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003137 "sub $0x1,%3 \n"
3138 "je 91f \n"
3139 "jl 99f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003140
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003141 // 1 pixel loop until destination pointer is aligned.
3142 "10: \n"
3143 "test $0xf,%2 \n"
3144 "je 19f \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003145 "movd (%0),%%xmm3 \n"
3146 "lea 0x4(%0),%0 \n"
3147 "movdqa %%xmm3,%%xmm0 \n"
// Invert the alpha byte (255 - a), replicate it into every word of the
// pixel, then add 1 to get the multiplier 256 - a.
3148 "pxor %%xmm4,%%xmm3 \n"
3149 "movd (%1),%%xmm2 \n"
3150 "psrlw $0x8,%%xmm3 \n"
3151 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3152 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3153 "pand %%xmm6,%%xmm2 \n"
3154 "paddw %%xmm7,%%xmm3 \n"
3155 "pmullw %%xmm3,%%xmm2 \n"
3156 "movd (%1),%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003157 "lea 0x4(%1),%1 \n"
fbarchard@google.comc757f302012-04-03 00:49:16 +00003158 "psrlw $0x8,%%xmm1 \n"
3159 "por %%xmm4,%%xmm0 \n"
3160 "pmullw %%xmm3,%%xmm1 \n"
3161 "psrlw $0x8,%%xmm2 \n"
3162 "paddusb %%xmm2,%%xmm0 \n"
3163 "pand %%xmm5,%%xmm1 \n"
3164 "paddusb %%xmm1,%%xmm0 \n"
3165 "sub $0x1,%3 \n"
3166 "movd %%xmm0,(%2) \n"
3167 "lea 0x4(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003168 "jge 10b \n"
3169
3170 "19: \n"
// Re-bias the counter from -1 to -4; skip the 4 pixel loop if fewer than 4
// pixels remain.
fbarchard@google.comee220882012-06-14 00:07:56 +00003171 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003172 "jl 49f \n"
3173
fbarchard@google.com794fe122012-06-15 01:05:01 +00003174 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003175 ".p2align 2 \n"
3176 "41: \n"
3177 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003178 "lea 0x10(%0),%0 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003179 "movdqa %%xmm3,%%xmm0 \n"
3180 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003181 "movdqu (%1),%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003182 "psrlw $0x8,%%xmm3 \n"
3183 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3184 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003185 "pand %%xmm6,%%xmm2 \n"
3186 "paddw %%xmm7,%%xmm3 \n"
3187 "pmullw %%xmm3,%%xmm2 \n"
3188 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003189 "lea 0x10(%1),%1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003190 "psrlw $0x8,%%xmm1 \n"
3191 "por %%xmm4,%%xmm0 \n"
3192 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003193 "psrlw $0x8,%%xmm2 \n"
3194 "paddusb %%xmm2,%%xmm0 \n"
3195 "pand %%xmm5,%%xmm1 \n"
3196 "paddusb %%xmm1,%%xmm0 \n"
3197 "sub $0x4,%3 \n"
3198 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003199 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003200 "jge 41b \n"
3201
3202 "49: \n"
// Re-bias the counter from -4 back to -1; exit if no pixels remain.
3203 "add $0x3,%3 \n"
3204 "jl 99f \n"
3205
fbarchard@google.com794fe122012-06-15 01:05:01 +00003206 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003207 "91: \n"
3208 "movd (%0),%%xmm3 \n"
3209 "lea 0x4(%0),%0 \n"
3210 "movdqa %%xmm3,%%xmm0 \n"
3211 "pxor %%xmm4,%%xmm3 \n"
3212 "movd (%1),%%xmm2 \n"
3213 "psrlw $0x8,%%xmm3 \n"
3214 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3215 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3216 "pand %%xmm6,%%xmm2 \n"
3217 "paddw %%xmm7,%%xmm3 \n"
3218 "pmullw %%xmm3,%%xmm2 \n"
3219 "movd (%1),%%xmm1 \n"
3220 "lea 0x4(%1),%1 \n"
3221 "psrlw $0x8,%%xmm1 \n"
3222 "por %%xmm4,%%xmm0 \n"
3223 "pmullw %%xmm3,%%xmm1 \n"
3224 "psrlw $0x8,%%xmm2 \n"
3225 "paddusb %%xmm2,%%xmm0 \n"
3226 "pand %%xmm5,%%xmm1 \n"
3227 "paddusb %%xmm1,%%xmm0 \n"
3228 "sub $0x1,%3 \n"
3229 "movd %%xmm0,(%2) \n"
3230 "lea 0x4(%2),%2 \n"
3231 "jge 91b \n"
3232 "99: \n"
3233 : "+r"(src_argb0), // %0
3234 "+r"(src_argb1), // %1
3235 "+r"(dst_argb), // %2
3236 "+r"(width) // %3
fbarchard@google.comc757f302012-04-03 00:49:16 +00003237 :
3238 : "memory", "cc"
3239#if defined(__SSE2__)
3240 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3241#endif
3242 );
3243}
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003244#endif // HAS_ARGBBLENDROW_SSE2
fbarchard@google.comc757f302012-04-03 00:49:16 +00003245
fbarchard@google.com96af8702012-04-06 18:22:27 +00003246#ifdef HAS_ARGBBLENDROW_SSSE3
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003247// Shuffle table for isolating alpha.
fbarchard@google.com96af8702012-04-06 18:22:27 +00003248CONST uvec8 kShuffleAlpha = {
3249 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3250 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3251};
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003252
3253// Blend 4 pixels at a time (main loops); single pixels until dst is aligned.
3254// Shuffle table for reversing the bytes.
3255
3256// Same as SSE2, but replaces
3257// psrlw xmm3, 8 // alpha
3258// pshufhw xmm3, xmm3,0F5h // 8 alpha words
3259// pshuflw xmm3, xmm3,0F5h
3260// with..
3261// pshufb xmm3, kShuffleAlpha // alpha
3262
// Blends a row of ARGB pixels. Per byte lane the result is
//   dst = saturate(src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8))
// and the output alpha byte is forced to 0xff. alpha0 is taken from the
// src_argb0 pixel and spread over that pixel's word lanes by pshufb with
// kShuffleAlpha (table is defined outside this view -- presumably an
// alpha-isolating shuffle; verify against its definition).
// Any width is accepted: single pixels are processed until dst_argb is
// 16 byte aligned, then 4 pixels per iteration (aligned loads only when both
// sources are 16 byte aligned), then single pixels for the remainder.
3263void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3264 uint8* dst_argb, int width) {
fbarchard@google.com96af8702012-04-06 18:22:27 +00003265 asm volatile (
    // Constant masks: xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
    // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha byte).
3266 "pcmpeqb %%xmm7,%%xmm7 \n"
3267 "psrlw $0xf,%%xmm7 \n"
3268 "pcmpeqb %%xmm6,%%xmm6 \n"
3269 "psrlw $0x8,%%xmm6 \n"
3270 "pcmpeqb %%xmm5,%%xmm5 \n"
3271 "psllw $0x8,%%xmm5 \n"
3272 "pcmpeqb %%xmm4,%%xmm4 \n"
3273 "pslld $0x18,%%xmm4 \n"
    // The counter is kept as (remaining - 1): zero means exactly one pixel
    // (go to the tail loop), negative means nothing to do.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003274 "sub $0x1,%3 \n"
3275 "je 91f \n"
3276 "jl 99f \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003277
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003278 // 1 pixel loop until destination pointer is aligned.
3279 "10: \n"
3280 "test $0xf,%2 \n"
3281 "je 19f \n"
3282 "movd (%0),%%xmm3 \n"
3283 "lea 0x4(%0),%0 \n"
3284 "movdqa %%xmm3,%%xmm0 \n"
3285 "pxor %%xmm4,%%xmm3 \n"
3286 "movd (%1),%%xmm2 \n"
3287 "pshufb %4,%%xmm3 \n"
3288 "pand %%xmm6,%%xmm2 \n"
3289 "paddw %%xmm7,%%xmm3 \n"
3290 "pmullw %%xmm3,%%xmm2 \n"
3291 "movd (%1),%%xmm1 \n"
3292 "lea 0x4(%1),%1 \n"
3293 "psrlw $0x8,%%xmm1 \n"
3294 "por %%xmm4,%%xmm0 \n"
3295 "pmullw %%xmm3,%%xmm1 \n"
3296 "psrlw $0x8,%%xmm2 \n"
3297 "paddusb %%xmm2,%%xmm0 \n"
3298 "pand %%xmm5,%%xmm1 \n"
3299 "paddusb %%xmm1,%%xmm0 \n"
3300 "sub $0x1,%3 \n"
3301 "movd %%xmm0,(%2) \n"
3302 "lea 0x4(%2),%2 \n"
3303 "jge 10b \n"
3304
3305 "19: \n"
    // Re-bias the counter from (remaining - 1) to (remaining - 4); a negative
    // value means fewer than 4 pixels are left, so skip the 4 pixel loops.
fbarchard@google.comee220882012-06-14 00:07:56 +00003306 "add $1-4,%3 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003307 "jl 49f \n"
    // Use the movdqa loop only when both source pointers are 16 byte aligned.
fbarchard@google.comf877e712012-08-15 00:51:24 +00003308 "test $0xf,%0 \n"
3309 "jne 41f \n"
3310 "test $0xf,%1 \n"
3311 "jne 41f \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003312
fbarchard@google.com794fe122012-06-15 01:05:01 +00003313 // 4 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003314 ".p2align 2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003315 "40: \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003316 "movdqa (%0),%%xmm3 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003317 "lea 0x10(%0),%0 \n"
3318 "movdqa %%xmm3,%%xmm0 \n"
3319 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003320 "movdqa (%1),%%xmm2 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003321 "pshufb %4,%%xmm3 \n"
3322 "pand %%xmm6,%%xmm2 \n"
3323 "paddw %%xmm7,%%xmm3 \n"
3324 "pmullw %%xmm3,%%xmm2 \n"
fbarchard@google.comf3181b32012-08-15 18:57:18 +00003325 "movdqa (%1),%%xmm1 \n"
fbarchard@google.comf877e712012-08-15 00:51:24 +00003326 "lea 0x10(%1),%1 \n"
3327 "psrlw $0x8,%%xmm1 \n"
3328 "por %%xmm4,%%xmm0 \n"
3329 "pmullw %%xmm3,%%xmm1 \n"
3330 "psrlw $0x8,%%xmm2 \n"
3331 "paddusb %%xmm2,%%xmm0 \n"
3332 "pand %%xmm5,%%xmm1 \n"
3333 "paddusb %%xmm1,%%xmm0 \n"
3334 "sub $0x4,%3 \n"
3335 "movdqa %%xmm0,(%2) \n"
3336 "lea 0x10(%2),%2 \n"
3337 "jge 40b \n"
3338 "jmp 49f \n"
3339
3340 // 4 pixel unaligned loop.
    // Only the loads are unaligned; the store stays movdqa because the
    // first loop has already aligned the destination pointer.
3341 ".p2align 2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003342 "41: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003343 "movdqu (%0),%%xmm3 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003344 "lea 0x10(%0),%0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003345 "movdqa %%xmm3,%%xmm0 \n"
3346 "pxor %%xmm4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003347 "movdqu (%1),%%xmm2 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003348 "pshufb %4,%%xmm3 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003349 "pand %%xmm6,%%xmm2 \n"
3350 "paddw %%xmm7,%%xmm3 \n"
3351 "pmullw %%xmm3,%%xmm2 \n"
3352 "movdqu (%1),%%xmm1 \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003353 "lea 0x10(%1),%1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003354 "psrlw $0x8,%%xmm1 \n"
3355 "por %%xmm4,%%xmm0 \n"
3356 "pmullw %%xmm3,%%xmm1 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003357 "psrlw $0x8,%%xmm2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003358 "paddusb %%xmm2,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003359 "pand %%xmm5,%%xmm1 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003360 "paddusb %%xmm1,%%xmm0 \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003361 "sub $0x4,%3 \n"
3362 "movdqa %%xmm0,(%2) \n"
fbarchard@google.com794fe122012-06-15 01:05:01 +00003363 "lea 0x10(%2),%2 \n"
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003364 "jge 41b \n"
3365
3366 "49: \n"
    // Back to the (remaining - 1) bias for the tail loop.
3367 "add $0x3,%3 \n"
3368 "jl 99f \n"
3369
fbarchard@google.com794fe122012-06-15 01:05:01 +00003370 // 1 pixel loop.
fbarchard@google.combac5f2c2012-06-13 23:43:12 +00003371 "91: \n"
3372 "movd (%0),%%xmm3 \n"
3373 "lea 0x4(%0),%0 \n"
3374 "movdqa %%xmm3,%%xmm0 \n"
3375 "pxor %%xmm4,%%xmm3 \n"
3376 "movd (%1),%%xmm2 \n"
3377 "pshufb %4,%%xmm3 \n"
3378 "pand %%xmm6,%%xmm2 \n"
3379 "paddw %%xmm7,%%xmm3 \n"
3380 "pmullw %%xmm3,%%xmm2 \n"
3381 "movd (%1),%%xmm1 \n"
3382 "lea 0x4(%1),%1 \n"
3383 "psrlw $0x8,%%xmm1 \n"
3384 "por %%xmm4,%%xmm0 \n"
3385 "pmullw %%xmm3,%%xmm1 \n"
3386 "psrlw $0x8,%%xmm2 \n"
3387 "paddusb %%xmm2,%%xmm0 \n"
3388 "pand %%xmm5,%%xmm1 \n"
3389 "paddusb %%xmm1,%%xmm0 \n"
3390 "sub $0x1,%3 \n"
3391 "movd %%xmm0,(%2) \n"
3392 "lea 0x4(%2),%2 \n"
3393 "jge 91b \n"
3394 "99: \n"
fbarchard@google.com96af8702012-04-06 18:22:27 +00003395 : "+r"(src_argb0), // %0
3396 "+r"(src_argb1), // %1
3397 "+r"(dst_argb), // %2
3398 "+r"(width) // %3
3399 : "m"(kShuffleAlpha) // %4
3400 : "memory", "cc"
3401#if defined(__SSE2__)
3402 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3403#endif
3404 );
3405}
3406#endif // HAS_ARGBBLENDROW_SSSE3
3407
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003408#ifdef HAS_ARGBATTENUATE_SSE2
3409// Attenuate 4 pixels at a time.
3410// aligned to 16 bytes
// Multiplies the three low bytes of every ARGB pixel by that pixel's alpha
// byte and keeps the alpha byte unchanged.
// Each byte v is widened to the word v * 257 (punpck with itself), the words
// are multiplied with pmulhuw (high 16 bits of the product) and shifted right
// by 8 to get back to a byte.
// All loads and stores are movdqa, so src_argb and dst_argb must be 16 byte
// aligned. The loop body runs before the counter is tested and consumes 4
// pixels per iteration, so a width that is not a multiple of 4 is rounded up.
3411void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3412 asm volatile (
    // dst_argb becomes an offset from src_argb so only %0 is advanced.
3413 "sub %0,%1 \n"
    // xmm4 = 0xff000000 per dword (alpha), xmm5 = 0x00ffffff per dword (color).
3414 "pcmpeqb %%xmm4,%%xmm4 \n"
3415 "pslld $0x18,%%xmm4 \n"
3416 "pcmpeqb %%xmm5,%%xmm5 \n"
3417 "psrld $0x8,%%xmm5 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003418
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003419 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003420 ".p2align 4 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003421 "1: \n"
3422 "movdqa (%0),%%xmm0 \n"
3423 "punpcklbw %%xmm0,%%xmm0 \n"
    // Shuffle immediate 0xff copies word 3 (the alpha word) of each 4 word
    // half into all four words of that half.
3424 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3425 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3426 "pmulhuw %%xmm2,%%xmm0 \n"
3427 "movdqa (%0),%%xmm1 \n"
3428 "punpckhbw %%xmm1,%%xmm1 \n"
3429 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3430 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3431 "pmulhuw %%xmm2,%%xmm1 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003432 "movdqa (%0),%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003433 "psrlw $0x8,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003434 "pand %%xmm4,%%xmm2 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003435 "psrlw $0x8,%%xmm1 \n"
3436 "packuswb %%xmm1,%%xmm0 \n"
    // Drop the computed alpha lane and OR the original alpha back in.
3437 "pand %%xmm5,%%xmm0 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003438 "por %%xmm2,%%xmm0 \n"
fbarchard@google.com8ed54222012-04-18 17:07:07 +00003439 "sub $0x4,%2 \n"
3440 "movdqa %%xmm0,(%0,%1,1) \n"
3441 "lea 0x10(%0),%0 \n"
3442 "jg 1b \n"
3443 : "+r"(src_argb), // %0
3444 "+r"(dst_argb), // %1
3445 "+r"(width) // %2
3446 :
3447 : "memory", "cc"
3448#if defined(__SSE2__)
3449 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3450#endif
3451 );
3452}
3453#endif // HAS_ARGBATTENUATE_SSE2
3454
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003455#ifdef HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003456// Shuffle table duplicating alpha
// kShuffleAlpha0 covers pixels 0 and 1 (alpha bytes 3 and 7), kShuffleAlpha1
// covers pixels 2 and 3 (alpha bytes 11 and 15). For each pixel the alpha
// byte fills the three color words; the 128 entries make pshufb write zero
// into the fourth (alpha) word.
3457CONST uvec8 kShuffleAlpha0 = {
3458 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3459};
3460CONST uvec8 kShuffleAlpha1 = {
3461 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3462 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3463};
3464// Attenuate 4 pixels at a time.
3465// aligned to 16 bytes
// Multiplies the three low bytes of every ARGB pixel by the pixel's alpha
// byte; the alpha byte itself is copied through unchanged.
// src_argb and dst_argb are accessed with movdqa (16 byte alignment needed).
// The loop body runs before the counter is tested and consumes 4 pixels per
// iteration, so a width that is not a multiple of 4 is rounded up.
3466void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3467 asm volatile (
    // dst_argb becomes an offset from src_argb so only %0 is advanced.
3468 "sub %0,%1 \n"
    // xmm3 = 0xff000000 per dword (alpha mask); xmm4/xmm5 = shuffle tables.
3469 "pcmpeqb %%xmm3,%%xmm3 \n"
3470 "pslld $0x18,%%xmm3 \n"
3471 "movdqa %3,%%xmm4 \n"
3472 "movdqa %4,%%xmm5 \n"
3473
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003474 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003475 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003476 "1: \n"
3477 "movdqa (%0),%%xmm0 \n"
3478 "pshufb %%xmm4,%%xmm0 \n"
3479 "movdqa (%0),%%xmm1 \n"
3480 "punpcklbw %%xmm1,%%xmm1 \n"
3481 "pmulhuw %%xmm1,%%xmm0 \n"
3482 "movdqa (%0),%%xmm1 \n"
3483 "pshufb %%xmm5,%%xmm1 \n"
3484 "movdqa (%0),%%xmm2 \n"
3485 "punpckhbw %%xmm2,%%xmm2 \n"
3486 "pmulhuw %%xmm2,%%xmm1 \n"
3487 "movdqa (%0),%%xmm2 \n"
3488 "pand %%xmm3,%%xmm2 \n"
3489 "psrlw $0x8,%%xmm0 \n"
3490 "psrlw $0x8,%%xmm1 \n"
3491 "packuswb %%xmm1,%%xmm0 \n"
    // The alpha lanes of xmm0 are zero here (zeroed by the shuffle tables),
    // so OR-ing in the masked source restores the original alpha.
3492 "por %%xmm2,%%xmm0 \n"
3493 "sub $0x4,%2 \n"
3494 "movdqa %%xmm0,(%0,%1,1) \n"
3495 "lea 0x10(%0),%0 \n"
3496 "jg 1b \n"
3497 : "+r"(src_argb), // %0
3498 "+r"(dst_argb), // %1
3499 "+r"(width) // %2
3500 : "m"(kShuffleAlpha0), // %3
3501 "m"(kShuffleAlpha1) // %4
3502 : "memory", "cc"
3503#if defined(__SSE2__)
3504 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3505#endif
3506 );
3507}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003508#endif // HAS_ARGBATTENUATEROW_SSSE3
fbarchard@google.com810cd912012-04-20 20:15:27 +00003509
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003510#ifdef HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003511// Unattenuate 4 pixels at a time.
3512// aligned to 16 bytes
// Scales the three low bytes of every ARGB pixel by a per-alpha factor read
// from fixed_invtbl8 and keeps the alpha byte unchanged.
// fixed_invtbl8 is defined outside this view; it is indexed by the alpha byte
// with 4 byte entries (presumably fixed point reciprocals of alpha -- verify
// against its definition).
// src_argb and dst_argb are accessed with movdqa (16 byte alignment needed).
// The loop body runs before the counter is tested and consumes 4 pixels per
// iteration, so a width that is not a multiple of 4 is rounded up.
3513void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3514 int width) {
    // Scratch register that receives each pixel's alpha byte (table index).
3515 uintptr_t alpha = 0;
3516 asm volatile (
    // dst_argb becomes an offset from src_argb so only %0 is advanced.
3517 "sub %0,%1 \n"
    // xmm4 = 0xff000000 per dword (alpha mask).
3518 "pcmpeqb %%xmm4,%%xmm4 \n"
3519 "pslld $0x18,%%xmm4 \n"
3520
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003521 // 4 pixel loop.
fbarchard@google.com5bf29b52012-05-02 00:10:16 +00003522 ".p2align 4 \n"
fbarchard@google.com810cd912012-04-20 20:15:27 +00003523 "1: \n"
3524 "movdqa (%0),%%xmm0 \n"
    // Byte offsets 3, 7, 0xb and 0xf are the alpha bytes of pixels 0..3.
3525 "movzb 0x3(%0),%3 \n"
3526 "punpcklbw %%xmm0,%%xmm0 \n"
3527 "movd 0x0(%4,%3,4),%%xmm2 \n"
3528 "movzb 0x7(%0),%3 \n"
3529 "movd 0x0(%4,%3,4),%%xmm3 \n"
    // Immediate 0xc0 puts the entry's low word in words 0..2 and leaves the
    // entry's high word in word 3 (the alpha lane).
3530 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3531 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3532 "movlhps %%xmm3,%%xmm2 \n"
3533 "pmulhuw %%xmm2,%%xmm0 \n"
3534 "movdqa (%0),%%xmm1 \n"
3535 "movzb 0xb(%0),%3 \n"
3536 "punpckhbw %%xmm1,%%xmm1 \n"
3537 "movd 0x0(%4,%3,4),%%xmm2 \n"
3538 "movzb 0xf(%0),%3 \n"
3539 "movd 0x0(%4,%3,4),%%xmm3 \n"
3540 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3541 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3542 "movlhps %%xmm3,%%xmm2 \n"
3543 "pmulhuw %%xmm2,%%xmm1 \n"
3544 "movdqa (%0),%%xmm2 \n"
3545 "pand %%xmm4,%%xmm2 \n"
    // packuswb saturates each word to 0..255 before the original alpha is
    // OR-ed back in.
3546 "packuswb %%xmm1,%%xmm0 \n"
3547 "por %%xmm2,%%xmm0 \n"
3548 "sub $0x4,%2 \n"
3549 "movdqa %%xmm0,(%0,%1,1) \n"
3550 "lea 0x10(%0),%0 \n"
3551 "jg 1b \n"
3552 : "+r"(src_argb), // %0
3553 "+r"(dst_argb), // %1
3554 "+r"(width), // %2
3555 "+r"(alpha) // %3
3556 : "r"(fixed_invtbl8) // %4
3557 : "memory", "cc"
3558#if defined(__SSE2__)
3559 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3560#endif
3561 );
3562}
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003563#endif // HAS_ARGBUNATTENUATEROW_SSE2
fbarchard@google.com810cd912012-04-20 20:15:27 +00003564
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003565#ifdef HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00003566// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
// The weights are scaled by 128 (14 + 76 + 38 = 128); the alpha weight is 0.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003567CONST vec8 kARGBToGray = {
3568 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3569};
3570
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003571// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Every output pixel has its three low bytes set to the weighted sum
// (B * 14 + G * 76 + R * 38) >> 7 and keeps the source alpha byte.
// src_argb and dst_argb are accessed with movdqa (16 byte alignment needed).
// The loop body runs before the counter is tested and consumes 8 pixels per
// iteration, so a width that is not a multiple of 8 is rounded up.
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003572void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003573 asm volatile (
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003574 "movdqa %3,%%xmm4 \n"
    // dst_argb becomes an offset from src_argb so only %0 is advanced.
3575 "sub %0,%1 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003576
3577 // 8 pixel loop.
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003578 ".p2align 4 \n"
3579 "1: \n"
3580 "movdqa (%0),%%xmm0 \n"
3581 "movdqa 0x10(%0),%%xmm1 \n"
    // pmaddubsw + phaddw produce one weighted 16 bit sum per pixel.
3582 "pmaddubsw %%xmm4,%%xmm0 \n"
3583 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003584 "phaddw %%xmm1,%%xmm0 \n"
3585 "psrlw $0x7,%%xmm0 \n"
3586 "packuswb %%xmm0,%%xmm0 \n"
    // Gather the 8 source alpha bytes into the low half of xmm2.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003587 "movdqa (%0),%%xmm2 \n"
3588 "movdqa 0x10(%0),%%xmm3 \n"
3589 "psrld $0x18,%%xmm2 \n"
3590 "psrld $0x18,%%xmm3 \n"
3591 "packuswb %%xmm3,%%xmm2 \n"
3592 "packuswb %%xmm2,%%xmm2 \n"
    // Interleave to gray,gray and gray,alpha byte pairs, then to 4 byte pixels.
3593 "movdqa %%xmm0,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003594 "punpcklbw %%xmm0,%%xmm0 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003595 "punpcklbw %%xmm2,%%xmm3 \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003596 "movdqa %%xmm0,%%xmm1 \n"
fbarchard@google.com221e6022012-05-21 22:24:41 +00003597 "punpcklwd %%xmm3,%%xmm0 \n"
3598 "punpckhwd %%xmm3,%%xmm1 \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003599 "sub $0x8,%2 \n"
3600 "movdqa %%xmm0,(%0,%1,1) \n"
3601 "movdqa %%xmm1,0x10(%0,%1,1) \n"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003602 "lea 0x20(%0),%0 \n"
3603 "jg 1b \n"
fbarchard@google.comeeac2902012-07-18 18:54:32 +00003604 : "+r"(src_argb), // %0
3605 "+r"(dst_argb), // %1
3606 "+r"(width) // %2
3607 : "m"(kARGBToGray) // %3
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003608 : "memory", "cc"
3609#if defined(__SSE2__)
fbarchard@google.com221e6022012-05-21 22:24:41 +00003610 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
fbarchard@google.comffaea7e2012-05-18 19:43:59 +00003611#endif
3612 );
3613}
3614#endif // HAS_ARGBGRAYROW_SSSE3
fbarchard@google.com221e6022012-05-21 22:24:41 +00003615
3616#ifdef HAS_ARGBSEPIAROW_SSSE3
3617// b = (r * 35 + g * 68 + b * 17) >> 7
3618// g = (r * 45 + g * 88 + b * 22) >> 7
3619// r = (r * 50 + g * 98 + b * 24) >> 7
3620// Constant for ARGB color to sepia tone
// Each table holds the weights in memory order B, G, R, A (alpha weight 0),
// repeated for four pixels.
3621CONST vec8 kARGBToSepiaB = {
3622 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3623};
3624
3625CONST vec8 kARGBToSepiaG = {
3626 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3627};
3628
3629CONST vec8 kARGBToSepiaR = {
3630 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3631};
3632
fbarchard@google.come442dc42012-06-18 17:37:09 +00003633// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place on dst_argb: the new B, G and R bytes are the weighted sums
// given above (saturated to 255 by packuswb) and alpha is copied through.
// dst_argb is accessed with movdqa (16 byte alignment needed). The loop body
// runs before the counter is tested and consumes 8 pixels per iteration, so a
// width that is not a multiple of 8 is rounded up.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003634void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3635 asm volatile (
    // xmm2 = B weights, xmm3 = G weights, xmm4 = R weights.
3636 "movdqa %2,%%xmm2 \n"
3637 "movdqa %3,%%xmm3 \n"
3638 "movdqa %4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003639
3640 // 8 pixel loop.
fbarchard@google.com221e6022012-05-21 22:24:41 +00003641 ".p2align 4 \n"
3642 "1: \n"
    // New B for 8 pixels -> low 8 bytes of xmm0.
3643 "movdqa (%0),%%xmm0 \n"
3644 "movdqa 0x10(%0),%%xmm6 \n"
3645 "pmaddubsw %%xmm2,%%xmm0 \n"
3646 "pmaddubsw %%xmm2,%%xmm6 \n"
3647 "phaddw %%xmm6,%%xmm0 \n"
3648 "psrlw $0x7,%%xmm0 \n"
3649 "packuswb %%xmm0,%%xmm0 \n"
    // New G for 8 pixels, interleaved with B into B,G byte pairs.
3650 "movdqa (%0),%%xmm5 \n"
3651 "movdqa 0x10(%0),%%xmm1 \n"
3652 "pmaddubsw %%xmm3,%%xmm5 \n"
3653 "pmaddubsw %%xmm3,%%xmm1 \n"
3654 "phaddw %%xmm1,%%xmm5 \n"
3655 "psrlw $0x7,%%xmm5 \n"
3656 "packuswb %%xmm5,%%xmm5 \n"
3657 "punpcklbw %%xmm5,%%xmm0 \n"
    // New R for 8 pixels.
3658 "movdqa (%0),%%xmm5 \n"
3659 "movdqa 0x10(%0),%%xmm1 \n"
3660 "pmaddubsw %%xmm4,%%xmm5 \n"
3661 "pmaddubsw %%xmm4,%%xmm1 \n"
3662 "phaddw %%xmm1,%%xmm5 \n"
3663 "psrlw $0x7,%%xmm5 \n"
3664 "packuswb %%xmm5,%%xmm5 \n"
    // Original alpha for 8 pixels, interleaved with R into R,A byte pairs.
3665 "movdqa (%0),%%xmm6 \n"
3666 "movdqa 0x10(%0),%%xmm1 \n"
3667 "psrld $0x18,%%xmm6 \n"
3668 "psrld $0x18,%%xmm1 \n"
3669 "packuswb %%xmm1,%%xmm6 \n"
3670 "packuswb %%xmm6,%%xmm6 \n"
3671 "punpcklbw %%xmm6,%%xmm5 \n"
    // Merge B,G pairs with R,A pairs into 4 byte pixels and store.
3672 "movdqa %%xmm0,%%xmm1 \n"
3673 "punpcklwd %%xmm5,%%xmm0 \n"
3674 "punpckhwd %%xmm5,%%xmm1 \n"
3675 "sub $0x8,%1 \n"
3676 "movdqa %%xmm0,(%0) \n"
3677 "movdqa %%xmm1,0x10(%0) \n"
3678 "lea 0x20(%0),%0 \n"
3679 "jg 1b \n"
3680 : "+r"(dst_argb), // %0
3681 "+r"(width) // %1
3682 : "m"(kARGBToSepiaB), // %2
3683 "m"(kARGBToSepiaG), // %3
3684 "m"(kARGBToSepiaR) // %4
3685 : "memory", "cc"
3686#if defined(__SSE2__)
3687 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3688#endif
3689 );
3690}
3691#endif // HAS_ARGBSEPIAROW_SSSE3
3692
fbarchard@google.come442dc42012-06-18 17:37:09 +00003693#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3694// Transform 8 ARGB pixels (32 bytes) with color matrix.
3695// Same as Sepia except matrix is provided.
// matrix_argb supplies three rows of four signed 8 bit weights at byte
// offsets 0, 4 and 8; they produce output bytes 0, 1 and 2 of each pixel.
// Output byte 3 (alpha) is copied from the source pixel.
// Because the weights are signed, the sums use phaddsw (saturating add) and
// psraw (arithmetic shift); packuswb then clamps the result to 0..255.
// Works in place on dst_argb with movdqa (16 byte alignment needed). The loop
// body runs before the counter is tested and consumes 8 pixels per iteration,
// so a width that is not a multiple of 8 is rounded up.
3696void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3697 int width) {
3698 asm volatile (
    // Load each 4 byte matrix row and broadcast it to all four dwords.
3699 "movd (%2),%%xmm2 \n"
3700 "movd 0x4(%2),%%xmm3 \n"
3701 "movd 0x8(%2),%%xmm4 \n"
3702 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3703 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3704 "pshufd $0x0,%%xmm4,%%xmm4 \n"
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003705
3706 // 8 pixel loop.
fbarchard@google.come442dc42012-06-18 17:37:09 +00003707 ".p2align 4 \n"
3708 "1: \n"
3709 "movdqa (%0),%%xmm0 \n"
3710 "movdqa 0x10(%0),%%xmm6 \n"
3711 "pmaddubsw %%xmm2,%%xmm0 \n"
3712 "pmaddubsw %%xmm2,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003713 "movdqa (%0),%%xmm5 \n"
3714 "movdqa 0x10(%0),%%xmm1 \n"
3715 "pmaddubsw %%xmm3,%%xmm5 \n"
3716 "pmaddubsw %%xmm3,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003717 "phaddsw %%xmm6,%%xmm0 \n"
3718 "phaddsw %%xmm1,%%xmm5 \n"
3719 "psraw $0x7,%%xmm0 \n"
3720 "psraw $0x7,%%xmm5 \n"
3721 "packuswb %%xmm0,%%xmm0 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003722 "packuswb %%xmm5,%%xmm5 \n"
    // xmm0 = byte pairs (row 0 result, row 1 result) for 8 pixels.
3723 "punpcklbw %%xmm5,%%xmm0 \n"
3724 "movdqa (%0),%%xmm5 \n"
3725 "movdqa 0x10(%0),%%xmm1 \n"
3726 "pmaddubsw %%xmm4,%%xmm5 \n"
3727 "pmaddubsw %%xmm4,%%xmm1 \n"
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003728 "phaddsw %%xmm1,%%xmm5 \n"
3729 "psraw $0x7,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003730 "packuswb %%xmm5,%%xmm5 \n"
    // Gather the 8 original alpha bytes into the low half of xmm6.
3731 "movdqa (%0),%%xmm6 \n"
3732 "movdqa 0x10(%0),%%xmm1 \n"
3733 "psrld $0x18,%%xmm6 \n"
3734 "psrld $0x18,%%xmm1 \n"
3735 "packuswb %%xmm1,%%xmm6 \n"
3736 "packuswb %%xmm6,%%xmm6 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003737 "movdqa %%xmm0,%%xmm1 \n"
    // xmm5 = byte pairs (row 2 result, alpha); merge into 4 byte pixels.
fbarchard@google.com8f439ea2012-06-21 02:00:34 +00003738 "punpcklbw %%xmm6,%%xmm5 \n"
fbarchard@google.come442dc42012-06-18 17:37:09 +00003739 "punpcklwd %%xmm5,%%xmm0 \n"
3740 "punpckhwd %%xmm5,%%xmm1 \n"
3741 "sub $0x8,%1 \n"
3742 "movdqa %%xmm0,(%0) \n"
3743 "movdqa %%xmm1,0x10(%0) \n"
3744 "lea 0x20(%0),%0 \n"
3745 "jg 1b \n"
3746 : "+r"(dst_argb), // %0
3747 "+r"(width) // %1
3748 : "r"(matrix_argb) // %2
3749 : "memory", "cc"
3750#if defined(__SSE2__)
3751 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3752#endif
3753 );
3754}
3755#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3756
fbarchard@google.com81b804e2012-06-20 02:15:01 +00003757#ifdef HAS_ARGBQUANTIZEROW_SSE2
3758// Quantize 4 ARGB pixels (16 bytes).
3759// aligned to 16 bytes
// Quantizes the three low bytes of every ARGB pixel in place:
//   v = ((v * scale) >> 16) * interval_size + interval_offset
// with the result saturated to 0..255 by packuswb. The original alpha byte
// is OR-ed back into the result.
// The pshuflw $0x40 / pshufd $0x44 pairs place the low 16 bits of each
// argument in the three color lanes and bits 16..31 of the argument in the
// alpha lane; the alpha lane therefore computes to zero only when all three
// arguments are in the range 0..65535 (presumably what callers pass --
// verify).
// dst_argb is accessed with movdqa (16 byte alignment needed). The loop body
// runs before the counter is tested and consumes 4 pixels per iteration, so a
// width that is not a multiple of 4 is rounded up.
3760void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3761 int interval_offset, int width) {
3762 asm volatile (
3763 "movd %2,%%xmm2 \n"
3764 "movd %3,%%xmm3 \n"
3765 "movd %4,%%xmm4 \n"
3766 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3767 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3768 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3769 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3770 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3771 "pshufd $0x44,%%xmm4,%%xmm4 \n"
    // xmm5 = 0 (for byte -> word unpacking), xmm6 = 0xff000000 alpha mask.
3772 "pxor %%xmm5,%%xmm5 \n"
3773 "pcmpeqb %%xmm6,%%xmm6 \n"
3774 "pslld $0x18,%%xmm6 \n"
3775
3776 // 4 pixel loop.
3777 ".p2align 2 \n"
3778 "1: \n"
3779 "movdqa (%0),%%xmm0 \n"
3780 "punpcklbw %%xmm5,%%xmm0 \n"
3781 "pmulhuw %%xmm2,%%xmm0 \n"
3782 "movdqa (%0),%%xmm1 \n"
3783 "punpckhbw %%xmm5,%%xmm1 \n"
3784 "pmulhuw %%xmm2,%%xmm1 \n"
3785 "pmullw %%xmm3,%%xmm0 \n"
3786 "movdqa (%0),%%xmm7 \n"
3787 "pmullw %%xmm3,%%xmm1 \n"
    // xmm7 keeps only the original alpha bytes.
3788 "pand %%xmm6,%%xmm7 \n"
3789 "paddw %%xmm4,%%xmm0 \n"
3790 "paddw %%xmm4,%%xmm1 \n"
3791 "packuswb %%xmm1,%%xmm0 \n"
3792 "por %%xmm7,%%xmm0 \n"
3793 "sub $0x4,%1 \n"
3794 "movdqa %%xmm0,(%0) \n"
3795 "lea 0x10(%0),%0 \n"
3796 "jg 1b \n"
3797 : "+r"(dst_argb), // %0
3798 "+r"(width) // %1
3799 : "r"(scale), // %2
3800 "r"(interval_size), // %3
3801 "r"(interval_offset) // %4
3802 : "memory", "cc"
3803#if defined(__SSE2__)
3804 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3805#endif
3806 );
3807}
3808#endif // HAS_ARGBQUANTIZEROW_SSE2
3809
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003810#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
3811// Creates a table of cumulative sums where each value is a sum of all values
3812// above and to the left of the value, inclusive of the value.
// For every 4 byte pixel of row, widens the four bytes to int32, adds them to
// a running per-channel sum for this row, and writes
//   cumsum[i] = running_sum + previous_cumsum[i]
// as four int32 values (16 bytes per pixel).
// width is a pixel count and may be any value; 4 pixels per iteration are
// processed when cumsum is 16 byte aligned, otherwise (and for the tail)
// one pixel per iteration with unaligned accesses.
// NOTE(review): only cumsum alignment is tested before entering the 4 pixel
// loop, but that loop also reads previous_cumsum with movdqa -- this assumes
// previous_cumsum shares cumsum's 16 byte alignment; confirm with callers.
3813void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
fbarchard@google.com133adc42012-06-12 05:46:49 +00003814 const int32* previous_cumsum, int width) {
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003815 asm volatile (
    // previous_cumsum becomes an offset from cumsum so only %1 is advanced.
3816 "sub %1,%2 \n"
    // xmm0 = running sum (4 x int32), xmm1 = zero for unpacking.
3817 "pxor %%xmm0,%%xmm0 \n"
3818 "pxor %%xmm1,%%xmm1 \n"
3819 "sub $0x4,%3 \n"
3820 "jl 49f \n"
3821 "test $0xf,%1 \n"
3822 "jne 49f \n"
3823
3824 // 4 pixel loop \n"
3825 ".p2align 2 \n"
3826 "40: \n"
3827 "movdqu (%0),%%xmm2 \n"
3828 "lea 0x10(%0),%0 \n"
    // Widen 16 bytes into four vectors of 4 x int32: xmm2, xmm3, xmm4, xmm5
    // hold pixels 0..3 respectively.
3829 "movdqa %%xmm2,%%xmm4 \n"
3830 "punpcklbw %%xmm1,%%xmm2 \n"
3831 "movdqa %%xmm2,%%xmm3 \n"
3832 "punpcklwd %%xmm1,%%xmm2 \n"
3833 "punpckhwd %%xmm1,%%xmm3 \n"
3834 "punpckhbw %%xmm1,%%xmm4 \n"
3835 "movdqa %%xmm4,%%xmm5 \n"
3836 "punpcklwd %%xmm1,%%xmm4 \n"
3837 "punpckhwd %%xmm1,%%xmm5 \n"
    // Accumulate each pixel into the running sum and add the previous row.
3838 "paddd %%xmm2,%%xmm0 \n"
3839 "movdqa (%1,%2,1),%%xmm2 \n"
3840 "paddd %%xmm0,%%xmm2 \n"
3841 "paddd %%xmm3,%%xmm0 \n"
3842 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
3843 "paddd %%xmm0,%%xmm3 \n"
3844 "paddd %%xmm4,%%xmm0 \n"
3845 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
3846 "paddd %%xmm0,%%xmm4 \n"
3847 "paddd %%xmm5,%%xmm0 \n"
3848 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
3849 "paddd %%xmm0,%%xmm5 \n"
3850 "movdqa %%xmm2,(%1) \n"
3851 "movdqa %%xmm3,0x10(%1) \n"
3852 "movdqa %%xmm4,0x20(%1) \n"
3853 "movdqa %%xmm5,0x30(%1) \n"
3854 "lea 0x40(%1),%1 \n"
3855 "sub $0x4,%3 \n"
3856 "jge 40b \n"
3857
3858 "49: \n"
    // Counter goes from (remaining - 4) to (remaining - 1) for the tail loop.
3859 "add $0x3,%3 \n"
3860 "jl 19f \n"
3861
3862 // 1 pixel loop \n"
3863 ".p2align 2 \n"
3864 "10: \n"
3865 "movd (%0),%%xmm2 \n"
3866 "lea 0x4(%0),%0 \n"
fbarchard@google.comf38aefe2012-06-12 00:29:29 +00003867 "punpcklbw %%xmm1,%%xmm2 \n"
3868 "punpcklwd %%xmm1,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003869 "paddd %%xmm2,%%xmm0 \n"
3870 "movdqu (%1,%2,1),%%xmm2 \n"
3871 "paddd %%xmm0,%%xmm2 \n"
3872 "movdqu %%xmm2,(%1) \n"
3873 "lea 0x10(%1),%1 \n"
3874 "sub $0x1,%3 \n"
3875 "jge 10b \n"
3876
3877 "19: \n"
3878 : "+r"(row), // %0
3879 "+r"(cumsum), // %1
3880 "+r"(previous_cumsum), // %2
3881 "+r"(width) // %3
3882 :
3883 : "memory", "cc"
3884#if defined(__SSE2__)
3885 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3886#endif
3887 );
3888}
3889#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
3890
3891#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts cumulative sums to averaged 4 byte pixels. For each of count
// output pixels, with every entry being 4 x int32:
//   sum = topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
//   dst[i] = saturate_to_bytes(round(sum * (1 / area)))
// width is an offset measured in int32 elements (it is scaled by 4 in the
// addressing). The reciprocal of area comes from rcpss, which is an
// approximation rather than an exact division.
// topleft and botleft (and the same pointers offset by width) are used as
// movdqa / SSE memory operands and so need 16 byte alignment; dst is written
// with unaligned stores.
3892void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3893 int width, int area, uint8* dst, int count) {
3894 asm volatile (
    // xmm4 = approximate 1.0f / area, broadcast to all four lanes.
3895 "movd %5,%%xmm4 \n"
3896 "cvtdq2ps %%xmm4,%%xmm4 \n"
3897 "rcpss %%xmm4,%%xmm4 \n"
3898 "pshufd $0x0,%%xmm4,%%xmm4 \n"
3899 "sub $0x4,%3 \n"
3900 "jl 49f \n"
3901
3902 // 4 pixel loop \n"
3903 ".p2align 2 \n"
3904 "40: \n"
3905 "movdqa (%0),%%xmm0 \n"
3906 "movdqa 0x10(%0),%%xmm1 \n"
3907 "movdqa 0x20(%0),%%xmm2 \n"
3908 "movdqa 0x30(%0),%%xmm3 \n"
3909 "psubd (%0,%4,4),%%xmm0 \n"
3910 "psubd 0x10(%0,%4,4),%%xmm1 \n"
3911 "psubd 0x20(%0,%4,4),%%xmm2 \n"
3912 "psubd 0x30(%0,%4,4),%%xmm3 \n"
3913 "lea 0x40(%0),%0 \n"
3914 "psubd (%1),%%xmm0 \n"
3915 "psubd 0x10(%1),%%xmm1 \n"
3916 "psubd 0x20(%1),%%xmm2 \n"
3917 "psubd 0x30(%1),%%xmm3 \n"
3918 "paddd (%1,%4,4),%%xmm0 \n"
3919 "paddd 0x10(%1,%4,4),%%xmm1 \n"
3920 "paddd 0x20(%1,%4,4),%%xmm2 \n"
3921 "paddd 0x30(%1,%4,4),%%xmm3 \n"
3922 "lea 0x40(%1),%1 \n"
    // int32 -> float, scale by 1/area, float -> int32.
3923 "cvtdq2ps %%xmm0,%%xmm0 \n"
3924 "cvtdq2ps %%xmm1,%%xmm1 \n"
3925 "mulps %%xmm4,%%xmm0 \n"
3926 "mulps %%xmm4,%%xmm1 \n"
3927 "cvtdq2ps %%xmm2,%%xmm2 \n"
3928 "cvtdq2ps %%xmm3,%%xmm3 \n"
3929 "mulps %%xmm4,%%xmm2 \n"
3930 "mulps %%xmm4,%%xmm3 \n"
3931 "cvtps2dq %%xmm0,%%xmm0 \n"
3932 "cvtps2dq %%xmm1,%%xmm1 \n"
3933 "cvtps2dq %%xmm2,%%xmm2 \n"
3934 "cvtps2dq %%xmm3,%%xmm3 \n"
    // Narrow 16 x int32 to 16 bytes with saturation.
3935 "packssdw %%xmm1,%%xmm0 \n"
3936 "packssdw %%xmm3,%%xmm2 \n"
3937 "packuswb %%xmm2,%%xmm0 \n"
3938 "movdqu %%xmm0,(%2) \n"
3939 "lea 0x10(%2),%2 \n"
3940 "sub $0x4,%3 \n"
3941 "jge 40b \n"
3942
3943 "49: \n"
    // Counter goes from (remaining - 4) to (remaining - 1) for the tail loop.
3944 "add $0x3,%3 \n"
3945 "jl 19f \n"
3946
3947 // 1 pixel loop \n"
3948 ".p2align 2 \n"
3949 "10: \n"
3950 "movdqa (%0),%%xmm0 \n"
3951 "psubd (%0,%4,4),%%xmm0 \n"
3952 "lea 0x10(%0),%0 \n"
3953 "psubd (%1),%%xmm0 \n"
3954 "paddd (%1,%4,4),%%xmm0 \n"
3955 "lea 0x10(%1),%1 \n"
3956 "cvtdq2ps %%xmm0,%%xmm0 \n"
3957 "mulps %%xmm4,%%xmm0 \n"
3958 "cvtps2dq %%xmm0,%%xmm0 \n"
3959 "packssdw %%xmm0,%%xmm0 \n"
3960 "packuswb %%xmm0,%%xmm0 \n"
3961 "movd %%xmm0,(%2) \n"
3962 "lea 0x4(%2),%2 \n"
3963 "sub $0x1,%3 \n"
3964 "jge 10b \n"
3965 "19: \n"
3966 : "+r"(topleft), // %0
3967 "+r"(botleft), // %1
3968 "+r"(dst), // %2
3969 "+rm"(count) // %3
3970 : "r"(static_cast<intptr_t>(width)), // %4
3971 "rm"(area) // %5
3972 : "memory", "cc"
3973#if defined(__SSE2__)
3974 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3975#endif
3976 );
3977}
3978#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
fbarchard@google.com6398e1d2012-07-11 19:12:32 +00003979#ifdef HAS_ARGBSHADE_SSE2
3980// Shade 4 pixels at a time by specified value.
3981// Aligned to 16 bytes.
// Scales every byte of every 4 byte pixel by the matching byte of value
// (byte 0 of value scales byte 0 of each pixel, and so on, alpha included).
// Both operands are widened to byte * 257 words, multiplied with pmulhuw
// (high 16 bits of the product) and shifted right by 8.
// src_argb and dst_argb are accessed with movdqa (16 byte alignment needed).
// The loop body runs before the counter is tested and consumes 4 pixels per
// iteration, so a width that is not a multiple of 4 is rounded up.
3982void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3983 uint32 value) {
3984 asm volatile (
3985 "movd %3,%%xmm2 \n"
    // dst_argb becomes an offset from src_argb so only %0 is advanced.
3986 "sub %0,%1 \n"
    // Duplicate each byte of value into a word, then copy the low 8 bytes
    // into the high 8 bytes so both pixel pairs see the same factors.
3987 "punpcklbw %%xmm2,%%xmm2 \n"
3988 "punpcklqdq %%xmm2,%%xmm2 \n"
fbarchard@google.comf51e8792012-06-10 02:40:04 +00003989
fbarchard@google.com6398e1d2012-07-11 19:12:32 +00003990 // 4 pixel loop.
3991 ".p2align 2 \n"
3992 "1: \n"
3993 "movdqa (%0),%%xmm0 \n"
3994 "movdqa %%xmm0,%%xmm1 \n"
3995 "punpcklbw %%xmm0,%%xmm0 \n"
3996 "punpckhbw %%xmm1,%%xmm1 \n"
3997 "pmulhuw %%xmm2,%%xmm0 \n"
3998 "pmulhuw %%xmm2,%%xmm1 \n"
3999 "psrlw $0x8,%%xmm0 \n"
4000 "psrlw $0x8,%%xmm1 \n"
4001 "packuswb %%xmm1,%%xmm0 \n"
4002 "sub $0x4,%2 \n"
4003 "movdqa %%xmm0,(%0,%1,1) \n"
4004 "lea 0x10(%0),%0 \n"
4005 "jg 1b \n"
4006 : "+r"(src_argb), // %0
4007 "+r"(dst_argb), // %1
4008 "+r"(width) // %2
4009 : "r"(value) // %3
4010 : "memory", "cc"
4011#if defined(__SSE2__)
4012 , "xmm0", "xmm1", "xmm2"
4013#endif
4014 );
4015}
4016#endif // HAS_ARGBSHADE_SSE2
fbarchard@google.comf51e8792012-06-10 02:40:04 +00004017
fbarchard@google.com73444402012-08-09 17:33:29 +00004018#ifdef HAS_ARGBAFFINEROW_SSE2
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004019// TODO(fbarchard): Find 64 bit way to avoid masking.
4020// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
fbarchard@google.com73444402012-08-09 17:33:29 +00004021// Copy ARGB pixels from source image with slope to a row of destination.
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004022// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +00004023// an error if movq is used. movd %%xmm0,%1
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004024
// Gathers width 4 byte pixels from src_argb along a line of float
// coordinates and writes them consecutively to dst_argb.
// uv_dudv points at four floats: the first pair is the starting coordinate
// and the second pair is the per-pixel step (presumably u, v, du, dv as the
// name suggests). For each pixel the coordinate pair is truncated to
// integers (x, y) and the source byte offset is x * 4 + y * src_argb_stride.
// The offset is computed with packssdw + pmaddwd, i.e. on signed 16 bit
// values, so x, y and src_argb_stride are expected to fit in 16 bits. On
// x86_64 each offset is additionally masked to its low 28 bits.
// Any width is accepted: 4 pixels per iteration, then 1 pixel per iteration.
fbarchard@google.comfc7314e2012-09-27 02:17:51 +00004025LIBYUV_API
fbarchard@google.com73444402012-08-09 17:33:29 +00004026void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4027 uint8* dst_argb, const float* uv_dudv, int width) {
    // Register sized copy of the stride; also reused as a scratch offset.
4028 intptr_t src_argb_stride_temp = src_argb_stride;
    // Second scratch offset register.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004029 intptr_t temp = 0;
fbarchard@google.com73444402012-08-09 17:33:29 +00004030 asm volatile (
    // xmm2 = starting coordinate pair, xmm7 = step pair.
4031 "movq (%3),%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004032 "movq 0x8(%3),%%xmm7 \n"
    // xmm5 low dword = (stride << 16) + 4: the word pair (4, stride) that
    // pmaddwd multiplies with the word pair (x, y).
fbarchard@google.com73444402012-08-09 17:33:29 +00004033 "shl $0x10,%1 \n"
4034 "add $0x4,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004035 "movd %1,%%xmm5 \n"
4036 "sub $0x4,%4 \n"
4037 "jl 49f \n"
4038
    // Set up for 4 pixels per iteration: xmm2 = coordinates of pixels 0 and 1,
    // xmm3 = coordinates of pixels 2 and 3, xmm4 = 4 * step (both pairs).
4039 "pshufd $0x44,%%xmm7,%%xmm7 \n"
4040 "pshufd $0x0,%%xmm5,%%xmm5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004041 "movdqa %%xmm2,%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004042 "addps %%xmm7,%%xmm0 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004043 "movlhps %%xmm0,%%xmm2 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004044 "movdqa %%xmm7,%%xmm4 \n"
4045 "addps %%xmm4,%%xmm4 \n"
4046 "movdqa %%xmm2,%%xmm3 \n"
4047 "addps %%xmm4,%%xmm3 \n"
4048 "addps %%xmm4,%%xmm4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004049
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004050 // 4 pixel loop \n"
4051 ".p2align 4 \n"
4052 "40: \n"
    // Truncate to integers, narrow to words, and form four byte offsets.
4053 "cvttps2dq %%xmm2,%%xmm0 \n"
4054 "cvttps2dq %%xmm3,%%xmm1 \n"
4055 "packssdw %%xmm1,%%xmm0 \n"
4056 "pmaddwd %%xmm5,%%xmm0 \n"
    // x86_64: one 64 bit move fetches two offsets; the low one is masked and
    // the high one is shifted down. 32 bit: offsets are fetched one at a time
    // by rotating the vector.
4057#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004058 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004059 "mov %1,%5 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004060 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004061 "shr $32,%5 \n"
4062 "pshufd $0xEE,%%xmm0,%%xmm0 \n"
4063#else
4064 "movd %%xmm0,%1 \n"
4065 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4066 "movd %%xmm0,%5 \n"
4067 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4068#endif
4069 "movd (%0,%1,1),%%xmm1 \n"
4070 "movd (%0,%5,1),%%xmm6 \n"
4071 "punpckldq %%xmm6,%%xmm1 \n"
4072 "addps %%xmm4,%%xmm2 \n"
4073 "movq %%xmm1,(%2) \n"
4074#if defined(__x86_64__)
fbarchard@google.com41f24bf2012-08-17 16:51:25 +00004075 "movd %%xmm0,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004076 "mov %1,%5 \n"
4077 "and $0x0fffffff,%1 \n"
4078 "shr $32,%5 \n"
4079#else
4080 "movd %%xmm0,%1 \n"
4081 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4082 "movd %%xmm0,%5 \n"
4083#endif
fbarchard@google.com73444402012-08-09 17:33:29 +00004084 "movd (%0,%1,1),%%xmm0 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004085 "movd (%0,%5,1),%%xmm6 \n"
4086 "punpckldq %%xmm6,%%xmm0 \n"
4087 "addps %%xmm4,%%xmm3 \n"
4088 "sub $0x4,%4 \n"
4089 "movq %%xmm0,0x08(%2) \n"
4090 "lea 0x10(%2),%2 \n"
4091 "jge 40b \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004092
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004093 "49: \n"
    // Counter goes from (remaining - 4) to (remaining - 1) for the tail loop.
4094 "add $0x3,%4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004095 "jl 19f \n"
4096
4097 // 1 pixel loop \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004098 ".p2align 4 \n"
fbarchard@google.com73444402012-08-09 17:33:29 +00004099 "10: \n"
    // Only the low coordinate pair of xmm2 and the low dword of xmm5 matter
    // here, so this loop also works when the 4 pixel setup was skipped.
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004100 "cvttps2dq %%xmm2,%%xmm0 \n"
4101 "packssdw %%xmm0,%%xmm0 \n"
4102 "pmaddwd %%xmm5,%%xmm0 \n"
4103 "addps %%xmm7,%%xmm2 \n"
4104 "movd %%xmm0,%1 \n"
4105#if defined(__x86_64__)
fbarchard@google.com73444402012-08-09 17:33:29 +00004106 "and $0x0fffffff,%1 \n"
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004107#endif
fbarchard@google.com73444402012-08-09 17:33:29 +00004108 "movd (%0,%1,1),%%xmm0 \n"
4109 "sub $0x1,%4 \n"
4110 "movd %%xmm0,(%2) \n"
4111 "lea 0x4(%2),%2 \n"
4112 "jge 10b \n"
4113 "19: \n"
4114 : "+r"(src_argb), // %0
4115 "+r"(src_argb_stride_temp), // %1
4116 "+r"(dst_argb), // %2
4117 "+r"(uv_dudv), // %3
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004118 "+rm"(width), // %4
4119 "+r"(temp) // %5
fbarchard@google.com73444402012-08-09 17:33:29 +00004120 :
4121 : "memory", "cc"
4122#if defined(__SSE2__)
fbarchard@google.come3cc7692012-08-10 20:41:27 +00004123 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
fbarchard@google.com73444402012-08-09 17:33:29 +00004124#endif
4125 );
4126}
4127#endif // HAS_ARGBAFFINEROW_SSE2
4128
fbarchard@google.com9bcc9a22012-09-16 05:00:37 +00004129// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
// Blends the row at src_ptr with the row at src_ptr + src_stride into
// dst_ptr. source_y_fraction is halved first (f = source_y_fraction >> 1):
//   f == 0    : the first row is copied unchanged.
//   f == 0x40 : the two rows are averaged with pavgb.
//   otherwise : dst = (row0 * (128 - f) + row1 * f) >> 7 per byte.
// All row accesses use movdqa (16 byte alignment needed for src_ptr, the
// second row and dst_ptr). Each loop body runs before the counter is tested
// and consumes 4 pixels (16 bytes) per iteration, so a dst_width that is not
// a multiple of 4 is rounded up.
4130void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4131 ptrdiff_t src_stride, int dst_width,
4132 int source_y_fraction) {
4133 asm volatile (
    // dst_ptr becomes an offset from src_ptr so only %1 is advanced.
4134 "sub %1,%0 \n"
4135 "shr %3 \n"
4136 "cmp $0x0,%3 \n"
4137 "je 2f \n"
4138 "cmp $0x40,%3 \n"
4139 "je 3f \n"
    // Build xmm5 = repeated byte pair (128 - f, f) for pmaddubsw.
4140 "movd %3,%%xmm0 \n"
4141 "neg %3 \n"
4142 "add $0x80,%3 \n"
4143 "movd %3,%%xmm5 \n"
4144 "punpcklbw %%xmm0,%%xmm5 \n"
4145 "punpcklwd %%xmm5,%%xmm5 \n"
4146 "pshufd $0x0,%%xmm5,%%xmm5 \n"
    // General blend loop.
4147 ".p2align 4 \n"
4148 "1: \n"
4149 "movdqa (%1),%%xmm0 \n"
4150 "movdqa (%1,%4,1),%%xmm2 \n"
4151 "movdqa %%xmm0,%%xmm1 \n"
    // Interleave bytes of the two rows so each word holds (row0, row1).
4152 "punpcklbw %%xmm2,%%xmm0 \n"
4153 "punpckhbw %%xmm2,%%xmm1 \n"
4154 "pmaddubsw %%xmm5,%%xmm0 \n"
4155 "pmaddubsw %%xmm5,%%xmm1 \n"
4156 "psrlw $0x7,%%xmm0 \n"
4157 "psrlw $0x7,%%xmm1 \n"
4158 "packuswb %%xmm1,%%xmm0 \n"
4159 "sub $0x4,%2 \n"
4160 "movdqa %%xmm0,(%1,%0,1) \n"
4161 "lea 0x10(%1),%1 \n"
4162 "jg 1b \n"
4163 "jmp 4f \n"
    // Copy loop (halved fraction is zero).
4164 ".p2align 4 \n"
4165 "2: \n"
4166 "movdqa (%1),%%xmm0 \n"
4167 "sub $0x4,%2 \n"
4168 "movdqa %%xmm0,(%1,%0,1) \n"
4169 "lea 0x10(%1),%1 \n"
4170 "jg 2b \n"
4171 "jmp 4f \n"
    // Averaging loop (halved fraction is 0x40).
4172 ".p2align 4 \n"
4173 "3: \n"
4174 "movdqa (%1),%%xmm0 \n"
4175 "pavgb (%1,%4,1),%%xmm0 \n"
4176 "sub $0x4,%2 \n"
4177 "movdqa %%xmm0,(%1,%0,1) \n"
4178 "lea 0x10(%1),%1 \n"
4179 "jg 3b \n"
4180 "4: \n"
4181 ".p2align 4 \n"
4182 : "+r"(dst_ptr), // %0
4183 "+r"(src_ptr), // %1
4184 "+r"(dst_width), // %2
4185 "+r"(source_y_fraction) // %3
4186 : "r"(static_cast<intptr_t>(src_stride)) // %4
4187 : "memory", "cc"
4188#if defined(__SSE2__)
4189 , "xmm0", "xmm1", "xmm2", "xmm5"
4190#endif
4191 );
4192}
4193
// Averages the row at src_uv with the row at src_uv + src_uv_stride using
// pavgb and writes the result to dst_uv.
// pix is a byte count: each iteration handles 16 bytes, and because the loop
// body runs before the counter is tested, a count that is not a multiple of
// 16 is rounded up. All accesses use movdqa / an SSE memory operand, so
// src_uv, the second row and dst_uv need 16 byte alignment.
fbarchard@google.come91bdac2012-10-09 21:09:33 +00004194void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
4195 uint8* dst_uv, int pix) {
4196 asm volatile (
    // dst_uv becomes an offset from src_uv so only %0 is advanced.
4197 "sub %0,%1 \n"
4198 ".p2align 4 \n"
4199 "1: \n"
4200 "movdqa (%0),%%xmm0 \n"
4201 "pavgb (%0,%3),%%xmm0 \n"
4202 "sub $0x10,%2 \n"
4203 "movdqa %%xmm0,(%0,%1) \n"
4204 "lea 0x10(%0),%0 \n"
4205 "jg 1b \n"
4206 : "+r"(src_uv), // %0
4207 "+r"(dst_uv), // %1
4208 "+r"(pix) // %2
4209 : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
4210 : "memory", "cc"
4211#if defined(__SSE2__)
4212 , "xmm0"
4213#endif
4214 );
4215}
fbarchard@google.com8d37dd52012-10-11 00:07:30 +00004216
4217void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
4218 uint32 selector, int pix) {
4219 asm volatile (
4220 "movd %3,%%xmm5 \n"
4221 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4222 ".p2align 4 \n"
4223 "1: \n"
4224 "movdqa (%0),%%xmm0 \n"
4225 "lea 0x10(%0),%0 \n"
4226 "pshufb %%xmm5,%%xmm0 \n"
4227 "sub $0x4,%2 \n"
4228 "movd %%xmm0,(%1) \n"
4229 "lea 0x4(%1),%1 \n"
4230 "jg 1b \n"
4231 : "+r"(src_argb), // %0
4232 "+r"(dst_bayer), // %1
4233 "+r"(pix) // %2
4234 : "g"(selector) // %3
4235 : "memory", "cc"
4236#if defined(__SSE2__)
4237 , "xmm0", "xmm5"
4238#endif
4239 );
4240}
fbarchard@google.com9de88672012-10-12 06:23:33 +00004241
4242void I422ToYUY2Row_SSE2(const uint8* src_y,
4243 const uint8* src_u,
4244 const uint8* src_v,
4245 uint8* dst_frame, int width) {
4246 asm volatile (
4247 "sub %1,%2 \n"
4248 ".p2align 4 \n"
4249 "1: \n"
4250 "movq (%1),%%xmm2 \n"
4251 "movq (%1,%2,1),%%xmm3 \n"
4252 "lea 0x8(%1),%1 \n"
4253 "punpcklbw %%xmm3,%%xmm2 \n"
4254 "movdqa (%0),%%xmm0 \n"
4255 "lea 0x10(%0),%0 \n"
4256 "movdqa %%xmm0,%%xmm1 \n"
4257 "punpcklbw %%xmm2,%%xmm0 \n"
4258 "punpckhbw %%xmm2,%%xmm1 \n"
4259 "movdqa %%xmm0,(%3) \n"
4260 "movdqa %%xmm1,0x10(%3) \n"
4261 "lea 0x20(%3),%3 \n"
4262 "sub $0x10,%4 \n"
4263 "jg 1b \n"
4264 : "+r"(src_y), // %0
4265 "+r"(src_u), // %1
4266 "+r"(src_v), // %2
4267 "+r"(dst_frame), // %3
4268 "+rm"(width) // %4
4269 :
4270 : "memory", "cc"
4271#if defined(__SSE2__)
4272 , "xmm0", "xmm1", "xmm2", "xmm3"
4273#endif
4274 );
4275}
4276
4277void I422ToUYVYRow_SSE2(const uint8* src_y,
4278 const uint8* src_u,
4279 const uint8* src_v,
4280 uint8* dst_frame, int width) {
4281 asm volatile (
4282 "sub %1,%2 \n"
4283 ".p2align 4 \n"
4284 "1: \n"
4285 "movq (%1),%%xmm2 \n"
4286 "movq (%1,%2,1),%%xmm3 \n"
4287 "lea 0x8(%1),%1 \n"
4288 "punpcklbw %%xmm3,%%xmm2 \n"
4289 "movdqa (%0),%%xmm0 \n"
4290 "movdqa %%xmm2,%%xmm1 \n"
4291 "lea 0x10(%0),%0 \n"
4292 "punpcklbw %%xmm0,%%xmm1 \n"
4293 "punpckhbw %%xmm0,%%xmm2 \n"
4294 "movdqa %%xmm1,(%3) \n"
4295 "movdqa %%xmm2,0x10(%3) \n"
4296 "lea 0x20(%3),%3 \n"
4297 "sub $0x10,%4 \n"
4298 "jg 1b \n"
4299 : "+r"(src_y), // %0
4300 "+r"(src_u), // %1
4301 "+r"(src_v), // %2
4302 "+r"(dst_frame), // %3
4303 "+rm"(width) // %4
4304 :
4305 : "memory", "cc"
4306#if defined(__SSE2__)
4307 , "xmm0", "xmm1", "xmm2", "xmm3"
4308#endif
4309 );
4310}
4311
fbarchard@google.com2d11d432012-02-16 02:50:39 +00004312#endif // defined(__x86_64__) || defined(__i386__)
4313
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004314#ifdef __cplusplus
mikhal@webrtc.orgaed1cc92011-09-28 00:06:25 +00004315} // extern "C"
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +00004316} // namespace libyuv
4317#endif