/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)

/**
 * NEON downscalers with interpolation.
 *
 * Provided by Fritz Koenig.
 */

void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                        uint8* dst, int dst_width) {
  asm volatile (
    "1:                                        \n"
    // load even pixels into q0, odd into q1
    "vld2.u8    {q0,q1}, [%0]!                 \n"
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1"              // Clobber List
  );
}
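
// For reference, a minimal scalar sketch of the same point-sampling step as
// the loop above (the helper name is illustrative, not part of the API):
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr,
                                   ptrdiff_t /* src_stride */,
                                   uint8* dst, int dst_width) {
  // Keep the even pixel of each horizontal pair, as the vld2/vst1 pair does.
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x];
  }
}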

void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %0                         \n"
    "1:                                        \n"
    "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post inc
    "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post inc
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
    "vpadal.u8  q1, q3                         \n"
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    "vst1.u8    {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "q0", "q1", "q2", "q3"  // Clobber List
  );
}
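
// The NEON loop above is a 2x2 box filter: vpaddl/vpadal sum each 2x2 block
// and vrshrn divides by 4 with rounding. A scalar sketch of the same math
// (hypothetical helper, for illustration only):
static void ScaleRowDown2Int_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;  // row 2, as "add %1, %0" forms above
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}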

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
    "vld2.u8    {d0, d1}, [%0]!                \n"
    "subs       %2, #4                         \n"
    "vtrn.u8    d1, d0                         \n"
    "vshrn.u16  d0, q0, #8                     \n"
    "vst1.u32   {d0[1]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}
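
// The vld2/vtrn/vshrn shuffle above reduces to keeping one pixel of every
// four. A scalar sketch of that selection (sampling phase 0 is assumed here;
// the exact phase follows from the lane shuffle):
static void ScaleRowDown4_C_Sketch(const uint8* src_ptr,
                                   ptrdiff_t /* src_stride */,
                                   uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[4 * x];
  }
}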

void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "add        r4, %0, %3                     \n"
    "add        r5, r4, %3                     \n"
    "add        %3, r5, %3                     \n"
    "1:                                        \n"
    "vld1.u8    {q0}, [%0]!                    \n"  // load up 16x4
    "vld1.u8    {q1}, [r4]!                    \n"
    "vld1.u8    {q2}, [r5]!                    \n"
    "vld1.u8    {q3}, [%3]!                    \n"
    "subs       %2, #4                         \n"
    "vpaddl.u8  q0, q0                         \n"
    "vpadal.u8  q0, q1                         \n"
    "vpadal.u8  q0, q2                         \n"
    "vpadal.u8  q0, q3                         \n"
    "vpaddl.u16 q0, q0                         \n"
    "vrshrn.u32 d0, q0, #4                     \n"  // divide by 16 w/rounding
    "vmovn.u16  d0, q0                         \n"
    "vst1.u32   {d0[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(src_stride)         // %3
  : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
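
// Equivalent scalar math for the 4x4 box filter above: sixteen source pixels
// are summed (vpaddl/vpadal), then vrshrn divides by 16 with rounding.
// Hypothetical helper for illustration:
static void ScaleRowDown4Int_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        sum += src_ptr[row * src_stride + 4 * x + col];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);  // divide by 16 w/rounding
  }
}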

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load groups of 4 pixels into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t /* src_stride */,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
    "vld4.u8    {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    "subs       %2, #24                        \n"
    "vmov       d2, d3                         \n"  // order d0, d1, d2
    "vst3.u8    {d0, d1, d2}, [%1]!            \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
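
// The vld4/vmov/vst3 sequence above keeps pixels 0, 1 and 3 of every group
// of four (d2 is overwritten with d3). A scalar sketch of the same selection
// (illustrative helper name):
static void ScaleRowDown34_C_Sketch(const uint8* src_ptr,
                                    ptrdiff_t /* src_stride */,
                                    uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}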

void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
    "1:                                        \n"
    "vld4.u8    {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    "vld4.u8    {d4, d5, d6, d7}, [%3]!        \n"  // src line 1
    "subs       %2, #24                        \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8   q8, d4                         \n"
    "vmovl.u8   q9, d5                         \n"
    "vmovl.u8   q10, d6                        \n"
    "vmovl.u8   q11, d7                        \n"

    // 3 * line_0 + line_1
    "vmlal.u8   q8, d0, d24                    \n"
    "vmlal.u8   q9, d1, d24                    \n"
    "vmlal.u8   q10, d2, d24                   \n"
    "vmlal.u8   q11, d3, d24                   \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2                    \n"
    "vqrshrn.u16 d1, q9, #2                    \n"
    "vqrshrn.u16 d2, q10, #2                   \n"
    "vqrshrn.u16 d3, q11, #2                   \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8   q8, d1                         \n"
    "vmlal.u8   q8, d0, d24                    \n"
    "vqrshrn.u16 d0, q8, #2                    \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8  d1, d1, d2                     \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8   q8, d2                         \n"
    "vmlal.u8   q8, d3, d24                    \n"
    "vqrshrn.u16 d2, q8, #2                    \n"

    "vst3.u8    {d0, d1, d2}, [%1]!            \n"

    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
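
// Scalar sketch of the filter above, assuming the same arithmetic: each pixel
// is first blended with the next row at a 3:1 weight, then each group of four
// blended pixels p0..p3 produces three outputs. Hypothetical helper:
static void ScaleRowDown34_0_Int_C_Sketch(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr, int dst_width) {
  const uint8* s2 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    int p0 = (3 * src_ptr[0] + s2[0] + 2) >> 2;
    int p1 = (3 * src_ptr[1] + s2[1] + 2) >> 2;
    int p2 = (3 * src_ptr[2] + s2[2] + 2) >> 2;
    int p3 = (3 * src_ptr[3] + s2[3] + 2) >> 2;
    dst_ptr[x + 0] = (uint8)((3 * p0 + p1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8)((p1 + p2 + 1) >> 1);
    dst_ptr[x + 2] = (uint8)((p2 + 3 * p3 + 2) >> 2);
    src_ptr += 4;
    s2 += 4;
  }
}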

void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
    "1:                                        \n"
    "vld4.u8    {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    "vld4.u8    {d4, d5, d6, d7}, [%3]!        \n"  // src line 1
    "subs       %2, #24                        \n"
    // average src line 0 with src line 1
    "vrhadd.u8  q0, q0, q2                     \n"
    "vrhadd.u8  q1, q1, q3                     \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8   q3, d1                         \n"
    "vmlal.u8   q3, d0, d24                    \n"
    "vqrshrn.u16 d0, q3, #2                    \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8  d1, d1, d2                     \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8   q3, d2                         \n"
    "vmlal.u8   q3, d3, d24                    \n"
    "vqrshrn.u16 d2, q3, #2                    \n"

    "vst3.u8    {d0, d1, d2}, [%1]!            \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}
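
// Scalar sketch of the variant above: the horizontal filtering is identical,
// but the two rows are first averaged 1:1 (vrhadd) instead of weighted 3:1.
// Hypothetical helper:
static void ScaleRowDown34_1_Int_C_Sketch(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr, int dst_width) {
  const uint8* s2 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    int p0 = (src_ptr[0] + s2[0] + 1) >> 1;
    int p1 = (src_ptr[1] + s2[1] + 1) >> 1;
    int p2 = (src_ptr[2] + s2[2] + 1) >> 1;
    int p3 = (src_ptr[3] + s2[3] + 1) >> 1;
    dst_ptr[x + 0] = (uint8)((3 * p0 + p1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8)((p1 + p2 + 1) >> 1);
    dst_ptr[x + 2] = (uint8)((p2 + 3 * p3 + 2) >> 2);
    src_ptr += 4;
    s2 += 4;
  }
}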

#define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
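
// Note the multipliers are 65536 / 12 and 65536 / 18 rather than 65536 / 6
// and 65536 / 9: VQRDMULH computes roughly (2 * a * b + 0x8000) >> 16, so the
// built-in doubling halves the effective multiplier. A scalar sketch of the
// trick, using libyuv's int16 typedef (illustrative helper):
static inline int16 ScaleDiv6_Sketch(int16 sum) {
  // ~ sum / 6 with rounding, e.g. sum = 60 -> 10.
  return (int16)((2 * sum * (65536 / 12) + 0x8000) >> 16);
}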

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t /* src_stride */,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u8    {q3}, [%3]                     \n"
    "1:                                        \n"
    "vld1.u8    {d0, d1, d2, d3}, [%0]!        \n"
    "subs       %2, #12                        \n"
    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
    "vst1.u8    {d4}, [%1]!                    \n"
    "vst1.u32   {d5[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(&kShuf38)           // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}
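
// kShuf38 point samples 3 pixels out of every 8 (indices 0, 3 and 6 of each
// group, per the table above). Scalar sketch of the same selection
// (illustrative helper name):
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr,
                                    ptrdiff_t /* src_stride */,
                                    uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}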

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u16   {q13}, [%4]                    \n"
    "vld1.u8    {q14}, [%5]                    \n"
    "vld1.u8    {q15}, [%6]                    \n"
    "add        r4, %0, %3, lsl #1             \n"
    "add        %3, %0                         \n"
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.u8    {d0, d1, d2, d3}, [%0]!        \n"
    "vld4.u8    {d4, d5, d6, d7}, [%3]!        \n"
    "vld4.u8    {d16, d17, d18, d19}, [r4]!    \n"
    "subs       %2, #12                        \n"

    // Shuffle the input data around to align it so
    // adjacent data can be added: 0,1 - 2,3 - 4,5 - 6,7.
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"
    "vtrn.u8    d16, d17                       \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"
    "vtrn.u8    d18, d19                       \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"
    "vpaddl.u8  q8, q8                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"
    "vpaddl.u8  d19, d19                       \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   q0, q8                         \n"
    "vadd.u16   d4, d3, d7                     \n"
    "vadd.u16   d4, d19                        \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13                  \n"
    "vmovn.u16  d4, q2                         \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"
    "vmovl.u8   q9, d18                        \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"
    "vadd.u16   q1, q9                         \n"

    // d2 = xx 20 xx 30 xx 22 xx 32
    // d3 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d2 = xx 20 xx 21 xx 22 xx 23
    // d3 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the divisor
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15                  \n"

    // Align for table lookup; vtbl requires registers to
    // be adjacent.
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    "vst1.u8    {d3}, [%1]!                    \n"
    "vst1.u32   {d4[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  : "r"(&kMult38_Div6),     // %4
    "r"(&kShuf38_2),        // %5
    "r"(&kMult38_Div9)      // %6
  : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
    "q13", "q14", "q15", "memory", "cc"
  );
}
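
// Scalar sketch of the 32x3 -> 12x1 filter above: each output is the mean of
// a 3x3 (or final 2x3) block, matching the Div9/Div6 multipliers. Plain
// integer division stands in for the fixed-point vqrdmulh, which approximates
// rounded division. Helper name is illustrative only:
static void ScaleRowDown38_3_Int_C_Sketch(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr, int dst_width) {
  const uint8* s1 = src_ptr + src_stride;
  const uint8* s2 = src_ptr + src_stride * 2;
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8)((src_ptr[0] + src_ptr[1] + src_ptr[2] +
                              s1[0] + s1[1] + s1[2] +
                              s2[0] + s2[1] + s2[2]) / 9);
    dst_ptr[x + 1] = (uint8)((src_ptr[3] + src_ptr[4] + src_ptr[5] +
                              s1[3] + s1[4] + s1[5] +
                              s2[3] + s2[4] + s2[5]) / 9);
    dst_ptr[x + 2] = (uint8)((src_ptr[6] + src_ptr[7] +
                              s1[6] + s1[7] +
                              s2[6] + s2[7]) / 6);
    src_ptr += 8;
    s1 += 8;
    s2 += 8;
  }
}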

// 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u16   {q13}, [%4]                    \n"
    "vld1.u8    {q14}, [%5]                    \n"
    "add        %3, %0                         \n"
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.u8    {d0, d1, d2, d3}, [%0]!        \n"
    "vld4.u8    {d4, d5, d6, d7}, [%3]!        \n"
    "subs       %2, #12                        \n"

    // Shuffle the input data around to align it so
    // adjacent data can be added: 0,1 - 2,3 - 4,5 - 6,7.
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   d4, d3, d7                     \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2                    \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"

    // d2 = xx 20 xx 30 xx 22 xx 32
    // d3 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d2 = xx 20 xx 21 xx 22 xx 23
    // d3 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the divisor
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13                  \n"

    // Align for table lookup; vtbl requires registers to
    // be adjacent.
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    "vst1.u8    {d3}, [%1]!                    \n"
    "vst1.u32   {d4[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  : "r"(&kMult38_Div6),     // %4
    "r"(&kShuf38_2)         // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}
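
// Scalar sketch of the 32x2 -> 12x1 variant: same layout as the three-row
// version, but with two source rows the means are over 3x2 (or final 2x2)
// blocks; the 2x2 case is an exact rounded shift (vqrshrn #2). Hypothetical
// helper:
static void ScaleRowDown38_2_Int_C_Sketch(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr, int dst_width) {
  const uint8* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8)((src_ptr[0] + src_ptr[1] + src_ptr[2] +
                              s1[0] + s1[1] + s1[2]) / 6);
    dst_ptr[x + 1] = (uint8)((src_ptr[3] + src_ptr[4] + src_ptr[5] +
                              s1[3] + s1[4] + s1[5]) / 6);
    dst_ptr[x + 2] = (uint8)((src_ptr[6] + src_ptr[7] +
                              s1[6] + s1[7] + 2) >> 2);
    src_ptr += 8;
    s1 += 8;
  }
}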

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp        %4, #0                         \n"
    "beq        100f                           \n"
    "add        %2, %1                         \n"
    "cmp        %4, #64                        \n"
    "beq        75f                            \n"
    "cmp        %4, #128                       \n"
    "beq        50f                            \n"
    "cmp        %4, #192                       \n"
    "beq        25f                            \n"

    "vdup.8     d5, %4                         \n"
    "rsb        %4, #256                       \n"
    "vdup.8     d4, %4                         \n"
    // General purpose row blend.
    "1:                                        \n"
    "vld1.u8    {q0}, [%1]!                    \n"
    "vld1.u8    {q1}, [%2]!                    \n"
    "subs       %3, #16                        \n"
    "vmull.u8   q13, d0, d4                    \n"
    "vmull.u8   q14, d1, d4                    \n"
    "vmlal.u8   q13, d2, d5                    \n"
    "vmlal.u8   q14, d3, d5                    \n"
    "vrshrn.u16 d0, q13, #8                    \n"
    "vrshrn.u16 d1, q14, #8                    \n"
    "vst1.u8    {q0}, [%0]!                    \n"
    "bgt        1b                             \n"
    "b          99f                            \n"

    // Blend 25 / 75.
    "25:                                       \n"
    "vld1.u8    {q0}, [%1]!                    \n"
    "vld1.u8    {q1}, [%2]!                    \n"
    "subs       %3, #16                        \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    "vst1.u8    {q0}, [%0]!                    \n"
    "bgt        25b                            \n"
    "b          99f                            \n"

    // Blend 50 / 50.
    "50:                                       \n"
    "vld1.u8    {q0}, [%1]!                    \n"
    "vld1.u8    {q1}, [%2]!                    \n"
    "subs       %3, #16                        \n"
    "vrhadd.u8  q0, q1                         \n"
    "vst1.u8    {q0}, [%0]!                    \n"
    "bgt        50b                            \n"
    "b          99f                            \n"

    // Blend 75 / 25.
    "75:                                       \n"
    "vld1.u8    {q1}, [%1]!                    \n"
    "vld1.u8    {q0}, [%2]!                    \n"
    "subs       %3, #16                        \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    "vst1.u8    {q0}, [%0]!                    \n"
    "bgt        75b                            \n"
    "b          99f                            \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100:                                      \n"
    "vld1.u8    {q0}, [%1]!                    \n"
    "subs       %3, #16                        \n"
    "vst1.u8    {q0}, [%0]!                    \n"
    "bgt        100b                           \n"

    "99:                                       \n"
    "vst1.u8    {d1[7]}, [%0]                  \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction) // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
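
// Scalar sketch of the general row blend above: a linear interpolation of two
// rows with source_y_fraction / 256 as the weight of the second row, plus the
// trailing write that duplicates the last pixel (vst1.u8 {d1[7]}). The
// special-cased 0/64/128/192 paths compute the same result. Hypothetical
// helper:
static void ScaleFilterRows_C_Sketch(uint8* dst_ptr,
                                     const uint8* src_ptr, ptrdiff_t src_stride,
                                     int dst_width, int source_y_fraction) {
  int y1 = source_y_fraction;
  int y0 = 256 - y1;
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src_ptr1[x] * y1 + 128) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // duplicate last pixel
}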

#endif  // __ARM_NEON__

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif