/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1 pixels, throw away the even pixels, and write 16x1 pixels.
void ScaleRowDown2_NEON(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "1:                                        \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.8     {q0, q1}, [%0]!                \n"
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    "bgt        1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"  // Clobber List
  );
}
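
// For reference, a scalar sketch of the loop above (the _Sketch_C name is
// illustrative, not a libyuv entry point): vld2.8 deinterleaves even/odd
// pixels and only the odd half (q1) is stored.
static void ScaleRowDown2_Sketch_C(const uint8* src_ptr, uint8* dst,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2 + 1];  // keep the odd pixel of each pair
  }
}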

// Read 32x1 pixels, average adjacent pairs, and write 16x1 pixels.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              uint8* dst,
                              int dst_width) {
  (void)src_stride;
  asm volatile (
    "1:                                        \n"
    MEMACCESS(0)
    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1                     \n"
    MEMACCESS(1)
    "vst1.8     {q0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"  // Clobber List
  );
}
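
// A scalar sketch of the horizontal averaging above (illustrative name):
// each output pixel is the rounded mean of one adjacent source pair, which
// is what the vpaddl / vrshrn #1 pair computes.
static void ScaleRowDown2Linear_Sketch_C(const uint8* src_ptr, uint8* dst,
                                         int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1);
  }
}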

// Read 32x2 pixels, average the 2x2 blocks, and write 16x1 pixels.
void ScaleRowDown2Box_NEON(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst,
                           int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %0                         \n"
    "1:                                        \n"
    MEMACCESS(0)
    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
    MEMACCESS(1)
    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
    "vpadal.u8  q1, q3                         \n"
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    MEMACCESS(2)
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "q0", "q1", "q2", "q3"  // Clobber List
  );
}
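
// A scalar sketch of the 2x2 box filter above (illustrative name): vpaddl
// sums horizontal pairs, vpadal accumulates the second row, and vrshrn #2
// divides by 4 with rounding.
static void ScaleRowDown2Box_Sketch_C(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst,
                                      int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // rounded mean
    s += 2;
    t += 2;
  }
}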

void ScaleRowDown4_NEON(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "1:                                        \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8     {d2}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
    "1:                                        \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"  // load up 16x4
    MEMACCESS(3)
    "vld1.8     {q1}, [%3]!                    \n"
    MEMACCESS(4)
    "vld1.8     {q2}, [%4]!                    \n"
    MEMACCESS(5)
    "vld1.8     {q3}, [%5]!                    \n"
    "subs       %2, %2, #4                     \n"
    "vpaddl.u8  q0, q0                         \n"
    "vpadal.u8  q0, q1                         \n"
    "vpadal.u8  q0, q2                         \n"
    "vpadal.u8  q0, q3                         \n"
    "vpaddl.u16 q0, q0                         \n"
    "vrshrn.u32 d0, q0, #4                     \n"  // divide by 16 w/rounding
    "vmovn.u16  d0, q0                         \n"
    MEMACCESS(1)
    "vst1.32    {d0[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_ptr1),   // %3
    "+r"(src_ptr2),   // %4
    "+r"(src_ptr3)    // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
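
// A scalar sketch of the 4x4 box filter above (illustrative name): the
// vpaddl/vpadal chain builds 16-pixel sums per output column, and
// vrshrn.u32 #4 divides by 16 with rounding.
static void ScaleRowDown4Box_Sketch_C(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr,
                                      int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (i = 0; i < 4; ++i) {    // 4 source rows
      for (j = 0; j < 4; ++j) {  // 4 source columns
        sum += src_ptr[i * src_stride + x * 4 + j];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);
  }
}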

// Down scale from 4 to 3 pixels. Use the NEON multi-lane read/write
// to split every group of 4 pixels across 4 registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    "1:                                        \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    "subs       %2, %2, #24                    \n"
    "vmov       d2, d3                         \n"  // order d0, d1, d2
    MEMACCESS(1)
    "vst3.8     {d0, d1, d2}, [%1]!            \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
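
// A scalar sketch of the point sampling above (illustrative name): after the
// vld4.8 deinterleave and the vmov d2, d3, pixels 0, 1 and 3 of every group
// of 4 survive.
static void ScaleRowDown34_Sketch_C(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];  // pixel 2 is dropped
    dst_ptr += 3;
    src_ptr += 4;
  }
}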

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
    "1:                                        \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    MEMACCESS(3)
    "vld4.8     {d4, d5, d6, d7}, [%3]!        \n"  // src line 1
    "subs       %2, %2, #24                    \n"

    // filter src line 0 with src line 1
    // widen bytes to shorts to make room
    // for the weighted sum of the two lines
    "vmovl.u8   q8, d4                         \n"
    "vmovl.u8   q9, d5                         \n"
    "vmovl.u8   q10, d6                        \n"
    "vmovl.u8   q11, d7                        \n"

    // 3 * line_0 + line_1
    "vmlal.u8   q8, d0, d24                    \n"
    "vmlal.u8   q9, d1, d24                    \n"
    "vmlal.u8   q10, d2, d24                   \n"
    "vmlal.u8   q11, d3, d24                   \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2                    \n"
    "vqrshrn.u16 d1, q9, #2                    \n"
    "vqrshrn.u16 d2, q10, #2                   \n"
    "vqrshrn.u16 d3, q11, #2                   \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8   q8, d1                         \n"
    "vmlal.u8   q8, d0, d24                    \n"
    "vqrshrn.u16 d0, q8, #2                    \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8  d1, d1, d2                     \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8   q8, d2                         \n"
    "vmlal.u8   q8, d3, d24                    \n"
    "vqrshrn.u16 d2, q8, #2                    \n"

    MEMACCESS(1)
    "vst3.8     {d0, d1, d2}, [%1]!            \n"

    "bgt        1b                             \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
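
// A scalar sketch of the arithmetic above (illustrative name): the two rows
// are first blended 3:1 vertically, then each group of 4 filtered pixels is
// reduced to 3 with 3/4, 1/2 and 1/4 taps, all with rounding.
static void ScaleRowDown34_0_Box_Sketch_C(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr,
                                          int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; x += 3) {
    int p0 = (3 * s[0] + t[0] + 2) >> 2;  // vertical (3 * row0 + row1) >> 2
    int p1 = (3 * s[1] + t[1] + 2) >> 2;
    int p2 = (3 * s[2] + t[2] + 2) >> 2;
    int p3 = (3 * s[3] + t[3] + 2) >> 2;
    dst_ptr[0] = (uint8)((3 * p0 + p1 + 2) >> 2);  // a0
    dst_ptr[1] = (uint8)((p1 + p2 + 1) >> 1);      // a1
    dst_ptr[2] = (uint8)((p2 + 3 * p3 + 2) >> 2);  // a2
    dst_ptr += 3;
    s += 4;
    t += 4;
  }
}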

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"
    "add        %3, %0                         \n"
    "1:                                        \n"
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
    MEMACCESS(3)
    "vld4.8     {d4, d5, d6, d7}, [%3]!        \n"  // src line 1
    "subs       %2, %2, #24                    \n"
    // average src line 0 with src line 1
    "vrhadd.u8  q0, q0, q2                     \n"
    "vrhadd.u8  q1, q1, q3                     \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8   q3, d1                         \n"
    "vmlal.u8   q3, d0, d24                    \n"
    "vqrshrn.u16 d0, q3, #2                    \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8  d1, d1, d2                     \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8   q3, d2                         \n"
    "vmlal.u8   q3, d3, d24                    \n"
    "vqrshrn.u16 d2, q3, #2                    \n"

    MEMACCESS(1)
    "vst3.8     {d0, d1, d2}, [%1]!            \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}

#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
static uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
                          18, 6, 14, 19, 0, 0, 0, 0};
static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
                             65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
                             65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
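
// vqrdmulh.s16 returns the high half of a doubled, rounded product, roughly
// (2 * a * b + 0x8000) >> 16. That is why the "divide by 6" constant is
// 65536 / 12 and the "divide by 9" constant is 65536 / 18. A scalar sketch
// of the idiom (illustrative name):
static uint8 ScaleDivBy6_Sketch(int sum) {
  return (uint8)((2 * sum * (65536 / 12) + 0x8000) >> 16);  // ~ sum / 6
}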

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    MEMACCESS(3)
    "vld1.8     {q3}, [%3]                     \n"
    "1:                                        \n"
    MEMACCESS(0)
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
    "subs       %2, %2, #12                    \n"
    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
    MEMACCESS(1)
    "vst1.8     {d4}, [%1]!                    \n"
    MEMACCESS(1)
    "vst1.32    {d5[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"(&kShuf38)    // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr,
                                      int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16    {q13}, [%5]                    \n"
    MEMACCESS(6)
    "vld1.8     {q14}, [%6]                    \n"
    MEMACCESS(7)
    "vld1.8     {q15}, [%7]                    \n"
    "add        %3, %0                         \n"
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"
    MEMACCESS(3)
    "vld4.8     {d4, d5, d6, d7}, [%3]!        \n"
    MEMACCESS(4)
    "vld4.8     {d16, d17, d18, d19}, [%4]!    \n"
    "subs       %2, %2, #12                    \n"

    // Shuffle the input data around so adjacent
    // data can be added: 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"
    "vtrn.u8    d16, d17                       \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"
    "vtrn.u8    d18, d19                       \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"
    "vpaddl.u8  q8, q8                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"
    "vpaddl.u8  d19, d19                       \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   q0, q8                         \n"
    "vadd.u16   d4, d3, d7                     \n"
    "vadd.u16   d4, d19                        \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13                  \n"
    "vmovn.u16  d4, q2                         \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"
    "vmovl.u8   q9, d18                        \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"
    "vadd.u16   q1, q9                         \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the divisor
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15                  \n"

    // Align for table lookup; vtbl requires registers to
    // be adjacent
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    MEMACCESS(1)
    "vst1.8     {d3}, [%1]!                    \n"
    MEMACCESS(1)
    "vst1.32    {d4[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),        // %0
    "+r"(dst_ptr),        // %1
    "+r"(dst_width),      // %2
    "+r"(src_stride),     // %3
    "+r"(src_ptr1)        // %4
  : "r"(&kMult38_Div6),   // %5
    "r"(&kShuf38_2),      // %6
    "r"(&kMult38_Div9)    // %7
  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16    {q13}, [%4]                    \n"
    MEMACCESS(5)
    "vld1.8     {q14}, [%5]                    \n"
    "add        %3, %0                         \n"
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"
    MEMACCESS(3)
    "vld4.8     {d4, d5, d6, d7}, [%3]!        \n"
    "subs       %2, %2, #12                    \n"

    // Shuffle the input data around so adjacent
    // data can be added: 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   d4, d3, d7                     \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2                    \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the divisor
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13                  \n"

    // Align for table lookup; vtbl requires registers to
    // be adjacent
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    MEMACCESS(1)
    "vst1.8     {d3}, [%1]!                    \n"
    MEMACCESS(1)
    "vst1.32    {d4[0]}, [%1]!                 \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}

void ScaleAddRows_NEON(const uint8* src_ptr,
                       ptrdiff_t src_stride,
                       uint16* dst_ptr,
                       int src_width,
                       int src_height) {
  const uint8* src_tmp;
  asm volatile (
    "1:                                        \n"
    "mov        %0, %1                         \n"
    "mov        r12, %5                        \n"
    "veor       q2, q2, q2                     \n"
    "veor       q3, q3, q3                     \n"
    "2:                                        \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "vld1.8     {q0}, [%0], %3                 \n"
    "vaddw.u8   q3, q3, d1                     \n"
    "vaddw.u8   q2, q2, d0                     \n"
    "subs       r12, r12, #1                   \n"
    "bgt        2b                             \n"
    MEMACCESS(2)
    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
    "add        %1, %1, #16                    \n"
    "subs       %4, %4, #16                    \n"  // 16 processed per loop
    "bgt        1b                             \n"
  : "=&r"(src_tmp),    // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_ptr),     // %2
    "+r"(src_stride),  // %3
    "+r"(src_width),   // %4
    "+r"(src_height)   // %5
  :
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
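
// A scalar sketch of the loop nest above (illustrative name): the inner loop
// (label 2) accumulates src_height rows into 16-bit column sums; the outer
// loop (label 1) steps across the row 16 columns at a time.
static void ScaleAddRows_Sketch_C(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += src_ptr[x + y * src_stride];
    }
    dst_ptr[x] = sum;
  }
}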

// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)                    \
  "lsr        %5, %3, #16                  \n" \
  "add        %6, %1, %5                   \n" \
  "add        %3, %3, %4                   \n" \
  MEMACCESS(6)                                 \
  "vld2.8     {d6["#n"], d7["#n"]}, [%6]   \n"
// clang-format on

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
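
// A scalar sketch of the column loop below (illustrative name): x is a 16.16
// fixed-point source coordinate stepped by dx, and each output pixel is the
// BLENDER() mix of the two source pixels that bracket it.
static void ScaleFilterCols_Sketch_C(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;    // integer source pixel
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    int f = x & 0xffff;  // 16-bit fraction
    dst_ptr[j] = (uint8)(a + (((f * (b - a)) + 0x8000) >> 16));
    x += dx;
  }
}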

void ScaleFilterCols_NEON(uint8* dst_ptr,
                          const uint8* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  asm volatile (
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
    "vmul.s32   q1, q1, q2                     \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32   q1, q1, q0                     \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "vadd.s32   q2, q1, q3                     \n"
    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
    "1:                                        \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "vmov       q10, q1                        \n"
    "vmov       q11, q2                        \n"
    "vuzp.16    q10, q11                       \n"
    "vmovl.u8   q8, d6                         \n"
    "vmovl.u8   q9, d7                         \n"
    "vsubl.s16  q11, d18, d16                  \n"
    "vsubl.s16  q12, d19, d17                  \n"
    "vmovl.u16  q13, d20                       \n"
    "vmovl.u16  q10, d21                       \n"
    "vmul.s32   q11, q11, q13                  \n"
    "vmul.s32   q12, q12, q10                  \n"
    "vrshrn.s32 d18, q11, #16                  \n"
    "vrshrn.s32 d19, q12, #16                  \n"
    "vadd.s16   q8, q8, q9                     \n"
    "vmovn.s16  d6, q8                         \n"

    MEMACCESS(0)
    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
    "vadd.s32   q1, q1, q0                     \n"
    "vadd.s32   q2, q2, q0                     \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "bgt        1b                             \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
}

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "cmp        %4, #0                         \n"
    "beq        100f                           \n"
    "add        %2, %1                         \n"
    "cmp        %4, #64                        \n"
    "beq        75f                            \n"
    "cmp        %4, #128                       \n"
    "beq        50f                            \n"
    "cmp        %4, #192                       \n"
    "beq        25f                            \n"

    "vdup.8     d5, %4                         \n"
    "rsb        %4, #256                       \n"
    "vdup.8     d4, %4                         \n"
    // General purpose row blend.
    "1:                                        \n"
    MEMACCESS(1)
    "vld1.8     {q0}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vmull.u8   q13, d0, d4                    \n"
    "vmull.u8   q14, d1, d4                    \n"
    "vmlal.u8   q13, d2, d5                    \n"
    "vmlal.u8   q14, d3, d5                    \n"
    "vrshrn.u16 d0, q13, #8                    \n"
    "vrshrn.u16 d1, q14, #8                    \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        1b                             \n"
    "b          99f                            \n"

    // Blend 25 / 75.
    "25:                                       \n"
    MEMACCESS(1)
    "vld1.8     {q0}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        25b                            \n"
    "b          99f                            \n"

    // Blend 50 / 50.
    "50:                                       \n"
    MEMACCESS(1)
    "vld1.8     {q0}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        50b                            \n"
    "b          99f                            \n"

    // Blend 75 / 25.
    "75:                                       \n"
    MEMACCESS(1)
    "vld1.8     {q1}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q0}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        75b                            \n"
    "b          99f                            \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100:                                      \n"
    MEMACCESS(1)
    "vld1.8     {q0}, [%1]!                    \n"
    "subs       %3, %3, #16                    \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        100b                           \n"

    "99:                                       \n"
    MEMACCESS(0)
    "vst1.8     {d1[7]}, [%0]                  \n"
  : "+r"(dst_ptr),           // %0
    "+r"(src_ptr),           // %1
    "+r"(src_stride),        // %2
    "+r"(dst_width),         // %3
    "+r"(source_y_fraction)  // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
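
// A scalar sketch of the general blend path above (illustrative name); the
// 25/50/75 fast paths use vrhadd and can round slightly differently. The
// final vst1.8 {d1[7]} repeats the last pixel one byte past dst_width.
static void ScaleFilterRows_Sketch_C(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int y1 = source_y_fraction;  // weight of row 1, 0..255
  int y0 = 256 - y1;           // weight of row 0
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[x] * y0 + t[x] * y1 + 128) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // repeat last pixel
}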

void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile (
    "1:                                        \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32    {q0, q1}, [%0]!                \n"
    MEMACCESS(0)
    "vld2.32    {q2, q3}, [%0]!                \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
    MEMACCESS(1)
    "vst1.8     {q3}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile (
    "1:                                        \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1                     \n"
    "vrshrn.u16 d2, q2, #1                     \n"
    "vrshrn.u16 d3, q3, #1                     \n"
    MEMACCESS(1)
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst,
                               int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
    "1:                                        \n"
    MEMACCESS(0)
    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    "vrshrn.u16 d2, q2, #2                     \n"
    "vrshrn.u16 d3, q3, #2                     \n"
    MEMACCESS(2)
    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
    "bgt        1b                             \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile (
    "mov        r12, %3, lsl #2                \n"
    "1:                                        \n"
    MEMACCESS(0)
    "vld1.32    {d0[0]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d0[1]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d1[0]}, [%0], r12             \n"
    MEMACCESS(0)
    "vld1.32    {d1[1]}, [%0], r12             \n"
    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8     {q0}, [%1]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"(src_stepx)   // %3
  : "memory", "cc", "r12", "q0"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width) {
  asm volatile (
    "mov        r12, %4, lsl #2                \n"
    "add        %1, %1, %0                     \n"
    "1:                                        \n"
    MEMACCESS(0)
    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8     {d1}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d2}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d3}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d4}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d5}, [%1], r12                \n"
    MEMACCESS(0)
    "vld1.8     {d6}, [%0], r12                \n"
    MEMACCESS(1)
    "vld1.8     {d7}, [%1], r12                \n"
    "vaddl.u8   q0, d0, d1                     \n"
    "vaddl.u8   q1, d2, d3                     \n"
    "vaddl.u8   q2, d4, d5                     \n"
    "vaddl.u8   q3, d6, d7                     \n"
    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8     {q0}, [%2]!                    \n"
    "bgt        1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}

// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n)               \
  "lsr        %5, %3, #16                  \n" \
  "add        %6, %1, %5, lsl #2           \n" \
  "add        %3, %3, %4                   \n" \
  MEMACCESS(6)                                 \
  "vld1.32    {" #dn "[" #n "]}, [%6]      \n"
// clang-format on

void ScaleARGBCols_NEON(uint8* dst_argb,
                        const uint8* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  int tmp;
  const uint8* src_tmp = src_argb;
  asm volatile (
    "1:                                        \n"
    LOAD1_DATA32_LANE(d0, 0)
    LOAD1_DATA32_LANE(d0, 1)
    LOAD1_DATA32_LANE(d1, 0)
    LOAD1_DATA32_LANE(d1, 1)
    LOAD1_DATA32_LANE(d2, 0)
    LOAD1_DATA32_LANE(d2, 1)
    LOAD1_DATA32_LANE(d3, 0)
    LOAD1_DATA32_LANE(d3, 1)

    MEMACCESS(0)
    "vst1.32    {q0, q1}, [%0]!                \n"  // store pixels
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "bgt        1b                             \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "=&r"(tmp),       // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1"
  );
}
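
// A scalar sketch of the loop above (illustrative name): each
// LOAD1_DATA32_LANE point samples one whole ARGB pixel at x >> 16 and then
// steps x by dx.
static void ScaleARGBCols_Sketch_C(uint8* dst_argb, const uint8* src_argb,
                                   int dst_width, int x, int dx) {
  uint32* dst = (uint32*)(dst_argb);
  const uint32* src = (const uint32*)(src_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}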

#undef LOAD1_DATA32_LANE

// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n)                       \
  "lsr        %5, %3, #16                                \n" \
  "add        %6, %1, %5, lsl #2                         \n" \
  "add        %3, %3, %4                                 \n" \
  MEMACCESS(6)                                               \
  "vld2.32    {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
// clang-format on

void ScaleARGBFilterCols_NEON(uint8* dst_argb,
                              const uint8* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  asm volatile (
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
    "vmul.s32   q1, q1, q2                     \n"
    "vmov.i8    q3, #0x7f                      \n"  // 0x7F
    "vmov.i16   q15, #0x7f                     \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32   q8, q1, q0                     \n"
    "1:                                        \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(d0, d2, 0)
    LOAD2_DATA32_LANE(d0, d2, 1)
    LOAD2_DATA32_LANE(d1, d3, 0)
    LOAD2_DATA32_LANE(d1, d3, 1)
    "vshrn.i32   d22, q8, #9                   \n"
    "vand.16     d22, d22, d30                 \n"
    "vdup.8      d24, d22[0]                   \n"
    "vdup.8      d25, d22[2]                   \n"
    "vdup.8      d26, d22[4]                   \n"
    "vdup.8      d27, d22[6]                   \n"
    "vext.8      d4, d24, d25, #4              \n"
    "vext.8      d5, d26, d27, #4              \n"  // f
    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f
    "vmull.u8    q11, d0, d20                  \n"
    "vmull.u8    q12, d1, d21                  \n"
    "vmull.u8    q13, d2, d4                   \n"
    "vmull.u8    q14, d3, d5                   \n"
    "vadd.i16    q11, q11, q13                 \n"
    "vadd.i16    q12, q12, q14                 \n"
    "vshrn.i16   d0, q11, #7                   \n"
    "vshrn.i16   d1, q12, #7                   \n"

    MEMACCESS(0)
    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
    "vadd.s32    q8, q8, q9                    \n"
    "subs        %2, %2, #4                    \n"  // 4 processed per loop
    "bgt         1b                            \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
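
// A scalar sketch of the per-channel blend above (illustrative name): the
// vshrn #9 / vand 0x7f pair extracts a 7-bit fraction from the 16.16
// coordinate, and the two weights f and 0x7f ^ f sum to 127, so this path
// only approximates the exact 16-bit blend.
static uint8 ScaleARGBFilterBlend_Sketch(uint8 a, uint8 b, int x) {
  int f = (x >> 9) & 0x7f;                        // 7-bit fraction
  return (uint8)((a * (0x7f ^ f) + b * f) >> 7);  // weights sum to 127
}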

#undef LOAD2_DATA32_LANE

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif