blob: ef5c235625e2b626113bef2326cb601941b6d7af [file] [log] [blame]
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070011#include "libyuv/rotate_row.h"
Frank Barchardcead1e02017-03-10 12:03:05 -080012#include "libyuv/row.h"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080013
14#include "libyuv/basic_types.h"
15
16#ifdef __cplusplus
17namespace libyuv {
18extern "C" {
19#endif
20
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070021#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
22 !defined(__aarch64__)
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080023
Frank Barchardb83bb382017-02-22 18:01:07 -080024static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
25 2, 6, 10, 14, 3, 7, 11, 15};
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080026
Frank Barchardb83bb382017-02-22 18:01:07 -080027void TransposeWx8_NEON(const uint8* src,
28 int src_stride,
29 uint8* dst,
30 int dst_stride,
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080031 int width) {
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070032 const uint8* src_temp;
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080033 asm volatile (
34 // loops are on blocks of 8. loop will stop when
35 // counter gets to or below 0. starting the counter
36 // at w-8 allow for this
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070037 "sub %5, #8 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080038
39 // handle 8x8 blocks. this should be the majority of the plane
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080040 "1: \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070041 "mov %0, %1 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080042
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070043 MEMACCESS(0)
44 "vld1.8 {d0}, [%0], %2 \n"
45 MEMACCESS(0)
46 "vld1.8 {d1}, [%0], %2 \n"
47 MEMACCESS(0)
48 "vld1.8 {d2}, [%0], %2 \n"
49 MEMACCESS(0)
50 "vld1.8 {d3}, [%0], %2 \n"
51 MEMACCESS(0)
52 "vld1.8 {d4}, [%0], %2 \n"
53 MEMACCESS(0)
54 "vld1.8 {d5}, [%0], %2 \n"
55 MEMACCESS(0)
56 "vld1.8 {d6}, [%0], %2 \n"
57 MEMACCESS(0)
58 "vld1.8 {d7}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080059
60 "vtrn.8 d1, d0 \n"
61 "vtrn.8 d3, d2 \n"
62 "vtrn.8 d5, d4 \n"
63 "vtrn.8 d7, d6 \n"
64
65 "vtrn.16 d1, d3 \n"
66 "vtrn.16 d0, d2 \n"
67 "vtrn.16 d5, d7 \n"
68 "vtrn.16 d4, d6 \n"
69
70 "vtrn.32 d1, d5 \n"
71 "vtrn.32 d0, d4 \n"
72 "vtrn.32 d3, d7 \n"
73 "vtrn.32 d2, d6 \n"
74
75 "vrev16.8 q0, q0 \n"
76 "vrev16.8 q1, q1 \n"
77 "vrev16.8 q2, q2 \n"
78 "vrev16.8 q3, q3 \n"
79
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070080 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080081
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070082 MEMACCESS(0)
83 "vst1.8 {d1}, [%0], %4 \n"
84 MEMACCESS(0)
85 "vst1.8 {d0}, [%0], %4 \n"
86 MEMACCESS(0)
87 "vst1.8 {d3}, [%0], %4 \n"
88 MEMACCESS(0)
89 "vst1.8 {d2}, [%0], %4 \n"
90 MEMACCESS(0)
91 "vst1.8 {d5}, [%0], %4 \n"
92 MEMACCESS(0)
93 "vst1.8 {d4}, [%0], %4 \n"
94 MEMACCESS(0)
95 "vst1.8 {d7}, [%0], %4 \n"
96 MEMACCESS(0)
97 "vst1.8 {d6}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080098
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070099 "add %1, #8 \n" // src += 8
100 "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
101 "subs %5, #8 \n" // w -= 8
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800102 "bge 1b \n"
103
104 // add 8 back to counter. if the result is 0 there are
105 // no residuals.
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700106 "adds %5, #8 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800107 "beq 4f \n"
108
109 // some residual, so between 1 and 7 lines left to transpose
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700110 "cmp %5, #2 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800111 "blt 3f \n"
112
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700113 "cmp %5, #4 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800114 "blt 2f \n"
115
116 // 4x8 block
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700117 "mov %0, %1 \n"
118 MEMACCESS(0)
119 "vld1.32 {d0[0]}, [%0], %2 \n"
120 MEMACCESS(0)
121 "vld1.32 {d0[1]}, [%0], %2 \n"
122 MEMACCESS(0)
123 "vld1.32 {d1[0]}, [%0], %2 \n"
124 MEMACCESS(0)
125 "vld1.32 {d1[1]}, [%0], %2 \n"
126 MEMACCESS(0)
127 "vld1.32 {d2[0]}, [%0], %2 \n"
128 MEMACCESS(0)
129 "vld1.32 {d2[1]}, [%0], %2 \n"
130 MEMACCESS(0)
131 "vld1.32 {d3[0]}, [%0], %2 \n"
132 MEMACCESS(0)
133 "vld1.32 {d3[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800134
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700135 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800136
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700137 MEMACCESS(6)
138 "vld1.8 {q3}, [%6] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800139
140 "vtbl.8 d4, {d0, d1}, d6 \n"
141 "vtbl.8 d5, {d0, d1}, d7 \n"
142 "vtbl.8 d0, {d2, d3}, d6 \n"
143 "vtbl.8 d1, {d2, d3}, d7 \n"
144
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700145 // TODO(frkoenig): Rework shuffle above to
146 // write out with 4 instead of 8 writes.
147 MEMACCESS(0)
148 "vst1.32 {d4[0]}, [%0], %4 \n"
149 MEMACCESS(0)
150 "vst1.32 {d4[1]}, [%0], %4 \n"
151 MEMACCESS(0)
152 "vst1.32 {d5[0]}, [%0], %4 \n"
153 MEMACCESS(0)
154 "vst1.32 {d5[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800155
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700156 "add %0, %3, #4 \n"
157 MEMACCESS(0)
158 "vst1.32 {d0[0]}, [%0], %4 \n"
159 MEMACCESS(0)
160 "vst1.32 {d0[1]}, [%0], %4 \n"
161 MEMACCESS(0)
162 "vst1.32 {d1[0]}, [%0], %4 \n"
163 MEMACCESS(0)
164 "vst1.32 {d1[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800165
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700166 "add %1, #4 \n" // src += 4
167 "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
168 "subs %5, #4 \n" // w -= 4
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800169 "beq 4f \n"
170
171 // some residual, check to see if it includes a 2x8 block,
172 // or less
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700173 "cmp %5, #2 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800174 "blt 3f \n"
175
176 // 2x8 block
177 "2: \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700178 "mov %0, %1 \n"
179 MEMACCESS(0)
180 "vld1.16 {d0[0]}, [%0], %2 \n"
181 MEMACCESS(0)
182 "vld1.16 {d1[0]}, [%0], %2 \n"
183 MEMACCESS(0)
184 "vld1.16 {d0[1]}, [%0], %2 \n"
185 MEMACCESS(0)
186 "vld1.16 {d1[1]}, [%0], %2 \n"
187 MEMACCESS(0)
188 "vld1.16 {d0[2]}, [%0], %2 \n"
189 MEMACCESS(0)
190 "vld1.16 {d1[2]}, [%0], %2 \n"
191 MEMACCESS(0)
192 "vld1.16 {d0[3]}, [%0], %2 \n"
193 MEMACCESS(0)
194 "vld1.16 {d1[3]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800195
196 "vtrn.8 d0, d1 \n"
197
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700198 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800199
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700200 MEMACCESS(0)
201 "vst1.64 {d0}, [%0], %4 \n"
202 MEMACCESS(0)
203 "vst1.64 {d1}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800204
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700205 "add %1, #2 \n" // src += 2
206 "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
207 "subs %5, #2 \n" // w -= 2
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800208 "beq 4f \n"
209
210 // 1x8 block
211 "3: \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700212 MEMACCESS(1)
213 "vld1.8 {d0[0]}, [%1], %2 \n"
214 MEMACCESS(1)
215 "vld1.8 {d0[1]}, [%1], %2 \n"
216 MEMACCESS(1)
217 "vld1.8 {d0[2]}, [%1], %2 \n"
218 MEMACCESS(1)
219 "vld1.8 {d0[3]}, [%1], %2 \n"
220 MEMACCESS(1)
221 "vld1.8 {d0[4]}, [%1], %2 \n"
222 MEMACCESS(1)
223 "vld1.8 {d0[5]}, [%1], %2 \n"
224 MEMACCESS(1)
225 "vld1.8 {d0[6]}, [%1], %2 \n"
226 MEMACCESS(1)
227 "vld1.8 {d0[7]}, [%1] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800228
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700229 MEMACCESS(3)
230 "vst1.64 {d0}, [%3] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800231
232 "4: \n"
233
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700234 : "=&r"(src_temp), // %0
235 "+r"(src), // %1
236 "+r"(src_stride), // %2
237 "+r"(dst), // %3
238 "+r"(dst_stride), // %4
239 "+r"(width) // %5
240 : "r"(&kVTbl4x4Transpose) // %6
241 : "memory", "cc", "q0", "q1", "q2", "q3"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800242 );
243}
244
Frank Barchardb83bb382017-02-22 18:01:07 -0800245static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
246 4, 12, 5, 13, 6, 14, 7, 15};
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800247
Frank Barchardb83bb382017-02-22 18:01:07 -0800248void TransposeUVWx8_NEON(const uint8* src,
249 int src_stride,
250 uint8* dst_a,
251 int dst_stride_a,
252 uint8* dst_b,
253 int dst_stride_b,
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800254 int width) {
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700255 const uint8* src_temp;
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800256 asm volatile (
257 // loops are on blocks of 8. loop will stop when
258 // counter gets to or below 0. starting the counter
259 // at w-8 allow for this
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700260 "sub %7, #8 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800261
262 // handle 8x8 blocks. this should be the majority of the plane
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800263 "1: \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700264 "mov %0, %1 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800265
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700266 MEMACCESS(0)
267 "vld2.8 {d0, d1}, [%0], %2 \n"
268 MEMACCESS(0)
269 "vld2.8 {d2, d3}, [%0], %2 \n"
270 MEMACCESS(0)
271 "vld2.8 {d4, d5}, [%0], %2 \n"
272 MEMACCESS(0)
273 "vld2.8 {d6, d7}, [%0], %2 \n"
274 MEMACCESS(0)
275 "vld2.8 {d16, d17}, [%0], %2 \n"
276 MEMACCESS(0)
277 "vld2.8 {d18, d19}, [%0], %2 \n"
278 MEMACCESS(0)
279 "vld2.8 {d20, d21}, [%0], %2 \n"
280 MEMACCESS(0)
281 "vld2.8 {d22, d23}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800282
283 "vtrn.8 q1, q0 \n"
284 "vtrn.8 q3, q2 \n"
285 "vtrn.8 q9, q8 \n"
286 "vtrn.8 q11, q10 \n"
287
288 "vtrn.16 q1, q3 \n"
289 "vtrn.16 q0, q2 \n"
290 "vtrn.16 q9, q11 \n"
291 "vtrn.16 q8, q10 \n"
292
293 "vtrn.32 q1, q9 \n"
294 "vtrn.32 q0, q8 \n"
295 "vtrn.32 q3, q11 \n"
296 "vtrn.32 q2, q10 \n"
297
298 "vrev16.8 q0, q0 \n"
299 "vrev16.8 q1, q1 \n"
300 "vrev16.8 q2, q2 \n"
301 "vrev16.8 q3, q3 \n"
302 "vrev16.8 q8, q8 \n"
303 "vrev16.8 q9, q9 \n"
304 "vrev16.8 q10, q10 \n"
305 "vrev16.8 q11, q11 \n"
306
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700307 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800308
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700309 MEMACCESS(0)
310 "vst1.8 {d2}, [%0], %4 \n"
311 MEMACCESS(0)
312 "vst1.8 {d0}, [%0], %4 \n"
313 MEMACCESS(0)
314 "vst1.8 {d6}, [%0], %4 \n"
315 MEMACCESS(0)
316 "vst1.8 {d4}, [%0], %4 \n"
317 MEMACCESS(0)
318 "vst1.8 {d18}, [%0], %4 \n"
319 MEMACCESS(0)
320 "vst1.8 {d16}, [%0], %4 \n"
321 MEMACCESS(0)
322 "vst1.8 {d22}, [%0], %4 \n"
323 MEMACCESS(0)
324 "vst1.8 {d20}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800325
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700326 "mov %0, %5 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800327
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700328 MEMACCESS(0)
329 "vst1.8 {d3}, [%0], %6 \n"
330 MEMACCESS(0)
331 "vst1.8 {d1}, [%0], %6 \n"
332 MEMACCESS(0)
333 "vst1.8 {d7}, [%0], %6 \n"
334 MEMACCESS(0)
335 "vst1.8 {d5}, [%0], %6 \n"
336 MEMACCESS(0)
337 "vst1.8 {d19}, [%0], %6 \n"
338 MEMACCESS(0)
339 "vst1.8 {d17}, [%0], %6 \n"
340 MEMACCESS(0)
341 "vst1.8 {d23}, [%0], %6 \n"
342 MEMACCESS(0)
343 "vst1.8 {d21}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800344
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700345 "add %1, #8*2 \n" // src += 8*2
346 "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
347 "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
348 "subs %7, #8 \n" // w -= 8
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800349 "bge 1b \n"
350
351 // add 8 back to counter. if the result is 0 there are
352 // no residuals.
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700353 "adds %7, #8 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800354 "beq 4f \n"
355
356 // some residual, so between 1 and 7 lines left to transpose
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700357 "cmp %7, #2 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800358 "blt 3f \n"
359
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700360 "cmp %7, #4 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800361 "blt 2f \n"
362
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700363 // TODO(frkoenig): Clean this up
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800364 // 4x8 block
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700365 "mov %0, %1 \n"
366 MEMACCESS(0)
367 "vld1.64 {d0}, [%0], %2 \n"
368 MEMACCESS(0)
369 "vld1.64 {d1}, [%0], %2 \n"
370 MEMACCESS(0)
371 "vld1.64 {d2}, [%0], %2 \n"
372 MEMACCESS(0)
373 "vld1.64 {d3}, [%0], %2 \n"
374 MEMACCESS(0)
375 "vld1.64 {d4}, [%0], %2 \n"
376 MEMACCESS(0)
377 "vld1.64 {d5}, [%0], %2 \n"
378 MEMACCESS(0)
379 "vld1.64 {d6}, [%0], %2 \n"
380 MEMACCESS(0)
381 "vld1.64 {d7}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800382
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700383 MEMACCESS(8)
384 "vld1.8 {q15}, [%8] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800385
386 "vtrn.8 q0, q1 \n"
387 "vtrn.8 q2, q3 \n"
388
389 "vtbl.8 d16, {d0, d1}, d30 \n"
390 "vtbl.8 d17, {d0, d1}, d31 \n"
391 "vtbl.8 d18, {d2, d3}, d30 \n"
392 "vtbl.8 d19, {d2, d3}, d31 \n"
393 "vtbl.8 d20, {d4, d5}, d30 \n"
394 "vtbl.8 d21, {d4, d5}, d31 \n"
395 "vtbl.8 d22, {d6, d7}, d30 \n"
396 "vtbl.8 d23, {d6, d7}, d31 \n"
397
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700398 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800399
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700400 MEMACCESS(0)
401 "vst1.32 {d16[0]}, [%0], %4 \n"
402 MEMACCESS(0)
403 "vst1.32 {d16[1]}, [%0], %4 \n"
404 MEMACCESS(0)
405 "vst1.32 {d17[0]}, [%0], %4 \n"
406 MEMACCESS(0)
407 "vst1.32 {d17[1]}, [%0], %4 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800408
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700409 "add %0, %3, #4 \n"
410 MEMACCESS(0)
411 "vst1.32 {d20[0]}, [%0], %4 \n"
412 MEMACCESS(0)
413 "vst1.32 {d20[1]}, [%0], %4 \n"
414 MEMACCESS(0)
415 "vst1.32 {d21[0]}, [%0], %4 \n"
416 MEMACCESS(0)
417 "vst1.32 {d21[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800418
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700419 "mov %0, %5 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800420
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700421 MEMACCESS(0)
422 "vst1.32 {d18[0]}, [%0], %6 \n"
423 MEMACCESS(0)
424 "vst1.32 {d18[1]}, [%0], %6 \n"
425 MEMACCESS(0)
426 "vst1.32 {d19[0]}, [%0], %6 \n"
427 MEMACCESS(0)
428 "vst1.32 {d19[1]}, [%0], %6 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800429
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700430 "add %0, %5, #4 \n"
431 MEMACCESS(0)
432 "vst1.32 {d22[0]}, [%0], %6 \n"
433 MEMACCESS(0)
434 "vst1.32 {d22[1]}, [%0], %6 \n"
435 MEMACCESS(0)
436 "vst1.32 {d23[0]}, [%0], %6 \n"
437 MEMACCESS(0)
438 "vst1.32 {d23[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800439
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700440 "add %1, #4*2 \n" // src += 4 * 2
441 "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
442 "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
443 "subs %7, #4 \n" // w -= 4
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800444 "beq 4f \n"
445
446 // some residual, check to see if it includes a 2x8 block,
447 // or less
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700448 "cmp %7, #2 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800449 "blt 3f \n"
450
451 // 2x8 block
452 "2: \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700453 "mov %0, %1 \n"
454 MEMACCESS(0)
455 "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
456 MEMACCESS(0)
457 "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
458 MEMACCESS(0)
459 "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
460 MEMACCESS(0)
461 "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
462 MEMACCESS(0)
463 "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
464 MEMACCESS(0)
465 "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
466 MEMACCESS(0)
467 "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
468 MEMACCESS(0)
469 "vld2.16 {d1[3], d3[3]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800470
471 "vtrn.8 d0, d1 \n"
472 "vtrn.8 d2, d3 \n"
473
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700474 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800475
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700476 MEMACCESS(0)
477 "vst1.64 {d0}, [%0], %4 \n"
478 MEMACCESS(0)
479 "vst1.64 {d2}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800480
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700481 "mov %0, %5 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800482
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700483 MEMACCESS(0)
484 "vst1.64 {d1}, [%0], %6 \n"
485 MEMACCESS(0)
486 "vst1.64 {d3}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800487
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700488 "add %1, #2*2 \n" // src += 2 * 2
489 "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
490 "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
491 "subs %7, #2 \n" // w -= 2
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800492 "beq 4f \n"
493
494 // 1x8 block
495 "3: \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700496 MEMACCESS(1)
497 "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
498 MEMACCESS(1)
499 "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
500 MEMACCESS(1)
501 "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
502 MEMACCESS(1)
503 "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
504 MEMACCESS(1)
505 "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
506 MEMACCESS(1)
507 "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
508 MEMACCESS(1)
509 "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
510 MEMACCESS(1)
511 "vld2.8 {d0[7], d1[7]}, [%1] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800512
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700513 MEMACCESS(3)
514 "vst1.64 {d0}, [%3] \n"
515 MEMACCESS(5)
516 "vst1.64 {d1}, [%5] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800517
518 "4: \n"
519
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700520 : "=&r"(src_temp), // %0
521 "+r"(src), // %1
522 "+r"(src_stride), // %2
523 "+r"(dst_a), // %3
524 "+r"(dst_stride_a), // %4
525 "+r"(dst_b), // %5
526 "+r"(dst_stride_b), // %6
527 "+r"(width) // %7
528 : "r"(&kVTbl4x4TransposeDi) // %8
529 : "memory", "cc",
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800530 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
531 );
532}
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700533#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800534
535#ifdef __cplusplus
536} // extern "C"
537} // namespace libyuv
538#endif