blob: fdc0dd476c6d2933d8df0f251135dc0880eacfab [file] [log] [blame]
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070011#include "libyuv/rotate_row.h"
Frank Barchardcead1e02017-03-10 12:03:05 -080012#include "libyuv/row.h"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080013
14#include "libyuv/basic_types.h"
15
16#ifdef __cplusplus
17namespace libyuv {
18extern "C" {
19#endif
20
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070021#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
22 !defined(__aarch64__)
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080023
Chong Zhangab123ac2019-06-27 14:28:37 -070024static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
25 2, 6, 10, 14, 3, 7, 11, 15};
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080026
Chong Zhangab123ac2019-06-27 14:28:37 -070027void TransposeWx8_NEON(const uint8_t* src,
Frank Barchardb83bb382017-02-22 18:01:07 -080028 int src_stride,
Chong Zhangab123ac2019-06-27 14:28:37 -070029 uint8_t* dst,
Frank Barchardb83bb382017-02-22 18:01:07 -080030 int dst_stride,
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080031 int width) {
Chong Zhangab123ac2019-06-27 14:28:37 -070032 const uint8_t* src_temp;
33 asm volatile(
34 // loops are on blocks of 8. loop will stop when
35 // counter gets to or below 0. starting the counter
36 // at w-8 allow for this
37 "sub %5, #8 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080038
Chong Zhangab123ac2019-06-27 14:28:37 -070039 // handle 8x8 blocks. this should be the majority of the plane
40 "1: \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070041 "mov %0, %1 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080042
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070043 "vld1.8 {d0}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070044 "vld1.8 {d1}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070045 "vld1.8 {d2}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070046 "vld1.8 {d3}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070047 "vld1.8 {d4}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070048 "vld1.8 {d5}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070049 "vld1.8 {d6}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070050 "vld1.8 {d7}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080051
52 "vtrn.8 d1, d0 \n"
53 "vtrn.8 d3, d2 \n"
54 "vtrn.8 d5, d4 \n"
55 "vtrn.8 d7, d6 \n"
56
57 "vtrn.16 d1, d3 \n"
58 "vtrn.16 d0, d2 \n"
59 "vtrn.16 d5, d7 \n"
60 "vtrn.16 d4, d6 \n"
61
62 "vtrn.32 d1, d5 \n"
63 "vtrn.32 d0, d4 \n"
64 "vtrn.32 d3, d7 \n"
65 "vtrn.32 d2, d6 \n"
66
67 "vrev16.8 q0, q0 \n"
68 "vrev16.8 q1, q1 \n"
69 "vrev16.8 q2, q2 \n"
70 "vrev16.8 q3, q3 \n"
71
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070072 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080073
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070074 "vst1.8 {d1}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070075 "vst1.8 {d0}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070076 "vst1.8 {d3}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070077 "vst1.8 {d2}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070078 "vst1.8 {d5}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070079 "vst1.8 {d4}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070080 "vst1.8 {d7}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070081 "vst1.8 {d6}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080082
Hangyu Kuangf047e7c2016-07-06 14:21:45 -070083 "add %1, #8 \n" // src += 8
84 "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
85 "subs %5, #8 \n" // w -= 8
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080086 "bge 1b \n"
87
Chong Zhangab123ac2019-06-27 14:28:37 -070088 // add 8 back to counter. if the result is 0 there are
89 // no residuals.
90 "adds %5, #8 \n"
91 "beq 4f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080092
Chong Zhangab123ac2019-06-27 14:28:37 -070093 // some residual, so between 1 and 7 lines left to transpose
94 "cmp %5, #2 \n"
95 "blt 3f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080096
Chong Zhangab123ac2019-06-27 14:28:37 -070097 "cmp %5, #4 \n"
98 "blt 2f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -080099
Chong Zhangab123ac2019-06-27 14:28:37 -0700100 // 4x8 block
101 "mov %0, %1 \n"
102 "vld1.32 {d0[0]}, [%0], %2 \n"
103 "vld1.32 {d0[1]}, [%0], %2 \n"
104 "vld1.32 {d1[0]}, [%0], %2 \n"
105 "vld1.32 {d1[1]}, [%0], %2 \n"
106 "vld1.32 {d2[0]}, [%0], %2 \n"
107 "vld1.32 {d2[1]}, [%0], %2 \n"
108 "vld1.32 {d3[0]}, [%0], %2 \n"
109 "vld1.32 {d3[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800110
Chong Zhangab123ac2019-06-27 14:28:37 -0700111 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800112
Chong Zhangab123ac2019-06-27 14:28:37 -0700113 "vld1.8 {q3}, [%6] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800114
Chong Zhangab123ac2019-06-27 14:28:37 -0700115 "vtbl.8 d4, {d0, d1}, d6 \n"
116 "vtbl.8 d5, {d0, d1}, d7 \n"
117 "vtbl.8 d0, {d2, d3}, d6 \n"
118 "vtbl.8 d1, {d2, d3}, d7 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800119
Chong Zhangab123ac2019-06-27 14:28:37 -0700120 // TODO(frkoenig): Rework shuffle above to
121 // write out with 4 instead of 8 writes.
122 "vst1.32 {d4[0]}, [%0], %4 \n"
123 "vst1.32 {d4[1]}, [%0], %4 \n"
124 "vst1.32 {d5[0]}, [%0], %4 \n"
125 "vst1.32 {d5[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800126
Chong Zhangab123ac2019-06-27 14:28:37 -0700127 "add %0, %3, #4 \n"
128 "vst1.32 {d0[0]}, [%0], %4 \n"
129 "vst1.32 {d0[1]}, [%0], %4 \n"
130 "vst1.32 {d1[0]}, [%0], %4 \n"
131 "vst1.32 {d1[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800132
Chong Zhangab123ac2019-06-27 14:28:37 -0700133 "add %1, #4 \n" // src += 4
134 "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
135 "subs %5, #4 \n" // w -= 4
136 "beq 4f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800137
Chong Zhangab123ac2019-06-27 14:28:37 -0700138 // some residual, check to see if it includes a 2x8 block,
139 // or less
140 "cmp %5, #2 \n"
141 "blt 3f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800142
Chong Zhangab123ac2019-06-27 14:28:37 -0700143 // 2x8 block
144 "2: \n"
145 "mov %0, %1 \n"
146 "vld1.16 {d0[0]}, [%0], %2 \n"
147 "vld1.16 {d1[0]}, [%0], %2 \n"
148 "vld1.16 {d0[1]}, [%0], %2 \n"
149 "vld1.16 {d1[1]}, [%0], %2 \n"
150 "vld1.16 {d0[2]}, [%0], %2 \n"
151 "vld1.16 {d1[2]}, [%0], %2 \n"
152 "vld1.16 {d0[3]}, [%0], %2 \n"
153 "vld1.16 {d1[3]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800154
Chong Zhangab123ac2019-06-27 14:28:37 -0700155 "vtrn.8 d0, d1 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800156
Chong Zhangab123ac2019-06-27 14:28:37 -0700157 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800158
Chong Zhangab123ac2019-06-27 14:28:37 -0700159 "vst1.64 {d0}, [%0], %4 \n"
160 "vst1.64 {d1}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800161
Chong Zhangab123ac2019-06-27 14:28:37 -0700162 "add %1, #2 \n" // src += 2
163 "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
164 "subs %5, #2 \n" // w -= 2
165 "beq 4f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800166
Chong Zhangab123ac2019-06-27 14:28:37 -0700167 // 1x8 block
168 "3: \n"
169 "vld1.8 {d0[0]}, [%1], %2 \n"
170 "vld1.8 {d0[1]}, [%1], %2 \n"
171 "vld1.8 {d0[2]}, [%1], %2 \n"
172 "vld1.8 {d0[3]}, [%1], %2 \n"
173 "vld1.8 {d0[4]}, [%1], %2 \n"
174 "vld1.8 {d0[5]}, [%1], %2 \n"
175 "vld1.8 {d0[6]}, [%1], %2 \n"
176 "vld1.8 {d0[7]}, [%1] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800177
Chong Zhangab123ac2019-06-27 14:28:37 -0700178 "vst1.64 {d0}, [%3] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800179
Chong Zhangab123ac2019-06-27 14:28:37 -0700180 "4: \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800181
Chong Zhangab123ac2019-06-27 14:28:37 -0700182 : "=&r"(src_temp), // %0
183 "+r"(src), // %1
184 "+r"(src_stride), // %2
185 "+r"(dst), // %3
186 "+r"(dst_stride), // %4
187 "+r"(width) // %5
188 : "r"(&kVTbl4x4Transpose) // %6
189 : "memory", "cc", "q0", "q1", "q2", "q3");
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800190}
191
Chong Zhangab123ac2019-06-27 14:28:37 -0700192static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
193 4, 12, 5, 13, 6, 14, 7, 15};
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800194
Chong Zhangab123ac2019-06-27 14:28:37 -0700195void TransposeUVWx8_NEON(const uint8_t* src,
Frank Barchardb83bb382017-02-22 18:01:07 -0800196 int src_stride,
Chong Zhangab123ac2019-06-27 14:28:37 -0700197 uint8_t* dst_a,
Frank Barchardb83bb382017-02-22 18:01:07 -0800198 int dst_stride_a,
Chong Zhangab123ac2019-06-27 14:28:37 -0700199 uint8_t* dst_b,
Frank Barchardb83bb382017-02-22 18:01:07 -0800200 int dst_stride_b,
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800201 int width) {
Chong Zhangab123ac2019-06-27 14:28:37 -0700202 const uint8_t* src_temp;
203 asm volatile(
204 // loops are on blocks of 8. loop will stop when
205 // counter gets to or below 0. starting the counter
206 // at w-8 allow for this
207 "sub %7, #8 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800208
Chong Zhangab123ac2019-06-27 14:28:37 -0700209 // handle 8x8 blocks. this should be the majority of the plane
210 "1: \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700211 "mov %0, %1 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800212
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700213 "vld2.8 {d0, d1}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700214 "vld2.8 {d2, d3}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700215 "vld2.8 {d4, d5}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700216 "vld2.8 {d6, d7}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700217 "vld2.8 {d16, d17}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700218 "vld2.8 {d18, d19}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700219 "vld2.8 {d20, d21}, [%0], %2 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700220 "vld2.8 {d22, d23}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800221
222 "vtrn.8 q1, q0 \n"
223 "vtrn.8 q3, q2 \n"
224 "vtrn.8 q9, q8 \n"
225 "vtrn.8 q11, q10 \n"
226
227 "vtrn.16 q1, q3 \n"
228 "vtrn.16 q0, q2 \n"
229 "vtrn.16 q9, q11 \n"
230 "vtrn.16 q8, q10 \n"
231
232 "vtrn.32 q1, q9 \n"
233 "vtrn.32 q0, q8 \n"
234 "vtrn.32 q3, q11 \n"
235 "vtrn.32 q2, q10 \n"
236
237 "vrev16.8 q0, q0 \n"
238 "vrev16.8 q1, q1 \n"
239 "vrev16.8 q2, q2 \n"
240 "vrev16.8 q3, q3 \n"
241 "vrev16.8 q8, q8 \n"
242 "vrev16.8 q9, q9 \n"
243 "vrev16.8 q10, q10 \n"
244 "vrev16.8 q11, q11 \n"
245
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700246 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800247
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700248 "vst1.8 {d2}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700249 "vst1.8 {d0}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700250 "vst1.8 {d6}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700251 "vst1.8 {d4}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700252 "vst1.8 {d18}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700253 "vst1.8 {d16}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700254 "vst1.8 {d22}, [%0], %4 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700255 "vst1.8 {d20}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800256
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700257 "mov %0, %5 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800258
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700259 "vst1.8 {d3}, [%0], %6 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700260 "vst1.8 {d1}, [%0], %6 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700261 "vst1.8 {d7}, [%0], %6 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700262 "vst1.8 {d5}, [%0], %6 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700263 "vst1.8 {d19}, [%0], %6 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700264 "vst1.8 {d17}, [%0], %6 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700265 "vst1.8 {d23}, [%0], %6 \n"
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700266 "vst1.8 {d21}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800267
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700268 "add %1, #8*2 \n" // src += 8*2
269 "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
270 "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
271 "subs %7, #8 \n" // w -= 8
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800272 "bge 1b \n"
273
Chong Zhangab123ac2019-06-27 14:28:37 -0700274 // add 8 back to counter. if the result is 0 there are
275 // no residuals.
276 "adds %7, #8 \n"
277 "beq 4f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800278
Chong Zhangab123ac2019-06-27 14:28:37 -0700279 // some residual, so between 1 and 7 lines left to transpose
280 "cmp %7, #2 \n"
281 "blt 3f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800282
Chong Zhangab123ac2019-06-27 14:28:37 -0700283 "cmp %7, #4 \n"
284 "blt 2f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800285
Chong Zhangab123ac2019-06-27 14:28:37 -0700286 // TODO(frkoenig): Clean this up
287 // 4x8 block
288 "mov %0, %1 \n"
289 "vld1.64 {d0}, [%0], %2 \n"
290 "vld1.64 {d1}, [%0], %2 \n"
291 "vld1.64 {d2}, [%0], %2 \n"
292 "vld1.64 {d3}, [%0], %2 \n"
293 "vld1.64 {d4}, [%0], %2 \n"
294 "vld1.64 {d5}, [%0], %2 \n"
295 "vld1.64 {d6}, [%0], %2 \n"
296 "vld1.64 {d7}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800297
Chong Zhangab123ac2019-06-27 14:28:37 -0700298 "vld1.8 {q15}, [%8] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800299
Chong Zhangab123ac2019-06-27 14:28:37 -0700300 "vtrn.8 q0, q1 \n"
301 "vtrn.8 q2, q3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800302
Chong Zhangab123ac2019-06-27 14:28:37 -0700303 "vtbl.8 d16, {d0, d1}, d30 \n"
304 "vtbl.8 d17, {d0, d1}, d31 \n"
305 "vtbl.8 d18, {d2, d3}, d30 \n"
306 "vtbl.8 d19, {d2, d3}, d31 \n"
307 "vtbl.8 d20, {d4, d5}, d30 \n"
308 "vtbl.8 d21, {d4, d5}, d31 \n"
309 "vtbl.8 d22, {d6, d7}, d30 \n"
310 "vtbl.8 d23, {d6, d7}, d31 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800311
Chong Zhangab123ac2019-06-27 14:28:37 -0700312 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800313
Chong Zhangab123ac2019-06-27 14:28:37 -0700314 "vst1.32 {d16[0]}, [%0], %4 \n"
315 "vst1.32 {d16[1]}, [%0], %4 \n"
316 "vst1.32 {d17[0]}, [%0], %4 \n"
317 "vst1.32 {d17[1]}, [%0], %4 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800318
Chong Zhangab123ac2019-06-27 14:28:37 -0700319 "add %0, %3, #4 \n"
320 "vst1.32 {d20[0]}, [%0], %4 \n"
321 "vst1.32 {d20[1]}, [%0], %4 \n"
322 "vst1.32 {d21[0]}, [%0], %4 \n"
323 "vst1.32 {d21[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800324
Chong Zhangab123ac2019-06-27 14:28:37 -0700325 "mov %0, %5 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800326
Chong Zhangab123ac2019-06-27 14:28:37 -0700327 "vst1.32 {d18[0]}, [%0], %6 \n"
328 "vst1.32 {d18[1]}, [%0], %6 \n"
329 "vst1.32 {d19[0]}, [%0], %6 \n"
330 "vst1.32 {d19[1]}, [%0], %6 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800331
Chong Zhangab123ac2019-06-27 14:28:37 -0700332 "add %0, %5, #4 \n"
333 "vst1.32 {d22[0]}, [%0], %6 \n"
334 "vst1.32 {d22[1]}, [%0], %6 \n"
335 "vst1.32 {d23[0]}, [%0], %6 \n"
336 "vst1.32 {d23[1]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800337
Chong Zhangab123ac2019-06-27 14:28:37 -0700338 "add %1, #4*2 \n" // src += 4 * 2
339 "add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
340 // dst_stride_a
341 "add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
342 // dst_stride_b
343 "subs %7, #4 \n" // w -= 4
344 "beq 4f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800345
Chong Zhangab123ac2019-06-27 14:28:37 -0700346 // some residual, check to see if it includes a 2x8 block,
347 // or less
348 "cmp %7, #2 \n"
349 "blt 3f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800350
Chong Zhangab123ac2019-06-27 14:28:37 -0700351 // 2x8 block
352 "2: \n"
353 "mov %0, %1 \n"
354 "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
355 "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
356 "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
357 "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
358 "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
359 "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
360 "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
361 "vld2.16 {d1[3], d3[3]}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800362
Chong Zhangab123ac2019-06-27 14:28:37 -0700363 "vtrn.8 d0, d1 \n"
364 "vtrn.8 d2, d3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800365
Chong Zhangab123ac2019-06-27 14:28:37 -0700366 "mov %0, %3 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800367
Chong Zhangab123ac2019-06-27 14:28:37 -0700368 "vst1.64 {d0}, [%0], %4 \n"
369 "vst1.64 {d2}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800370
Chong Zhangab123ac2019-06-27 14:28:37 -0700371 "mov %0, %5 \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800372
Chong Zhangab123ac2019-06-27 14:28:37 -0700373 "vst1.64 {d1}, [%0], %6 \n"
374 "vst1.64 {d3}, [%0] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800375
Chong Zhangab123ac2019-06-27 14:28:37 -0700376 "add %1, #2*2 \n" // src += 2 * 2
377 "add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
378 // dst_stride_a
379 "add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
380 // dst_stride_b
381 "subs %7, #2 \n" // w -= 2
382 "beq 4f \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800383
Chong Zhangab123ac2019-06-27 14:28:37 -0700384 // 1x8 block
385 "3: \n"
386 "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
387 "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
388 "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
389 "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
390 "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
391 "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
392 "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
393 "vld2.8 {d0[7], d1[7]}, [%1] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800394
Chong Zhangab123ac2019-06-27 14:28:37 -0700395 "vst1.64 {d0}, [%3] \n"
396 "vst1.64 {d1}, [%5] \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800397
Chong Zhangab123ac2019-06-27 14:28:37 -0700398 "4: \n"
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800399
Chong Zhangab123ac2019-06-27 14:28:37 -0700400 : "=&r"(src_temp), // %0
401 "+r"(src), // %1
402 "+r"(src_stride), // %2
403 "+r"(dst_a), // %3
404 "+r"(dst_stride_a), // %4
405 "+r"(dst_b), // %5
406 "+r"(dst_stride_b), // %6
407 "+r"(width) // %7
408 : "r"(&kVTbl4x4TransposeDi) // %8
409 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800410}
Hangyu Kuangf047e7c2016-07-06 14:21:45 -0700411#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
Hendrik Dahlkamp33cfdeb2013-01-23 18:27:37 -0800412
413#ifdef __cplusplus
414} // extern "C"
415} // namespace libyuv
416#endif