blob: 272d41fbd172be4c86994bbd2c303b544ba85a38 [file] [log] [blame]
/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include "libyuv/basic_types.h"
12
13namespace libyuv {
14
15#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
16
17void ReverseLine_NEON(const uint8* src, uint8* dst, int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +000018 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000019 // compute where to start writing destination
fbarchard@google.comf7a50482011-11-10 22:41:20 +000020 "add %1, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000021
22 // work on segments that are multiples of 16
fbarchard@google.comf7a50482011-11-10 22:41:20 +000023 "lsrs r3, %2, #4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000024
25 // the output is written in two block. 8 bytes followed
26 // by another 8. reading is done sequentially, from left to
27 // right. writing is done from right to left in block sizes
28 // %1, the destination pointer is incremented after writing
29 // the first of the two blocks. need to subtract that 8 off
30 // along with 16 to get the next location.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000031 "mov r3, #-24 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000032
fbarchard@google.comf7a50482011-11-10 22:41:20 +000033 "beq 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000034
35 // back of destination by the size of the register that is
36 // going to be reversed
fbarchard@google.comf7a50482011-11-10 22:41:20 +000037 "sub %1, #16 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000038
39 // the loop needs to run on blocks of 16. what will be left
40 // over is either a negative number, the residuals that need
41 // to be done, or 0. if this isn't subtracted off here the
42 // loop will run one extra time.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000043 "sub %2, #16 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000044
fbarchard@google.comf7a50482011-11-10 22:41:20 +000045 "1: \n"
46 "vld1.8 {q0}, [%0]! \n" // src += 16
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000047
48 // reverse the bytes in the 64 bit segments. unable to reverse
49 // the bytes in the entire 128 bits in one go.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000050 "vrev64.8 q0, q0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000051
52 // because of the inability to reverse the entire 128 bits
53 // reverse the writing out of the two 64 bit segments.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000054 "vst1.8 {d1}, [%1]! \n"
55 "vst1.8 {d0}, [%1], r3 \n" // dst -= 16
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000056
fbarchard@google.comf7a50482011-11-10 22:41:20 +000057 "subs %2, #16 \n"
58 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000059
60 // add 16 back to the counter. if the result is 0 there is no
61 // residuals so jump past
fbarchard@google.comf7a50482011-11-10 22:41:20 +000062 "adds %2, #16 \n"
63 "beq 5f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000064
fbarchard@google.comf7a50482011-11-10 22:41:20 +000065 "add %1, #16 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000066
fbarchard@google.comf7a50482011-11-10 22:41:20 +000067 "2: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000068
fbarchard@google.comf7a50482011-11-10 22:41:20 +000069 "mov r3, #-3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000070
fbarchard@google.comf7a50482011-11-10 22:41:20 +000071 "sub %1, #2 \n"
72 "subs %2, #2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000073 // check for 16*n+1 scenarios where segments_of_2 should not
74 // be run, but there is something left over.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000075 "blt 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000076
77// do this in neon registers as per
78// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
fbarchard@google.comf7a50482011-11-10 22:41:20 +000079 "3: \n"
80 "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000081
fbarchard@google.comf7a50482011-11-10 22:41:20 +000082 "vst1.8 {d1[0]}, [%1]! \n"
83 "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000084
fbarchard@google.comf7a50482011-11-10 22:41:20 +000085 "subs %2, #2 \n"
86 "bge 3b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000087
fbarchard@google.comf7a50482011-11-10 22:41:20 +000088 "adds %2, #2 \n"
89 "beq 5f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000090
fbarchard@google.comf7a50482011-11-10 22:41:20 +000091 "4: \n"
92 "add %1, #1 \n"
93 "vld1.8 {d0[0]}, [%0] \n"
94 "vst1.8 {d0[0]}, [%1] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000095
fbarchard@google.comf7a50482011-11-10 22:41:20 +000096 "5: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000097 : "+r"(src), // %0
98 "+r"(dst), // %1
99 "+r"(width) // %2
100 :
101 : "memory", "cc", "r3", "q0"
102 );
103}
104
105static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) =
106 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
107
108void TransposeWx8_NEON(const uint8* src, int src_stride,
109 uint8* dst, int dst_stride,
110 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000111 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000112 // loops are on blocks of 8. loop will stop when
113 // counter gets to or below 0. starting the counter
114 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000115 "sub %4, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000116
117 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000118 "1: \n"
119 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000120
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000121 "vld1.8 {d0}, [r9], %1 \n"
122 "vld1.8 {d1}, [r9], %1 \n"
123 "vld1.8 {d2}, [r9], %1 \n"
124 "vld1.8 {d3}, [r9], %1 \n"
125 "vld1.8 {d4}, [r9], %1 \n"
126 "vld1.8 {d5}, [r9], %1 \n"
127 "vld1.8 {d6}, [r9], %1 \n"
128 "vld1.8 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000129
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000130 "vtrn.8 d1, d0 \n"
131 "vtrn.8 d3, d2 \n"
132 "vtrn.8 d5, d4 \n"
133 "vtrn.8 d7, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000134
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000135 "vtrn.16 d1, d3 \n"
136 "vtrn.16 d0, d2 \n"
137 "vtrn.16 d5, d7 \n"
138 "vtrn.16 d4, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000139
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000140 "vtrn.32 d1, d5 \n"
141 "vtrn.32 d0, d4 \n"
142 "vtrn.32 d3, d7 \n"
143 "vtrn.32 d2, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000144
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000145 "vrev16.8 q0, q0 \n"
146 "vrev16.8 q1, q1 \n"
147 "vrev16.8 q2, q2 \n"
148 "vrev16.8 q3, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000149
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000150 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000151
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000152 "vst1.8 {d1}, [r9], %3 \n"
153 "vst1.8 {d0}, [r9], %3 \n"
154 "vst1.8 {d3}, [r9], %3 \n"
155 "vst1.8 {d2}, [r9], %3 \n"
156 "vst1.8 {d5}, [r9], %3 \n"
157 "vst1.8 {d4}, [r9], %3 \n"
158 "vst1.8 {d7}, [r9], %3 \n"
159 "vst1.8 {d6}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000160
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000161 "add %0, #8 \n" // src += 8
162 "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
163 "subs %4, #8 \n" // w -= 8
164 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000165
166 // add 8 back to counter. if the result is 0 there are
167 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000168 "adds %4, #8 \n"
169 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000170
171 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000172 "cmp %4, #2 \n"
173 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000174
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000175 "cmp %4, #4 \n"
176 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000177
178 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000179 "mov r9, %0 \n"
180 "vld1.32 {d0[0]}, [r9], %1 \n"
181 "vld1.32 {d0[1]}, [r9], %1 \n"
182 "vld1.32 {d1[0]}, [r9], %1 \n"
183 "vld1.32 {d1[1]}, [r9], %1 \n"
184 "vld1.32 {d2[0]}, [r9], %1 \n"
185 "vld1.32 {d2[1]}, [r9], %1 \n"
186 "vld1.32 {d3[0]}, [r9], %1 \n"
187 "vld1.32 {d3[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000188
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000189 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000190
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000191 "vld1.8 {q3}, [%5] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000192
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000193 "vtbl.8 d4, {d0, d1}, d6 \n"
194 "vtbl.8 d5, {d0, d1}, d7 \n"
195 "vtbl.8 d0, {d2, d3}, d6 \n"
196 "vtbl.8 d1, {d2, d3}, d7 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000197
198 // TODO: rework shuffle above to write
199 // out with 4 instead of 8 writes
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000200 "vst1.32 {d4[0]}, [r9], %3 \n"
201 "vst1.32 {d4[1]}, [r9], %3 \n"
202 "vst1.32 {d5[0]}, [r9], %3 \n"
203 "vst1.32 {d5[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000204
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000205 "add r9, %2, #4 \n"
206 "vst1.32 {d0[0]}, [r9], %3 \n"
207 "vst1.32 {d0[1]}, [r9], %3 \n"
208 "vst1.32 {d1[0]}, [r9], %3 \n"
209 "vst1.32 {d1[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000210
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000211 "add %0, #4 \n" // src += 4
212 "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
213 "subs %4, #4 \n" // w -= 4
214 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000215
216 // some residual, check to see if it includes a 2x8 block,
217 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000218 "cmp %4, #2 \n"
219 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000220
221 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000222 "2: \n"
223 "mov r9, %0 \n"
224 "vld1.16 {d0[0]}, [r9], %1 \n"
225 "vld1.16 {d1[0]}, [r9], %1 \n"
226 "vld1.16 {d0[1]}, [r9], %1 \n"
227 "vld1.16 {d1[1]}, [r9], %1 \n"
228 "vld1.16 {d0[2]}, [r9], %1 \n"
229 "vld1.16 {d1[2]}, [r9], %1 \n"
230 "vld1.16 {d0[3]}, [r9], %1 \n"
231 "vld1.16 {d1[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000232
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000233 "vtrn.8 d0, d1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000234
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000235 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000236
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000237 "vst1.64 {d0}, [r9], %3 \n"
238 "vst1.64 {d1}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000239
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000240 "add %0, #2 \n" // src += 2
241 "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
242 "subs %4, #2 \n" // w -= 2
243 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000244
245 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000246 "3: \n"
247 "vld1.8 {d0[0]}, [%0], %1 \n"
248 "vld1.8 {d0[1]}, [%0], %1 \n"
249 "vld1.8 {d0[2]}, [%0], %1 \n"
250 "vld1.8 {d0[3]}, [%0], %1 \n"
251 "vld1.8 {d0[4]}, [%0], %1 \n"
252 "vld1.8 {d0[5]}, [%0], %1 \n"
253 "vld1.8 {d0[6]}, [%0], %1 \n"
254 "vld1.8 {d0[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000255
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000256 "vst1.64 {d0}, [%2] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000257
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000258 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000259
260 : "+r"(src), // %0
261 "+r"(src_stride), // %1
262 "+r"(dst), // %2
263 "+r"(dst_stride), // %3
264 "+r"(width) // %4
265 : "r"(vtbl_4x4_transpose) // %5
266 : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
267 );
268}
269
270void ReverseLineUV_NEON(const uint8* src,
271 uint8* dst_a, uint8* dst_b,
272 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000273 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000274 // compute where to start writing destination
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000275 "add %1, %3 \n" // dst_a + width
276 "add %2, %3 \n" // dst_b + width
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000277
278 // work on input segments that are multiples of 16, but
279 // width that has been passed is output segments, half
280 // the size of input.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000281 "lsrs r12, %3, #3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000282
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000283 "beq 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000284
285 // the output is written in to two blocks.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000286 "mov r12, #-8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000287
288 // back of destination by the size of the register that is
289 // going to be reversed
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000290 "sub %1, #8 \n"
291 "sub %2, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000292
293 // the loop needs to run on blocks of 8. what will be left
294 // over is either a negative number, the residuals that need
295 // to be done, or 0. if this isn't subtracted off here the
296 // loop will run one extra time.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000297 "sub %3, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000298
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000299 "1: \n"
300 "vld2.8 {d0, d1}, [%0]! \n" // src += 16
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000301
302 // reverse the bytes in the 64 bit segments
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000303 "vrev64.8 q0, q0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000304
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000305 "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
306 "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000307
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000308 "subs %3, #8 \n"
309 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000310
311 // add 8 back to the counter. if the result is 0 there is no
312 // residuals so return
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000313 "adds %3, #8 \n"
314 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000315
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000316 "add %1, #8 \n"
317 "add %2, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000318
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000319 "2: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000320
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000321 "mov r12, #-1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000322
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000323 "sub %1, #1 \n"
324 "sub %2, #1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000325
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000326 "3: \n"
327 "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000328
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000329 "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
330 "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000331
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000332 "subs %3, %3, #1 \n"
333 "bgt 3b \n"
334 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000335 : "+r"(src), // %0
336 "+r"(dst_a), // %1
337 "+r"(dst_b), // %2
338 "+r"(width) // %3
339 :
340 : "memory", "cc", "r12", "q0"
341 );
342}
343
344static const uint8 vtbl_4x4_transpose_di[16] __attribute__((vector_size(16))) =
345 { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
346
347void TransposeUVWx8_NEON(const uint8* src, int src_stride,
348 uint8* dst_a, int dst_stride_a,
349 uint8* dst_b, int dst_stride_b,
350 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000351 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000352 // loops are on blocks of 8. loop will stop when
353 // counter gets to or below 0. starting the counter
354 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000355 "sub %6, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000356
357 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000358 "1: \n"
359 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000360
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000361 "vld2.8 {d0, d1}, [r9], %1 \n"
362 "vld2.8 {d2, d3}, [r9], %1 \n"
363 "vld2.8 {d4, d5}, [r9], %1 \n"
364 "vld2.8 {d6, d7}, [r9], %1 \n"
365 "vld2.8 {d16, d17}, [r9], %1 \n"
366 "vld2.8 {d18, d19}, [r9], %1 \n"
367 "vld2.8 {d20, d21}, [r9], %1 \n"
368 "vld2.8 {d22, d23}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000369
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000370 "vtrn.8 q1, q0 \n"
371 "vtrn.8 q3, q2 \n"
372 "vtrn.8 q9, q8 \n"
373 "vtrn.8 q11, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000374
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000375 "vtrn.16 q1, q3 \n"
376 "vtrn.16 q0, q2 \n"
377 "vtrn.16 q9, q11 \n"
378 "vtrn.16 q8, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000379
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000380 "vtrn.32 q1, q9 \n"
381 "vtrn.32 q0, q8 \n"
382 "vtrn.32 q3, q11 \n"
383 "vtrn.32 q2, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000384
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000385 "vrev16.8 q0, q0 \n"
386 "vrev16.8 q1, q1 \n"
387 "vrev16.8 q2, q2 \n"
388 "vrev16.8 q3, q3 \n"
389 "vrev16.8 q8, q8 \n"
390 "vrev16.8 q9, q9 \n"
391 "vrev16.8 q10, q10 \n"
392 "vrev16.8 q11, q11 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000393
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000394 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000395
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000396 "vst1.8 {d2}, [r9], %3 \n"
397 "vst1.8 {d0}, [r9], %3 \n"
398 "vst1.8 {d6}, [r9], %3 \n"
399 "vst1.8 {d4}, [r9], %3 \n"
400 "vst1.8 {d18}, [r9], %3 \n"
401 "vst1.8 {d16}, [r9], %3 \n"
402 "vst1.8 {d22}, [r9], %3 \n"
403 "vst1.8 {d20}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000404
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000405 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000406
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000407 "vst1.8 {d3}, [r9], %5 \n"
408 "vst1.8 {d1}, [r9], %5 \n"
409 "vst1.8 {d7}, [r9], %5 \n"
410 "vst1.8 {d5}, [r9], %5 \n"
411 "vst1.8 {d19}, [r9], %5 \n"
412 "vst1.8 {d17}, [r9], %5 \n"
413 "vst1.8 {d23}, [r9], %5 \n"
414 "vst1.8 {d21}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000415
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000416 "add %0, #8*2 \n" // src += 8*2
417 "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
418 "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
419 "subs %6, #8 \n" // w -= 8
420 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000421
422 // add 8 back to counter. if the result is 0 there are
423 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000424 "adds %6, #8 \n"
425 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000426
427 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000428 "cmp %6, #2 \n"
429 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000430
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000431 "cmp %6, #4 \n"
432 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000433
434 //TODO(frkoenig) : clean this up
435 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000436 "mov r9, %0 \n"
437 "vld1.64 {d0}, [r9], %1 \n"
438 "vld1.64 {d1}, [r9], %1 \n"
439 "vld1.64 {d2}, [r9], %1 \n"
440 "vld1.64 {d3}, [r9], %1 \n"
441 "vld1.64 {d4}, [r9], %1 \n"
442 "vld1.64 {d5}, [r9], %1 \n"
443 "vld1.64 {d6}, [r9], %1 \n"
444 "vld1.64 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000445
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000446 "vld1.8 {q15}, [%7] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000447
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000448 "vtrn.8 q0, q1 \n"
449 "vtrn.8 q2, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000450
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000451 "vtbl.8 d16, {d0, d1}, d30 \n"
452 "vtbl.8 d17, {d0, d1}, d31 \n"
453 "vtbl.8 d18, {d2, d3}, d30 \n"
454 "vtbl.8 d19, {d2, d3}, d31 \n"
455 "vtbl.8 d20, {d4, d5}, d30 \n"
456 "vtbl.8 d21, {d4, d5}, d31 \n"
457 "vtbl.8 d22, {d6, d7}, d30 \n"
458 "vtbl.8 d23, {d6, d7}, d31 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000459
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000460 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000461
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000462 "vst1.32 {d16[0]}, [r9], %3 \n"
463 "vst1.32 {d16[1]}, [r9], %3 \n"
464 "vst1.32 {d17[0]}, [r9], %3 \n"
465 "vst1.32 {d17[1]}, [r9], %3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000466
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000467 "add r9, %2, #4 \n"
468 "vst1.32 {d20[0]}, [r9], %3 \n"
469 "vst1.32 {d20[1]}, [r9], %3 \n"
470 "vst1.32 {d21[0]}, [r9], %3 \n"
471 "vst1.32 {d21[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000472
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000473 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000474
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000475 "vst1.32 {d18[0]}, [r9], %5 \n"
476 "vst1.32 {d18[1]}, [r9], %5 \n"
477 "vst1.32 {d19[0]}, [r9], %5 \n"
478 "vst1.32 {d19[1]}, [r9], %5 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000479
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000480 "add r9, %4, #4 \n"
481 "vst1.32 {d22[0]}, [r9], %5 \n"
482 "vst1.32 {d22[1]}, [r9], %5 \n"
483 "vst1.32 {d23[0]}, [r9], %5 \n"
484 "vst1.32 {d23[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000485
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000486 "add %0, #4*2 \n" // src += 4 * 2
487 "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
488 "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
489 "subs %6, #4 \n" // w -= 4
490 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000491
492 // some residual, check to see if it includes a 2x8 block,
493 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000494 "cmp %6, #2 \n"
495 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000496
497 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000498 "2: \n"
499 "mov r9, %0 \n"
500 "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
501 "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
502 "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
503 "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
504 "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
505 "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
506 "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
507 "vld2.16 {d1[3], d3[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000508
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000509 "vtrn.8 d0, d1 \n"
510 "vtrn.8 d2, d3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000511
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000512 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000513
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000514 "vst1.64 {d0}, [r9], %3 \n"
515 "vst1.64 {d2}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000516
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000517 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000518
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000519 "vst1.64 {d1}, [r9], %5 \n"
520 "vst1.64 {d3}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000521
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000522 "add %0, #2*2 \n" // src += 2 * 2
523 "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
524 "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
525 "subs %6, #2 \n" // w -= 2
526 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000527
528 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000529 "3: \n"
530 "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
531 "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
532 "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
533 "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
534 "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
535 "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
536 "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
537 "vld2.8 {d0[7], d1[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000538
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000539 "vst1.64 {d0}, [%2] \n"
540 "vst1.64 {d1}, [%4] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000541
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000542 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000543
544 : "+r"(src), // %0
545 "+r"(src_stride), // %1
546 "+r"(dst_a), // %2
547 "+r"(dst_stride_a), // %3
548 "+r"(dst_b), // %4
549 "+r"(dst_stride_b), // %5
550 "+r"(width) // %6
551 : "r"(vtbl_4x4_transpose_di)// %7
552 : "memory", "cc", "r9",
553 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
554 );
555}
556#endif
557}