blob: c40e1c33e8f623b7778e55018bebf2e141c29a5a [file] [log] [blame]
frkoenig@google.comf7e74a12011-11-03 22:41:59 +00001/*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
fbarchard@google.com15c3d452011-11-17 22:13:17 +000011#include "row.h"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000012
fbarchard@google.com17f198c2012-01-04 02:21:05 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000016namespace libyuv {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000017extern "C" {
18#endif
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000019
fbarchard@google.com15c3d452011-11-17 22:13:17 +000020#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000021
fbarchard@google.com2430e042011-11-11 21:57:06 +000022void ReverseRow_NEON(const uint8* src, uint8* dst, int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +000023 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000024 // compute where to start writing destination
fbarchard@google.comf7a50482011-11-10 22:41:20 +000025 "add %1, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000026
27 // work on segments that are multiples of 16
fbarchard@google.comf7a50482011-11-10 22:41:20 +000028 "lsrs r3, %2, #4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000029
30 // the output is written in two block. 8 bytes followed
31 // by another 8. reading is done sequentially, from left to
32 // right. writing is done from right to left in block sizes
33 // %1, the destination pointer is incremented after writing
34 // the first of the two blocks. need to subtract that 8 off
35 // along with 16 to get the next location.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000036 "mov r3, #-24 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000037
fbarchard@google.comf7a50482011-11-10 22:41:20 +000038 "beq 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000039
40 // back of destination by the size of the register that is
41 // going to be reversed
fbarchard@google.comf7a50482011-11-10 22:41:20 +000042 "sub %1, #16 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000043
44 // the loop needs to run on blocks of 16. what will be left
45 // over is either a negative number, the residuals that need
46 // to be done, or 0. if this isn't subtracted off here the
47 // loop will run one extra time.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000048 "sub %2, #16 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000049
fbarchard@google.comf7a50482011-11-10 22:41:20 +000050 "1: \n"
51 "vld1.8 {q0}, [%0]! \n" // src += 16
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000052
53 // reverse the bytes in the 64 bit segments. unable to reverse
54 // the bytes in the entire 128 bits in one go.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000055 "vrev64.8 q0, q0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000056
57 // because of the inability to reverse the entire 128 bits
58 // reverse the writing out of the two 64 bit segments.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000059 "vst1.8 {d1}, [%1]! \n"
60 "vst1.8 {d0}, [%1], r3 \n" // dst -= 16
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000061
fbarchard@google.comf7a50482011-11-10 22:41:20 +000062 "subs %2, #16 \n"
63 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000064
65 // add 16 back to the counter. if the result is 0 there is no
66 // residuals so jump past
fbarchard@google.comf7a50482011-11-10 22:41:20 +000067 "adds %2, #16 \n"
68 "beq 5f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000069
fbarchard@google.comf7a50482011-11-10 22:41:20 +000070 "add %1, #16 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000071
fbarchard@google.comf7a50482011-11-10 22:41:20 +000072 "2: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000073
fbarchard@google.comf7a50482011-11-10 22:41:20 +000074 "mov r3, #-3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000075
fbarchard@google.comf7a50482011-11-10 22:41:20 +000076 "sub %1, #2 \n"
77 "subs %2, #2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000078 // check for 16*n+1 scenarios where segments_of_2 should not
79 // be run, but there is something left over.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000080 "blt 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000081
82// do this in neon registers as per
83// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
fbarchard@google.comf7a50482011-11-10 22:41:20 +000084 "3: \n"
85 "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000086
fbarchard@google.comf7a50482011-11-10 22:41:20 +000087 "vst1.8 {d1[0]}, [%1]! \n"
88 "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000089
fbarchard@google.comf7a50482011-11-10 22:41:20 +000090 "subs %2, #2 \n"
91 "bge 3b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000092
fbarchard@google.comf7a50482011-11-10 22:41:20 +000093 "adds %2, #2 \n"
94 "beq 5f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000095
fbarchard@google.comf7a50482011-11-10 22:41:20 +000096 "4: \n"
97 "add %1, #1 \n"
98 "vld1.8 {d0[0]}, [%0] \n"
99 "vst1.8 {d0[0]}, [%1] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000100
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000101 "5: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000102 : "+r"(src), // %0
103 "+r"(dst), // %1
104 "+r"(width) // %2
105 :
106 : "memory", "cc", "r3", "q0"
107 );
108}
109
// vtbl.8 index table loaded by the 4x8 residual path of TransposeWx8_NEON.
// treating 16 source bytes as a row-major 4x4 byte matrix, entry i names
// the source byte that lands at position i of the transposed matrix.
// NOTE(review): vector_size(16) is applied to an array declarator here;
// confirm the resulting type/size is what was intended.
static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) =
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
112
113void TransposeWx8_NEON(const uint8* src, int src_stride,
114 uint8* dst, int dst_stride,
115 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000116 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000117 // loops are on blocks of 8. loop will stop when
118 // counter gets to or below 0. starting the counter
119 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000120 "sub %4, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000121
122 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000123 "1: \n"
124 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000125
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000126 "vld1.8 {d0}, [r9], %1 \n"
127 "vld1.8 {d1}, [r9], %1 \n"
128 "vld1.8 {d2}, [r9], %1 \n"
129 "vld1.8 {d3}, [r9], %1 \n"
130 "vld1.8 {d4}, [r9], %1 \n"
131 "vld1.8 {d5}, [r9], %1 \n"
132 "vld1.8 {d6}, [r9], %1 \n"
133 "vld1.8 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000134
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000135 "vtrn.8 d1, d0 \n"
136 "vtrn.8 d3, d2 \n"
137 "vtrn.8 d5, d4 \n"
138 "vtrn.8 d7, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000139
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000140 "vtrn.16 d1, d3 \n"
141 "vtrn.16 d0, d2 \n"
142 "vtrn.16 d5, d7 \n"
143 "vtrn.16 d4, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000144
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000145 "vtrn.32 d1, d5 \n"
146 "vtrn.32 d0, d4 \n"
147 "vtrn.32 d3, d7 \n"
148 "vtrn.32 d2, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000149
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000150 "vrev16.8 q0, q0 \n"
151 "vrev16.8 q1, q1 \n"
152 "vrev16.8 q2, q2 \n"
153 "vrev16.8 q3, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000154
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000155 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000156
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000157 "vst1.8 {d1}, [r9], %3 \n"
158 "vst1.8 {d0}, [r9], %3 \n"
159 "vst1.8 {d3}, [r9], %3 \n"
160 "vst1.8 {d2}, [r9], %3 \n"
161 "vst1.8 {d5}, [r9], %3 \n"
162 "vst1.8 {d4}, [r9], %3 \n"
163 "vst1.8 {d7}, [r9], %3 \n"
164 "vst1.8 {d6}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000165
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000166 "add %0, #8 \n" // src += 8
167 "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
168 "subs %4, #8 \n" // w -= 8
169 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000170
171 // add 8 back to counter. if the result is 0 there are
172 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000173 "adds %4, #8 \n"
174 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000175
176 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000177 "cmp %4, #2 \n"
178 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000179
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000180 "cmp %4, #4 \n"
181 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000182
183 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000184 "mov r9, %0 \n"
185 "vld1.32 {d0[0]}, [r9], %1 \n"
186 "vld1.32 {d0[1]}, [r9], %1 \n"
187 "vld1.32 {d1[0]}, [r9], %1 \n"
188 "vld1.32 {d1[1]}, [r9], %1 \n"
189 "vld1.32 {d2[0]}, [r9], %1 \n"
190 "vld1.32 {d2[1]}, [r9], %1 \n"
191 "vld1.32 {d3[0]}, [r9], %1 \n"
192 "vld1.32 {d3[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000193
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000194 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000195
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000196 "vld1.8 {q3}, [%5] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000197
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000198 "vtbl.8 d4, {d0, d1}, d6 \n"
199 "vtbl.8 d5, {d0, d1}, d7 \n"
200 "vtbl.8 d0, {d2, d3}, d6 \n"
201 "vtbl.8 d1, {d2, d3}, d7 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000202
203 // TODO: rework shuffle above to write
204 // out with 4 instead of 8 writes
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000205 "vst1.32 {d4[0]}, [r9], %3 \n"
206 "vst1.32 {d4[1]}, [r9], %3 \n"
207 "vst1.32 {d5[0]}, [r9], %3 \n"
208 "vst1.32 {d5[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000209
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000210 "add r9, %2, #4 \n"
211 "vst1.32 {d0[0]}, [r9], %3 \n"
212 "vst1.32 {d0[1]}, [r9], %3 \n"
213 "vst1.32 {d1[0]}, [r9], %3 \n"
214 "vst1.32 {d1[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000215
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000216 "add %0, #4 \n" // src += 4
217 "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
218 "subs %4, #4 \n" // w -= 4
219 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000220
221 // some residual, check to see if it includes a 2x8 block,
222 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000223 "cmp %4, #2 \n"
224 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000225
226 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000227 "2: \n"
228 "mov r9, %0 \n"
229 "vld1.16 {d0[0]}, [r9], %1 \n"
230 "vld1.16 {d1[0]}, [r9], %1 \n"
231 "vld1.16 {d0[1]}, [r9], %1 \n"
232 "vld1.16 {d1[1]}, [r9], %1 \n"
233 "vld1.16 {d0[2]}, [r9], %1 \n"
234 "vld1.16 {d1[2]}, [r9], %1 \n"
235 "vld1.16 {d0[3]}, [r9], %1 \n"
236 "vld1.16 {d1[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000237
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000238 "vtrn.8 d0, d1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000239
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000240 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000241
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000242 "vst1.64 {d0}, [r9], %3 \n"
243 "vst1.64 {d1}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000244
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000245 "add %0, #2 \n" // src += 2
246 "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
247 "subs %4, #2 \n" // w -= 2
248 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000249
250 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000251 "3: \n"
252 "vld1.8 {d0[0]}, [%0], %1 \n"
253 "vld1.8 {d0[1]}, [%0], %1 \n"
254 "vld1.8 {d0[2]}, [%0], %1 \n"
255 "vld1.8 {d0[3]}, [%0], %1 \n"
256 "vld1.8 {d0[4]}, [%0], %1 \n"
257 "vld1.8 {d0[5]}, [%0], %1 \n"
258 "vld1.8 {d0[6]}, [%0], %1 \n"
259 "vld1.8 {d0[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000260
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000261 "vst1.64 {d0}, [%2] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000262
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000263 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000264
265 : "+r"(src), // %0
266 "+r"(src_stride), // %1
267 "+r"(dst), // %2
268 "+r"(dst_stride), // %3
269 "+r"(width) // %4
270 : "r"(vtbl_4x4_transpose) // %5
271 : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
272 );
273}
274
fbarchard@google.com2430e042011-11-11 21:57:06 +0000275void ReverseRowUV_NEON(const uint8* src,
276 uint8* dst_a, uint8* dst_b,
277 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000278 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000279 // compute where to start writing destination
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000280 "add %1, %3 \n" // dst_a + width
281 "add %2, %3 \n" // dst_b + width
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000282
283 // work on input segments that are multiples of 16, but
284 // width that has been passed is output segments, half
285 // the size of input.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000286 "lsrs r12, %3, #3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000287
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000288 "beq 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000289
290 // the output is written in to two blocks.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000291 "mov r12, #-8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000292
293 // back of destination by the size of the register that is
294 // going to be reversed
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000295 "sub %1, #8 \n"
296 "sub %2, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000297
298 // the loop needs to run on blocks of 8. what will be left
299 // over is either a negative number, the residuals that need
300 // to be done, or 0. if this isn't subtracted off here the
301 // loop will run one extra time.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000302 "sub %3, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000303
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000304 "1: \n"
305 "vld2.8 {d0, d1}, [%0]! \n" // src += 16
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000306
307 // reverse the bytes in the 64 bit segments
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000308 "vrev64.8 q0, q0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000309
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000310 "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
311 "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000312
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000313 "subs %3, #8 \n"
314 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000315
316 // add 8 back to the counter. if the result is 0 there is no
317 // residuals so return
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000318 "adds %3, #8 \n"
319 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000320
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000321 "add %1, #8 \n"
322 "add %2, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000323
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000324 "2: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000325
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000326 "mov r12, #-1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000327
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000328 "sub %1, #1 \n"
329 "sub %2, #1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000330
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000331 "3: \n"
332 "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000333
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000334 "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
335 "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000336
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000337 "subs %3, %3, #1 \n"
338 "bgt 3b \n"
339 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000340 : "+r"(src), // %0
341 "+r"(dst_a), // %1
342 "+r"(dst_b), // %2
343 "+r"(width) // %3
344 :
345 : "memory", "cc", "r12", "q0"
346 );
347}
348
// vtbl.8 index table loaded by the 4x8 residual path of
// TransposeUVWx8_NEON. it alternates byte k of the first 8 source bytes
// with byte k of the second 8 source bytes (0, 8, 1, 9, ...).
// NOTE(review): vector_size(16) is applied to an array declarator here;
// confirm the resulting type/size is what was intended.
static const uint8 vtbl_4x4_transpose_di[16] __attribute__((vector_size(16))) =
  { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
351
352void TransposeUVWx8_NEON(const uint8* src, int src_stride,
353 uint8* dst_a, int dst_stride_a,
354 uint8* dst_b, int dst_stride_b,
355 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000356 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000357 // loops are on blocks of 8. loop will stop when
358 // counter gets to or below 0. starting the counter
359 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000360 "sub %6, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000361
362 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000363 "1: \n"
364 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000365
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000366 "vld2.8 {d0, d1}, [r9], %1 \n"
367 "vld2.8 {d2, d3}, [r9], %1 \n"
368 "vld2.8 {d4, d5}, [r9], %1 \n"
369 "vld2.8 {d6, d7}, [r9], %1 \n"
370 "vld2.8 {d16, d17}, [r9], %1 \n"
371 "vld2.8 {d18, d19}, [r9], %1 \n"
372 "vld2.8 {d20, d21}, [r9], %1 \n"
373 "vld2.8 {d22, d23}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000374
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000375 "vtrn.8 q1, q0 \n"
376 "vtrn.8 q3, q2 \n"
377 "vtrn.8 q9, q8 \n"
378 "vtrn.8 q11, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000379
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000380 "vtrn.16 q1, q3 \n"
381 "vtrn.16 q0, q2 \n"
382 "vtrn.16 q9, q11 \n"
383 "vtrn.16 q8, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000384
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000385 "vtrn.32 q1, q9 \n"
386 "vtrn.32 q0, q8 \n"
387 "vtrn.32 q3, q11 \n"
388 "vtrn.32 q2, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000389
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000390 "vrev16.8 q0, q0 \n"
391 "vrev16.8 q1, q1 \n"
392 "vrev16.8 q2, q2 \n"
393 "vrev16.8 q3, q3 \n"
394 "vrev16.8 q8, q8 \n"
395 "vrev16.8 q9, q9 \n"
396 "vrev16.8 q10, q10 \n"
397 "vrev16.8 q11, q11 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000398
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000399 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000400
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000401 "vst1.8 {d2}, [r9], %3 \n"
402 "vst1.8 {d0}, [r9], %3 \n"
403 "vst1.8 {d6}, [r9], %3 \n"
404 "vst1.8 {d4}, [r9], %3 \n"
405 "vst1.8 {d18}, [r9], %3 \n"
406 "vst1.8 {d16}, [r9], %3 \n"
407 "vst1.8 {d22}, [r9], %3 \n"
408 "vst1.8 {d20}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000409
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000410 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000411
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000412 "vst1.8 {d3}, [r9], %5 \n"
413 "vst1.8 {d1}, [r9], %5 \n"
414 "vst1.8 {d7}, [r9], %5 \n"
415 "vst1.8 {d5}, [r9], %5 \n"
416 "vst1.8 {d19}, [r9], %5 \n"
417 "vst1.8 {d17}, [r9], %5 \n"
418 "vst1.8 {d23}, [r9], %5 \n"
419 "vst1.8 {d21}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000420
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000421 "add %0, #8*2 \n" // src += 8*2
422 "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
423 "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
424 "subs %6, #8 \n" // w -= 8
425 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000426
427 // add 8 back to counter. if the result is 0 there are
428 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000429 "adds %6, #8 \n"
430 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000431
432 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000433 "cmp %6, #2 \n"
434 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000435
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000436 "cmp %6, #4 \n"
437 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000438
439 //TODO(frkoenig) : clean this up
440 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000441 "mov r9, %0 \n"
442 "vld1.64 {d0}, [r9], %1 \n"
443 "vld1.64 {d1}, [r9], %1 \n"
444 "vld1.64 {d2}, [r9], %1 \n"
445 "vld1.64 {d3}, [r9], %1 \n"
446 "vld1.64 {d4}, [r9], %1 \n"
447 "vld1.64 {d5}, [r9], %1 \n"
448 "vld1.64 {d6}, [r9], %1 \n"
449 "vld1.64 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000450
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000451 "vld1.8 {q15}, [%7] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000452
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000453 "vtrn.8 q0, q1 \n"
454 "vtrn.8 q2, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000455
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000456 "vtbl.8 d16, {d0, d1}, d30 \n"
457 "vtbl.8 d17, {d0, d1}, d31 \n"
458 "vtbl.8 d18, {d2, d3}, d30 \n"
459 "vtbl.8 d19, {d2, d3}, d31 \n"
460 "vtbl.8 d20, {d4, d5}, d30 \n"
461 "vtbl.8 d21, {d4, d5}, d31 \n"
462 "vtbl.8 d22, {d6, d7}, d30 \n"
463 "vtbl.8 d23, {d6, d7}, d31 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000464
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000465 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000466
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000467 "vst1.32 {d16[0]}, [r9], %3 \n"
468 "vst1.32 {d16[1]}, [r9], %3 \n"
469 "vst1.32 {d17[0]}, [r9], %3 \n"
470 "vst1.32 {d17[1]}, [r9], %3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000471
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000472 "add r9, %2, #4 \n"
473 "vst1.32 {d20[0]}, [r9], %3 \n"
474 "vst1.32 {d20[1]}, [r9], %3 \n"
475 "vst1.32 {d21[0]}, [r9], %3 \n"
476 "vst1.32 {d21[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000477
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000478 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000479
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000480 "vst1.32 {d18[0]}, [r9], %5 \n"
481 "vst1.32 {d18[1]}, [r9], %5 \n"
482 "vst1.32 {d19[0]}, [r9], %5 \n"
483 "vst1.32 {d19[1]}, [r9], %5 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000484
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000485 "add r9, %4, #4 \n"
486 "vst1.32 {d22[0]}, [r9], %5 \n"
487 "vst1.32 {d22[1]}, [r9], %5 \n"
488 "vst1.32 {d23[0]}, [r9], %5 \n"
489 "vst1.32 {d23[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000490
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000491 "add %0, #4*2 \n" // src += 4 * 2
492 "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
493 "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
494 "subs %6, #4 \n" // w -= 4
495 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000496
497 // some residual, check to see if it includes a 2x8 block,
498 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000499 "cmp %6, #2 \n"
500 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000501
502 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000503 "2: \n"
504 "mov r9, %0 \n"
505 "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
506 "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
507 "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
508 "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
509 "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
510 "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
511 "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
512 "vld2.16 {d1[3], d3[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000513
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000514 "vtrn.8 d0, d1 \n"
515 "vtrn.8 d2, d3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000516
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000517 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000518
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000519 "vst1.64 {d0}, [r9], %3 \n"
520 "vst1.64 {d2}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000521
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000522 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000523
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000524 "vst1.64 {d1}, [r9], %5 \n"
525 "vst1.64 {d3}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000526
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000527 "add %0, #2*2 \n" // src += 2 * 2
528 "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
529 "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
530 "subs %6, #2 \n" // w -= 2
531 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000532
533 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000534 "3: \n"
535 "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
536 "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
537 "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
538 "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
539 "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
540 "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
541 "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
542 "vld2.8 {d0[7], d1[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000543
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000544 "vst1.64 {d0}, [%2] \n"
545 "vst1.64 {d1}, [%4] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000546
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000547 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000548
549 : "+r"(src), // %0
550 "+r"(src_stride), // %1
551 "+r"(dst_a), // %2
552 "+r"(dst_stride_a), // %3
553 "+r"(dst_b), // %4
554 "+r"(dst_stride_b), // %5
555 "+r"(width) // %6
556 : "r"(vtbl_4x4_transpose_di)// %7
557 : "memory", "cc", "r9",
558 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
559 );
560}
561#endif
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +0000562
563#ifdef __cplusplus
564} // extern "C"
565} // namespace libyuv
566#endif