/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11#include "libyuv/basic_types.h"
fbarchard@google.com15c3d452011-11-17 22:13:17 +000012#include "row.h"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000013
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000014#ifdef __cplusplus
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000015namespace libyuv {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000016extern "C" {
17#endif
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000018
fbarchard@google.com15c3d452011-11-17 22:13:17 +000019#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000020
fbarchard@google.com2430e042011-11-11 21:57:06 +000021void ReverseRow_NEON(const uint8* src, uint8* dst, int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +000022 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000023 // compute where to start writing destination
fbarchard@google.comf7a50482011-11-10 22:41:20 +000024 "add %1, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000025
26 // work on segments that are multiples of 16
fbarchard@google.comf7a50482011-11-10 22:41:20 +000027 "lsrs r3, %2, #4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000028
29 // the output is written in two block. 8 bytes followed
30 // by another 8. reading is done sequentially, from left to
31 // right. writing is done from right to left in block sizes
32 // %1, the destination pointer is incremented after writing
33 // the first of the two blocks. need to subtract that 8 off
34 // along with 16 to get the next location.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000035 "mov r3, #-24 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000036
fbarchard@google.comf7a50482011-11-10 22:41:20 +000037 "beq 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000038
39 // back of destination by the size of the register that is
40 // going to be reversed
fbarchard@google.comf7a50482011-11-10 22:41:20 +000041 "sub %1, #16 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000042
43 // the loop needs to run on blocks of 16. what will be left
44 // over is either a negative number, the residuals that need
45 // to be done, or 0. if this isn't subtracted off here the
46 // loop will run one extra time.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000047 "sub %2, #16 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000048
fbarchard@google.comf7a50482011-11-10 22:41:20 +000049 "1: \n"
50 "vld1.8 {q0}, [%0]! \n" // src += 16
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000051
52 // reverse the bytes in the 64 bit segments. unable to reverse
53 // the bytes in the entire 128 bits in one go.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000054 "vrev64.8 q0, q0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000055
56 // because of the inability to reverse the entire 128 bits
57 // reverse the writing out of the two 64 bit segments.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000058 "vst1.8 {d1}, [%1]! \n"
59 "vst1.8 {d0}, [%1], r3 \n" // dst -= 16
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000060
fbarchard@google.comf7a50482011-11-10 22:41:20 +000061 "subs %2, #16 \n"
62 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000063
64 // add 16 back to the counter. if the result is 0 there is no
65 // residuals so jump past
fbarchard@google.comf7a50482011-11-10 22:41:20 +000066 "adds %2, #16 \n"
67 "beq 5f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000068
fbarchard@google.comf7a50482011-11-10 22:41:20 +000069 "add %1, #16 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000070
fbarchard@google.comf7a50482011-11-10 22:41:20 +000071 "2: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000072
fbarchard@google.comf7a50482011-11-10 22:41:20 +000073 "mov r3, #-3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000074
fbarchard@google.comf7a50482011-11-10 22:41:20 +000075 "sub %1, #2 \n"
76 "subs %2, #2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000077 // check for 16*n+1 scenarios where segments_of_2 should not
78 // be run, but there is something left over.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000079 "blt 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000080
81// do this in neon registers as per
82// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
fbarchard@google.comf7a50482011-11-10 22:41:20 +000083 "3: \n"
84 "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000085
fbarchard@google.comf7a50482011-11-10 22:41:20 +000086 "vst1.8 {d1[0]}, [%1]! \n"
87 "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000088
fbarchard@google.comf7a50482011-11-10 22:41:20 +000089 "subs %2, #2 \n"
90 "bge 3b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000091
fbarchard@google.comf7a50482011-11-10 22:41:20 +000092 "adds %2, #2 \n"
93 "beq 5f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000094
fbarchard@google.comf7a50482011-11-10 22:41:20 +000095 "4: \n"
96 "add %1, #1 \n"
97 "vld1.8 {d0[0]}, [%0] \n"
98 "vst1.8 {d0[0]}, [%1] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000099
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000100 "5: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000101 : "+r"(src), // %0
102 "+r"(dst), // %1
103 "+r"(width) // %2
104 :
105 : "memory", "cc", "r3", "q0"
106 );
107}
108
109static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) =
110 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
111
112void TransposeWx8_NEON(const uint8* src, int src_stride,
113 uint8* dst, int dst_stride,
114 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000115 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000116 // loops are on blocks of 8. loop will stop when
117 // counter gets to or below 0. starting the counter
118 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000119 "sub %4, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000120
121 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000122 "1: \n"
123 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000124
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000125 "vld1.8 {d0}, [r9], %1 \n"
126 "vld1.8 {d1}, [r9], %1 \n"
127 "vld1.8 {d2}, [r9], %1 \n"
128 "vld1.8 {d3}, [r9], %1 \n"
129 "vld1.8 {d4}, [r9], %1 \n"
130 "vld1.8 {d5}, [r9], %1 \n"
131 "vld1.8 {d6}, [r9], %1 \n"
132 "vld1.8 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000133
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000134 "vtrn.8 d1, d0 \n"
135 "vtrn.8 d3, d2 \n"
136 "vtrn.8 d5, d4 \n"
137 "vtrn.8 d7, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000138
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000139 "vtrn.16 d1, d3 \n"
140 "vtrn.16 d0, d2 \n"
141 "vtrn.16 d5, d7 \n"
142 "vtrn.16 d4, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000143
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000144 "vtrn.32 d1, d5 \n"
145 "vtrn.32 d0, d4 \n"
146 "vtrn.32 d3, d7 \n"
147 "vtrn.32 d2, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000148
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000149 "vrev16.8 q0, q0 \n"
150 "vrev16.8 q1, q1 \n"
151 "vrev16.8 q2, q2 \n"
152 "vrev16.8 q3, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000153
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000154 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000155
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000156 "vst1.8 {d1}, [r9], %3 \n"
157 "vst1.8 {d0}, [r9], %3 \n"
158 "vst1.8 {d3}, [r9], %3 \n"
159 "vst1.8 {d2}, [r9], %3 \n"
160 "vst1.8 {d5}, [r9], %3 \n"
161 "vst1.8 {d4}, [r9], %3 \n"
162 "vst1.8 {d7}, [r9], %3 \n"
163 "vst1.8 {d6}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000164
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000165 "add %0, #8 \n" // src += 8
166 "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
167 "subs %4, #8 \n" // w -= 8
168 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000169
170 // add 8 back to counter. if the result is 0 there are
171 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000172 "adds %4, #8 \n"
173 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000174
175 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000176 "cmp %4, #2 \n"
177 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000178
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000179 "cmp %4, #4 \n"
180 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000181
182 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000183 "mov r9, %0 \n"
184 "vld1.32 {d0[0]}, [r9], %1 \n"
185 "vld1.32 {d0[1]}, [r9], %1 \n"
186 "vld1.32 {d1[0]}, [r9], %1 \n"
187 "vld1.32 {d1[1]}, [r9], %1 \n"
188 "vld1.32 {d2[0]}, [r9], %1 \n"
189 "vld1.32 {d2[1]}, [r9], %1 \n"
190 "vld1.32 {d3[0]}, [r9], %1 \n"
191 "vld1.32 {d3[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000192
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000193 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000194
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000195 "vld1.8 {q3}, [%5] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000196
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000197 "vtbl.8 d4, {d0, d1}, d6 \n"
198 "vtbl.8 d5, {d0, d1}, d7 \n"
199 "vtbl.8 d0, {d2, d3}, d6 \n"
200 "vtbl.8 d1, {d2, d3}, d7 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000201
202 // TODO: rework shuffle above to write
203 // out with 4 instead of 8 writes
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000204 "vst1.32 {d4[0]}, [r9], %3 \n"
205 "vst1.32 {d4[1]}, [r9], %3 \n"
206 "vst1.32 {d5[0]}, [r9], %3 \n"
207 "vst1.32 {d5[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000208
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000209 "add r9, %2, #4 \n"
210 "vst1.32 {d0[0]}, [r9], %3 \n"
211 "vst1.32 {d0[1]}, [r9], %3 \n"
212 "vst1.32 {d1[0]}, [r9], %3 \n"
213 "vst1.32 {d1[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000214
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000215 "add %0, #4 \n" // src += 4
216 "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
217 "subs %4, #4 \n" // w -= 4
218 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000219
220 // some residual, check to see if it includes a 2x8 block,
221 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000222 "cmp %4, #2 \n"
223 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000224
225 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000226 "2: \n"
227 "mov r9, %0 \n"
228 "vld1.16 {d0[0]}, [r9], %1 \n"
229 "vld1.16 {d1[0]}, [r9], %1 \n"
230 "vld1.16 {d0[1]}, [r9], %1 \n"
231 "vld1.16 {d1[1]}, [r9], %1 \n"
232 "vld1.16 {d0[2]}, [r9], %1 \n"
233 "vld1.16 {d1[2]}, [r9], %1 \n"
234 "vld1.16 {d0[3]}, [r9], %1 \n"
235 "vld1.16 {d1[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000236
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000237 "vtrn.8 d0, d1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000238
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000239 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000240
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000241 "vst1.64 {d0}, [r9], %3 \n"
242 "vst1.64 {d1}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000243
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000244 "add %0, #2 \n" // src += 2
245 "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
246 "subs %4, #2 \n" // w -= 2
247 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000248
249 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000250 "3: \n"
251 "vld1.8 {d0[0]}, [%0], %1 \n"
252 "vld1.8 {d0[1]}, [%0], %1 \n"
253 "vld1.8 {d0[2]}, [%0], %1 \n"
254 "vld1.8 {d0[3]}, [%0], %1 \n"
255 "vld1.8 {d0[4]}, [%0], %1 \n"
256 "vld1.8 {d0[5]}, [%0], %1 \n"
257 "vld1.8 {d0[6]}, [%0], %1 \n"
258 "vld1.8 {d0[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000259
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000260 "vst1.64 {d0}, [%2] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000261
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000262 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000263
264 : "+r"(src), // %0
265 "+r"(src_stride), // %1
266 "+r"(dst), // %2
267 "+r"(dst_stride), // %3
268 "+r"(width) // %4
269 : "r"(vtbl_4x4_transpose) // %5
270 : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
271 );
272}
273
fbarchard@google.com2430e042011-11-11 21:57:06 +0000274void ReverseRowUV_NEON(const uint8* src,
275 uint8* dst_a, uint8* dst_b,
276 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000277 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000278 // compute where to start writing destination
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000279 "add %1, %3 \n" // dst_a + width
280 "add %2, %3 \n" // dst_b + width
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000281
282 // work on input segments that are multiples of 16, but
283 // width that has been passed is output segments, half
284 // the size of input.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000285 "lsrs r12, %3, #3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000286
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000287 "beq 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000288
289 // the output is written in to two blocks.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000290 "mov r12, #-8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000291
292 // back of destination by the size of the register that is
293 // going to be reversed
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000294 "sub %1, #8 \n"
295 "sub %2, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000296
297 // the loop needs to run on blocks of 8. what will be left
298 // over is either a negative number, the residuals that need
299 // to be done, or 0. if this isn't subtracted off here the
300 // loop will run one extra time.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000301 "sub %3, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000302
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000303 "1: \n"
304 "vld2.8 {d0, d1}, [%0]! \n" // src += 16
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000305
306 // reverse the bytes in the 64 bit segments
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000307 "vrev64.8 q0, q0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000308
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000309 "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
310 "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000311
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000312 "subs %3, #8 \n"
313 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000314
315 // add 8 back to the counter. if the result is 0 there is no
316 // residuals so return
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000317 "adds %3, #8 \n"
318 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000319
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000320 "add %1, #8 \n"
321 "add %2, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000322
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000323 "2: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000324
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000325 "mov r12, #-1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000326
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000327 "sub %1, #1 \n"
328 "sub %2, #1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000329
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000330 "3: \n"
331 "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000332
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000333 "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
334 "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000335
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000336 "subs %3, %3, #1 \n"
337 "bgt 3b \n"
338 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000339 : "+r"(src), // %0
340 "+r"(dst_a), // %1
341 "+r"(dst_b), // %2
342 "+r"(width) // %3
343 :
344 : "memory", "cc", "r12", "q0"
345 );
346}
347
348static const uint8 vtbl_4x4_transpose_di[16] __attribute__((vector_size(16))) =
349 { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
350
351void TransposeUVWx8_NEON(const uint8* src, int src_stride,
352 uint8* dst_a, int dst_stride_a,
353 uint8* dst_b, int dst_stride_b,
354 int width) {
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000355 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000356 // loops are on blocks of 8. loop will stop when
357 // counter gets to or below 0. starting the counter
358 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000359 "sub %6, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000360
361 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000362 "1: \n"
363 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000364
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000365 "vld2.8 {d0, d1}, [r9], %1 \n"
366 "vld2.8 {d2, d3}, [r9], %1 \n"
367 "vld2.8 {d4, d5}, [r9], %1 \n"
368 "vld2.8 {d6, d7}, [r9], %1 \n"
369 "vld2.8 {d16, d17}, [r9], %1 \n"
370 "vld2.8 {d18, d19}, [r9], %1 \n"
371 "vld2.8 {d20, d21}, [r9], %1 \n"
372 "vld2.8 {d22, d23}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000373
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000374 "vtrn.8 q1, q0 \n"
375 "vtrn.8 q3, q2 \n"
376 "vtrn.8 q9, q8 \n"
377 "vtrn.8 q11, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000378
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000379 "vtrn.16 q1, q3 \n"
380 "vtrn.16 q0, q2 \n"
381 "vtrn.16 q9, q11 \n"
382 "vtrn.16 q8, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000383
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000384 "vtrn.32 q1, q9 \n"
385 "vtrn.32 q0, q8 \n"
386 "vtrn.32 q3, q11 \n"
387 "vtrn.32 q2, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000388
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000389 "vrev16.8 q0, q0 \n"
390 "vrev16.8 q1, q1 \n"
391 "vrev16.8 q2, q2 \n"
392 "vrev16.8 q3, q3 \n"
393 "vrev16.8 q8, q8 \n"
394 "vrev16.8 q9, q9 \n"
395 "vrev16.8 q10, q10 \n"
396 "vrev16.8 q11, q11 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000397
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000398 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000399
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000400 "vst1.8 {d2}, [r9], %3 \n"
401 "vst1.8 {d0}, [r9], %3 \n"
402 "vst1.8 {d6}, [r9], %3 \n"
403 "vst1.8 {d4}, [r9], %3 \n"
404 "vst1.8 {d18}, [r9], %3 \n"
405 "vst1.8 {d16}, [r9], %3 \n"
406 "vst1.8 {d22}, [r9], %3 \n"
407 "vst1.8 {d20}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000408
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000409 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000410
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000411 "vst1.8 {d3}, [r9], %5 \n"
412 "vst1.8 {d1}, [r9], %5 \n"
413 "vst1.8 {d7}, [r9], %5 \n"
414 "vst1.8 {d5}, [r9], %5 \n"
415 "vst1.8 {d19}, [r9], %5 \n"
416 "vst1.8 {d17}, [r9], %5 \n"
417 "vst1.8 {d23}, [r9], %5 \n"
418 "vst1.8 {d21}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000419
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000420 "add %0, #8*2 \n" // src += 8*2
421 "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
422 "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
423 "subs %6, #8 \n" // w -= 8
424 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000425
426 // add 8 back to counter. if the result is 0 there are
427 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000428 "adds %6, #8 \n"
429 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000430
431 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000432 "cmp %6, #2 \n"
433 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000434
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000435 "cmp %6, #4 \n"
436 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000437
438 //TODO(frkoenig) : clean this up
439 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000440 "mov r9, %0 \n"
441 "vld1.64 {d0}, [r9], %1 \n"
442 "vld1.64 {d1}, [r9], %1 \n"
443 "vld1.64 {d2}, [r9], %1 \n"
444 "vld1.64 {d3}, [r9], %1 \n"
445 "vld1.64 {d4}, [r9], %1 \n"
446 "vld1.64 {d5}, [r9], %1 \n"
447 "vld1.64 {d6}, [r9], %1 \n"
448 "vld1.64 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000449
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000450 "vld1.8 {q15}, [%7] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000451
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000452 "vtrn.8 q0, q1 \n"
453 "vtrn.8 q2, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000454
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000455 "vtbl.8 d16, {d0, d1}, d30 \n"
456 "vtbl.8 d17, {d0, d1}, d31 \n"
457 "vtbl.8 d18, {d2, d3}, d30 \n"
458 "vtbl.8 d19, {d2, d3}, d31 \n"
459 "vtbl.8 d20, {d4, d5}, d30 \n"
460 "vtbl.8 d21, {d4, d5}, d31 \n"
461 "vtbl.8 d22, {d6, d7}, d30 \n"
462 "vtbl.8 d23, {d6, d7}, d31 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000463
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000464 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000465
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000466 "vst1.32 {d16[0]}, [r9], %3 \n"
467 "vst1.32 {d16[1]}, [r9], %3 \n"
468 "vst1.32 {d17[0]}, [r9], %3 \n"
469 "vst1.32 {d17[1]}, [r9], %3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000470
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000471 "add r9, %2, #4 \n"
472 "vst1.32 {d20[0]}, [r9], %3 \n"
473 "vst1.32 {d20[1]}, [r9], %3 \n"
474 "vst1.32 {d21[0]}, [r9], %3 \n"
475 "vst1.32 {d21[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000476
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000477 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000478
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000479 "vst1.32 {d18[0]}, [r9], %5 \n"
480 "vst1.32 {d18[1]}, [r9], %5 \n"
481 "vst1.32 {d19[0]}, [r9], %5 \n"
482 "vst1.32 {d19[1]}, [r9], %5 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000483
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000484 "add r9, %4, #4 \n"
485 "vst1.32 {d22[0]}, [r9], %5 \n"
486 "vst1.32 {d22[1]}, [r9], %5 \n"
487 "vst1.32 {d23[0]}, [r9], %5 \n"
488 "vst1.32 {d23[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000489
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000490 "add %0, #4*2 \n" // src += 4 * 2
491 "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
492 "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
493 "subs %6, #4 \n" // w -= 4
494 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000495
496 // some residual, check to see if it includes a 2x8 block,
497 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000498 "cmp %6, #2 \n"
499 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000500
501 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000502 "2: \n"
503 "mov r9, %0 \n"
504 "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
505 "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
506 "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
507 "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
508 "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
509 "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
510 "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
511 "vld2.16 {d1[3], d3[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000512
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000513 "vtrn.8 d0, d1 \n"
514 "vtrn.8 d2, d3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000515
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000516 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000517
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000518 "vst1.64 {d0}, [r9], %3 \n"
519 "vst1.64 {d2}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000520
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000521 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000522
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000523 "vst1.64 {d1}, [r9], %5 \n"
524 "vst1.64 {d3}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000525
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000526 "add %0, #2*2 \n" // src += 2 * 2
527 "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
528 "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
529 "subs %6, #2 \n" // w -= 2
530 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000531
532 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000533 "3: \n"
534 "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
535 "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
536 "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
537 "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
538 "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
539 "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
540 "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
541 "vld2.8 {d0[7], d1[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000542
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000543 "vst1.64 {d0}, [%2] \n"
544 "vst1.64 {d1}, [%4] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000545
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000546 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000547
548 : "+r"(src), // %0
549 "+r"(src_stride), // %1
550 "+r"(dst_a), // %2
551 "+r"(dst_stride_a), // %3
552 "+r"(dst_b), // %4
553 "+r"(dst_stride_b), // %5
554 "+r"(width) // %6
555 : "r"(vtbl_4x4_transpose_di)// %7
556 : "memory", "cc", "r9",
557 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
558 );
559}
560#endif
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +0000561
562#ifdef __cplusplus
563} // extern "C"
564} // namespace libyuv
565#endif