/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com142f6c42012-09-18 20:56:51 +000011#include "libyuv/row.h"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000012
fbarchard@google.com17f198c2012-01-04 02:21:05 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000016namespace libyuv {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000017extern "C" {
18#endif
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000019
fbarchard@google.comd2f44132012-04-04 21:53:27 +000020#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000021
fbarchard@google.com5e642d02012-04-21 01:04:46 +000022static const uvec8 kVTbl4x4Transpose =
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000023 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
24
25void TransposeWx8_NEON(const uint8* src, int src_stride,
26 uint8* dst, int dst_stride,
27 int width) {
fbarchard@google.com5b225062012-03-29 02:19:26 +000028 asm volatile (
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +000029 // loops are on blocks of 8. loop will stop when
30 // counter gets to or below 0. starting the counter
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000031 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +000032 "sub %4, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000033
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +000034 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.com5bf29b52012-05-02 00:10:16 +000035 ".p2align 4 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +000036 "1: \n"
37 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000038
fbarchard@google.comf7a50482011-11-10 22:41:20 +000039 "vld1.8 {d0}, [r9], %1 \n"
40 "vld1.8 {d1}, [r9], %1 \n"
41 "vld1.8 {d2}, [r9], %1 \n"
42 "vld1.8 {d3}, [r9], %1 \n"
43 "vld1.8 {d4}, [r9], %1 \n"
44 "vld1.8 {d5}, [r9], %1 \n"
45 "vld1.8 {d6}, [r9], %1 \n"
46 "vld1.8 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000047
fbarchard@google.comf7a50482011-11-10 22:41:20 +000048 "vtrn.8 d1, d0 \n"
49 "vtrn.8 d3, d2 \n"
50 "vtrn.8 d5, d4 \n"
51 "vtrn.8 d7, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000052
fbarchard@google.comf7a50482011-11-10 22:41:20 +000053 "vtrn.16 d1, d3 \n"
54 "vtrn.16 d0, d2 \n"
55 "vtrn.16 d5, d7 \n"
56 "vtrn.16 d4, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000057
fbarchard@google.comf7a50482011-11-10 22:41:20 +000058 "vtrn.32 d1, d5 \n"
59 "vtrn.32 d0, d4 \n"
60 "vtrn.32 d3, d7 \n"
61 "vtrn.32 d2, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000062
fbarchard@google.comf7a50482011-11-10 22:41:20 +000063 "vrev16.8 q0, q0 \n"
64 "vrev16.8 q1, q1 \n"
65 "vrev16.8 q2, q2 \n"
66 "vrev16.8 q3, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000067
fbarchard@google.comf7a50482011-11-10 22:41:20 +000068 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000069
fbarchard@google.comf7a50482011-11-10 22:41:20 +000070 "vst1.8 {d1}, [r9], %3 \n"
71 "vst1.8 {d0}, [r9], %3 \n"
72 "vst1.8 {d3}, [r9], %3 \n"
73 "vst1.8 {d2}, [r9], %3 \n"
74 "vst1.8 {d5}, [r9], %3 \n"
75 "vst1.8 {d4}, [r9], %3 \n"
76 "vst1.8 {d7}, [r9], %3 \n"
77 "vst1.8 {d6}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000078
fbarchard@google.comf7a50482011-11-10 22:41:20 +000079 "add %0, #8 \n" // src += 8
80 "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
81 "subs %4, #8 \n" // w -= 8
82 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000083
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +000084 // add 8 back to counter. if the result is 0 there are
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000085 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000086 "adds %4, #8 \n"
87 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000088
89 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +000090 "cmp %4, #2 \n"
91 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000092
fbarchard@google.comf7a50482011-11-10 22:41:20 +000093 "cmp %4, #4 \n"
94 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000095
96 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +000097 "mov r9, %0 \n"
98 "vld1.32 {d0[0]}, [r9], %1 \n"
99 "vld1.32 {d0[1]}, [r9], %1 \n"
100 "vld1.32 {d1[0]}, [r9], %1 \n"
101 "vld1.32 {d1[1]}, [r9], %1 \n"
102 "vld1.32 {d2[0]}, [r9], %1 \n"
103 "vld1.32 {d2[1]}, [r9], %1 \n"
104 "vld1.32 {d3[0]}, [r9], %1 \n"
105 "vld1.32 {d3[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000106
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000107 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000108
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000109 "vld1.8 {q3}, [%5] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000110
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000111 "vtbl.8 d4, {d0, d1}, d6 \n"
112 "vtbl.8 d5, {d0, d1}, d7 \n"
113 "vtbl.8 d0, {d2, d3}, d6 \n"
114 "vtbl.8 d1, {d2, d3}, d7 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000115
fbarchard@google.com1d160cb2012-11-28 20:02:55 +0000116 // TODO(frkoenig): Rework shuffle above to
117 // write out with 4 instead of 8 writes.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000118 "vst1.32 {d4[0]}, [r9], %3 \n"
119 "vst1.32 {d4[1]}, [r9], %3 \n"
120 "vst1.32 {d5[0]}, [r9], %3 \n"
121 "vst1.32 {d5[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000122
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000123 "add r9, %2, #4 \n"
124 "vst1.32 {d0[0]}, [r9], %3 \n"
125 "vst1.32 {d0[1]}, [r9], %3 \n"
126 "vst1.32 {d1[0]}, [r9], %3 \n"
127 "vst1.32 {d1[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000128
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000129 "add %0, #4 \n" // src += 4
130 "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
131 "subs %4, #4 \n" // w -= 4
132 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000133
134 // some residual, check to see if it includes a 2x8 block,
135 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000136 "cmp %4, #2 \n"
137 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000138
139 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000140 "2: \n"
141 "mov r9, %0 \n"
142 "vld1.16 {d0[0]}, [r9], %1 \n"
143 "vld1.16 {d1[0]}, [r9], %1 \n"
144 "vld1.16 {d0[1]}, [r9], %1 \n"
145 "vld1.16 {d1[1]}, [r9], %1 \n"
146 "vld1.16 {d0[2]}, [r9], %1 \n"
147 "vld1.16 {d1[2]}, [r9], %1 \n"
148 "vld1.16 {d0[3]}, [r9], %1 \n"
149 "vld1.16 {d1[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000150
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000151 "vtrn.8 d0, d1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000152
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000153 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000154
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000155 "vst1.64 {d0}, [r9], %3 \n"
156 "vst1.64 {d1}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000157
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000158 "add %0, #2 \n" // src += 2
159 "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
160 "subs %4, #2 \n" // w -= 2
161 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000162
163 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000164 "3: \n"
165 "vld1.8 {d0[0]}, [%0], %1 \n"
166 "vld1.8 {d0[1]}, [%0], %1 \n"
167 "vld1.8 {d0[2]}, [%0], %1 \n"
168 "vld1.8 {d0[3]}, [%0], %1 \n"
169 "vld1.8 {d0[4]}, [%0], %1 \n"
170 "vld1.8 {d0[5]}, [%0], %1 \n"
171 "vld1.8 {d0[6]}, [%0], %1 \n"
172 "vld1.8 {d0[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000173
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000174 "vst1.64 {d0}, [%2] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000175
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000176 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000177
fbarchard@google.comc4284702012-04-22 15:00:50 +0000178 : "+r"(src), // %0
179 "+r"(src_stride), // %1
180 "+r"(dst), // %2
181 "+r"(dst_stride), // %3
182 "+r"(width) // %4
183 : "r"(&kVTbl4x4Transpose) // %5
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000184 : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
185 );
186}
187
fbarchard@google.com5e642d02012-04-21 01:04:46 +0000188static const uvec8 kVTbl4x4TransposeDi =
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000189 { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
190
191void TransposeUVWx8_NEON(const uint8* src, int src_stride,
192 uint8* dst_a, int dst_stride_a,
193 uint8* dst_b, int dst_stride_b,
194 int width) {
fbarchard@google.com5b225062012-03-29 02:19:26 +0000195 asm volatile (
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000196 // loops are on blocks of 8. loop will stop when
197 // counter gets to or below 0. starting the counter
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000198 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000199 "sub %6, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000200
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000201 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.com5bf29b52012-05-02 00:10:16 +0000202 ".p2align 4 \n"
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000203 "1: \n"
204 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000205
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000206 "vld2.8 {d0, d1}, [r9], %1 \n"
207 "vld2.8 {d2, d3}, [r9], %1 \n"
208 "vld2.8 {d4, d5}, [r9], %1 \n"
209 "vld2.8 {d6, d7}, [r9], %1 \n"
210 "vld2.8 {d16, d17}, [r9], %1 \n"
211 "vld2.8 {d18, d19}, [r9], %1 \n"
212 "vld2.8 {d20, d21}, [r9], %1 \n"
213 "vld2.8 {d22, d23}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000214
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000215 "vtrn.8 q1, q0 \n"
216 "vtrn.8 q3, q2 \n"
217 "vtrn.8 q9, q8 \n"
218 "vtrn.8 q11, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000219
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000220 "vtrn.16 q1, q3 \n"
221 "vtrn.16 q0, q2 \n"
222 "vtrn.16 q9, q11 \n"
223 "vtrn.16 q8, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000224
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000225 "vtrn.32 q1, q9 \n"
226 "vtrn.32 q0, q8 \n"
227 "vtrn.32 q3, q11 \n"
228 "vtrn.32 q2, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000229
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000230 "vrev16.8 q0, q0 \n"
231 "vrev16.8 q1, q1 \n"
232 "vrev16.8 q2, q2 \n"
233 "vrev16.8 q3, q3 \n"
234 "vrev16.8 q8, q8 \n"
235 "vrev16.8 q9, q9 \n"
236 "vrev16.8 q10, q10 \n"
237 "vrev16.8 q11, q11 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000238
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000239 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000240
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000241 "vst1.8 {d2}, [r9], %3 \n"
242 "vst1.8 {d0}, [r9], %3 \n"
243 "vst1.8 {d6}, [r9], %3 \n"
244 "vst1.8 {d4}, [r9], %3 \n"
245 "vst1.8 {d18}, [r9], %3 \n"
246 "vst1.8 {d16}, [r9], %3 \n"
247 "vst1.8 {d22}, [r9], %3 \n"
248 "vst1.8 {d20}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000249
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000250 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000251
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000252 "vst1.8 {d3}, [r9], %5 \n"
253 "vst1.8 {d1}, [r9], %5 \n"
254 "vst1.8 {d7}, [r9], %5 \n"
255 "vst1.8 {d5}, [r9], %5 \n"
256 "vst1.8 {d19}, [r9], %5 \n"
257 "vst1.8 {d17}, [r9], %5 \n"
258 "vst1.8 {d23}, [r9], %5 \n"
259 "vst1.8 {d21}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000260
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000261 "add %0, #8*2 \n" // src += 8*2
262 "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
263 "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
264 "subs %6, #8 \n" // w -= 8
265 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000266
fbarchard@google.com64ce0ab2012-10-09 00:05:29 +0000267 // add 8 back to counter. if the result is 0 there are
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000268 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000269 "adds %6, #8 \n"
270 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000271
272 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000273 "cmp %6, #2 \n"
274 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000275
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000276 "cmp %6, #4 \n"
277 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000278
fbarchard@google.com1d160cb2012-11-28 20:02:55 +0000279 //TODO(frkoenig): Clean this up
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000280 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000281 "mov r9, %0 \n"
282 "vld1.64 {d0}, [r9], %1 \n"
283 "vld1.64 {d1}, [r9], %1 \n"
284 "vld1.64 {d2}, [r9], %1 \n"
285 "vld1.64 {d3}, [r9], %1 \n"
286 "vld1.64 {d4}, [r9], %1 \n"
287 "vld1.64 {d5}, [r9], %1 \n"
288 "vld1.64 {d6}, [r9], %1 \n"
289 "vld1.64 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000290
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000291 "vld1.8 {q15}, [%7] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000292
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000293 "vtrn.8 q0, q1 \n"
294 "vtrn.8 q2, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000295
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000296 "vtbl.8 d16, {d0, d1}, d30 \n"
297 "vtbl.8 d17, {d0, d1}, d31 \n"
298 "vtbl.8 d18, {d2, d3}, d30 \n"
299 "vtbl.8 d19, {d2, d3}, d31 \n"
300 "vtbl.8 d20, {d4, d5}, d30 \n"
301 "vtbl.8 d21, {d4, d5}, d31 \n"
302 "vtbl.8 d22, {d6, d7}, d30 \n"
303 "vtbl.8 d23, {d6, d7}, d31 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000304
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000305 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000306
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000307 "vst1.32 {d16[0]}, [r9], %3 \n"
308 "vst1.32 {d16[1]}, [r9], %3 \n"
309 "vst1.32 {d17[0]}, [r9], %3 \n"
310 "vst1.32 {d17[1]}, [r9], %3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000311
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000312 "add r9, %2, #4 \n"
313 "vst1.32 {d20[0]}, [r9], %3 \n"
314 "vst1.32 {d20[1]}, [r9], %3 \n"
315 "vst1.32 {d21[0]}, [r9], %3 \n"
316 "vst1.32 {d21[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000317
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000318 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000319
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000320 "vst1.32 {d18[0]}, [r9], %5 \n"
321 "vst1.32 {d18[1]}, [r9], %5 \n"
322 "vst1.32 {d19[0]}, [r9], %5 \n"
323 "vst1.32 {d19[1]}, [r9], %5 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000324
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000325 "add r9, %4, #4 \n"
326 "vst1.32 {d22[0]}, [r9], %5 \n"
327 "vst1.32 {d22[1]}, [r9], %5 \n"
328 "vst1.32 {d23[0]}, [r9], %5 \n"
329 "vst1.32 {d23[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000330
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000331 "add %0, #4*2 \n" // src += 4 * 2
332 "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
333 "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
334 "subs %6, #4 \n" // w -= 4
335 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000336
337 // some residual, check to see if it includes a 2x8 block,
338 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000339 "cmp %6, #2 \n"
340 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000341
342 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000343 "2: \n"
344 "mov r9, %0 \n"
345 "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
346 "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
347 "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
348 "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
349 "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
350 "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
351 "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
352 "vld2.16 {d1[3], d3[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000353
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000354 "vtrn.8 d0, d1 \n"
355 "vtrn.8 d2, d3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000356
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000357 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000358
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000359 "vst1.64 {d0}, [r9], %3 \n"
360 "vst1.64 {d2}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000361
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000362 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000363
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000364 "vst1.64 {d1}, [r9], %5 \n"
365 "vst1.64 {d3}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000366
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000367 "add %0, #2*2 \n" // src += 2 * 2
368 "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
369 "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
370 "subs %6, #2 \n" // w -= 2
371 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000372
373 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000374 "3: \n"
375 "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
376 "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
377 "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
378 "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
379 "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
380 "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
381 "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
382 "vld2.8 {d0[7], d1[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000383
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000384 "vst1.64 {d0}, [%2] \n"
385 "vst1.64 {d1}, [%4] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000386
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000387 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000388
fbarchard@google.com5e642d02012-04-21 01:04:46 +0000389 : "+r"(src), // %0
390 "+r"(src_stride), // %1
391 "+r"(dst_a), // %2
392 "+r"(dst_stride_a), // %3
393 "+r"(dst_b), // %4
394 "+r"(dst_stride_b), // %5
395 "+r"(width) // %6
396 : "r"(&kVTbl4x4TransposeDi) // %7
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000397 : "memory", "cc", "r9",
398 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
399 );
400}
401#endif
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +0000402
403#ifdef __cplusplus
404} // extern "C"
405} // namespace libyuv
406#endif