blob: 7ff99361729bb36caf48eb303d0e672b29cf626d [file] [log] [blame]
/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
fbarchard@google.com2b9c2102012-03-22 22:36:44 +000011#include "source/row.h"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000012
fbarchard@google.com17f198c2012-01-04 02:21:05 +000013#include "libyuv/basic_types.h"
14
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000015#ifdef __cplusplus
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000016namespace libyuv {
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +000017extern "C" {
18#endif
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000019
fbarchard@google.comd2f44132012-04-04 21:53:27 +000020#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000021
fbarchard@google.com8536b2f2012-02-21 21:02:54 +000022static const uvec8 vtbl_4x4_transpose =
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000023 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
24
25void TransposeWx8_NEON(const uint8* src, int src_stride,
26 uint8* dst, int dst_stride,
27 int width) {
fbarchard@google.com5b225062012-03-29 02:19:26 +000028 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000029 // loops are on blocks of 8. loop will stop when
30 // counter gets to or below 0. starting the counter
31 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +000032 "sub %4, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000033
34 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.comf7a50482011-11-10 22:41:20 +000035 "1: \n"
36 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000037
fbarchard@google.comf7a50482011-11-10 22:41:20 +000038 "vld1.8 {d0}, [r9], %1 \n"
39 "vld1.8 {d1}, [r9], %1 \n"
40 "vld1.8 {d2}, [r9], %1 \n"
41 "vld1.8 {d3}, [r9], %1 \n"
42 "vld1.8 {d4}, [r9], %1 \n"
43 "vld1.8 {d5}, [r9], %1 \n"
44 "vld1.8 {d6}, [r9], %1 \n"
45 "vld1.8 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000046
fbarchard@google.comf7a50482011-11-10 22:41:20 +000047 "vtrn.8 d1, d0 \n"
48 "vtrn.8 d3, d2 \n"
49 "vtrn.8 d5, d4 \n"
50 "vtrn.8 d7, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000051
fbarchard@google.comf7a50482011-11-10 22:41:20 +000052 "vtrn.16 d1, d3 \n"
53 "vtrn.16 d0, d2 \n"
54 "vtrn.16 d5, d7 \n"
55 "vtrn.16 d4, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000056
fbarchard@google.comf7a50482011-11-10 22:41:20 +000057 "vtrn.32 d1, d5 \n"
58 "vtrn.32 d0, d4 \n"
59 "vtrn.32 d3, d7 \n"
60 "vtrn.32 d2, d6 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000061
fbarchard@google.comf7a50482011-11-10 22:41:20 +000062 "vrev16.8 q0, q0 \n"
63 "vrev16.8 q1, q1 \n"
64 "vrev16.8 q2, q2 \n"
65 "vrev16.8 q3, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000066
fbarchard@google.comf7a50482011-11-10 22:41:20 +000067 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000068
fbarchard@google.comf7a50482011-11-10 22:41:20 +000069 "vst1.8 {d1}, [r9], %3 \n"
70 "vst1.8 {d0}, [r9], %3 \n"
71 "vst1.8 {d3}, [r9], %3 \n"
72 "vst1.8 {d2}, [r9], %3 \n"
73 "vst1.8 {d5}, [r9], %3 \n"
74 "vst1.8 {d4}, [r9], %3 \n"
75 "vst1.8 {d7}, [r9], %3 \n"
76 "vst1.8 {d6}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000077
fbarchard@google.comf7a50482011-11-10 22:41:20 +000078 "add %0, #8 \n" // src += 8
79 "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
80 "subs %4, #8 \n" // w -= 8
81 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000082
83 // add 8 back to counter. if the result is 0 there are
84 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +000085 "adds %4, #8 \n"
86 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000087
88 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +000089 "cmp %4, #2 \n"
90 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000091
fbarchard@google.comf7a50482011-11-10 22:41:20 +000092 "cmp %4, #4 \n"
93 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +000094
95 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +000096 "mov r9, %0 \n"
97 "vld1.32 {d0[0]}, [r9], %1 \n"
98 "vld1.32 {d0[1]}, [r9], %1 \n"
99 "vld1.32 {d1[0]}, [r9], %1 \n"
100 "vld1.32 {d1[1]}, [r9], %1 \n"
101 "vld1.32 {d2[0]}, [r9], %1 \n"
102 "vld1.32 {d2[1]}, [r9], %1 \n"
103 "vld1.32 {d3[0]}, [r9], %1 \n"
104 "vld1.32 {d3[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000105
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000106 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000107
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000108 "vld1.8 {q3}, [%5] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000109
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000110 "vtbl.8 d4, {d0, d1}, d6 \n"
111 "vtbl.8 d5, {d0, d1}, d7 \n"
112 "vtbl.8 d0, {d2, d3}, d6 \n"
113 "vtbl.8 d1, {d2, d3}, d7 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000114
115 // TODO: rework shuffle above to write
116 // out with 4 instead of 8 writes
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000117 "vst1.32 {d4[0]}, [r9], %3 \n"
118 "vst1.32 {d4[1]}, [r9], %3 \n"
119 "vst1.32 {d5[0]}, [r9], %3 \n"
120 "vst1.32 {d5[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000121
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000122 "add r9, %2, #4 \n"
123 "vst1.32 {d0[0]}, [r9], %3 \n"
124 "vst1.32 {d0[1]}, [r9], %3 \n"
125 "vst1.32 {d1[0]}, [r9], %3 \n"
126 "vst1.32 {d1[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000127
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000128 "add %0, #4 \n" // src += 4
129 "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
130 "subs %4, #4 \n" // w -= 4
131 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000132
133 // some residual, check to see if it includes a 2x8 block,
134 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000135 "cmp %4, #2 \n"
136 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000137
138 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000139 "2: \n"
140 "mov r9, %0 \n"
141 "vld1.16 {d0[0]}, [r9], %1 \n"
142 "vld1.16 {d1[0]}, [r9], %1 \n"
143 "vld1.16 {d0[1]}, [r9], %1 \n"
144 "vld1.16 {d1[1]}, [r9], %1 \n"
145 "vld1.16 {d0[2]}, [r9], %1 \n"
146 "vld1.16 {d1[2]}, [r9], %1 \n"
147 "vld1.16 {d0[3]}, [r9], %1 \n"
148 "vld1.16 {d1[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000149
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000150 "vtrn.8 d0, d1 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000151
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000152 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000153
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000154 "vst1.64 {d0}, [r9], %3 \n"
155 "vst1.64 {d1}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000156
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000157 "add %0, #2 \n" // src += 2
158 "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
159 "subs %4, #2 \n" // w -= 2
160 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000161
162 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000163 "3: \n"
164 "vld1.8 {d0[0]}, [%0], %1 \n"
165 "vld1.8 {d0[1]}, [%0], %1 \n"
166 "vld1.8 {d0[2]}, [%0], %1 \n"
167 "vld1.8 {d0[3]}, [%0], %1 \n"
168 "vld1.8 {d0[4]}, [%0], %1 \n"
169 "vld1.8 {d0[5]}, [%0], %1 \n"
170 "vld1.8 {d0[6]}, [%0], %1 \n"
171 "vld1.8 {d0[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000172
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000173 "vst1.64 {d0}, [%2] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000174
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000175 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000176
177 : "+r"(src), // %0
178 "+r"(src_stride), // %1
179 "+r"(dst), // %2
180 "+r"(dst_stride), // %3
181 "+r"(width) // %4
182 : "r"(vtbl_4x4_transpose) // %5
183 : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
184 );
185}
186
fbarchard@google.com8536b2f2012-02-21 21:02:54 +0000187static const uvec8 vtbl_4x4_transpose_di =
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000188 { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
189
190void TransposeUVWx8_NEON(const uint8* src, int src_stride,
191 uint8* dst_a, int dst_stride_a,
192 uint8* dst_b, int dst_stride_b,
193 int width) {
fbarchard@google.com5b225062012-03-29 02:19:26 +0000194 asm volatile (
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000195 // loops are on blocks of 8. loop will stop when
196 // counter gets to or below 0. starting the counter
197 // at w-8 allow for this
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000198 "sub %6, #8 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000199
200 // handle 8x8 blocks. this should be the majority of the plane
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000201 "1: \n"
202 "mov r9, %0 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000203
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000204 "vld2.8 {d0, d1}, [r9], %1 \n"
205 "vld2.8 {d2, d3}, [r9], %1 \n"
206 "vld2.8 {d4, d5}, [r9], %1 \n"
207 "vld2.8 {d6, d7}, [r9], %1 \n"
208 "vld2.8 {d16, d17}, [r9], %1 \n"
209 "vld2.8 {d18, d19}, [r9], %1 \n"
210 "vld2.8 {d20, d21}, [r9], %1 \n"
211 "vld2.8 {d22, d23}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000212
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000213 "vtrn.8 q1, q0 \n"
214 "vtrn.8 q3, q2 \n"
215 "vtrn.8 q9, q8 \n"
216 "vtrn.8 q11, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000217
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000218 "vtrn.16 q1, q3 \n"
219 "vtrn.16 q0, q2 \n"
220 "vtrn.16 q9, q11 \n"
221 "vtrn.16 q8, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000222
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000223 "vtrn.32 q1, q9 \n"
224 "vtrn.32 q0, q8 \n"
225 "vtrn.32 q3, q11 \n"
226 "vtrn.32 q2, q10 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000227
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000228 "vrev16.8 q0, q0 \n"
229 "vrev16.8 q1, q1 \n"
230 "vrev16.8 q2, q2 \n"
231 "vrev16.8 q3, q3 \n"
232 "vrev16.8 q8, q8 \n"
233 "vrev16.8 q9, q9 \n"
234 "vrev16.8 q10, q10 \n"
235 "vrev16.8 q11, q11 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000236
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000237 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000238
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000239 "vst1.8 {d2}, [r9], %3 \n"
240 "vst1.8 {d0}, [r9], %3 \n"
241 "vst1.8 {d6}, [r9], %3 \n"
242 "vst1.8 {d4}, [r9], %3 \n"
243 "vst1.8 {d18}, [r9], %3 \n"
244 "vst1.8 {d16}, [r9], %3 \n"
245 "vst1.8 {d22}, [r9], %3 \n"
246 "vst1.8 {d20}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000247
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000248 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000249
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000250 "vst1.8 {d3}, [r9], %5 \n"
251 "vst1.8 {d1}, [r9], %5 \n"
252 "vst1.8 {d7}, [r9], %5 \n"
253 "vst1.8 {d5}, [r9], %5 \n"
254 "vst1.8 {d19}, [r9], %5 \n"
255 "vst1.8 {d17}, [r9], %5 \n"
256 "vst1.8 {d23}, [r9], %5 \n"
257 "vst1.8 {d21}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000258
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000259 "add %0, #8*2 \n" // src += 8*2
260 "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
261 "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
262 "subs %6, #8 \n" // w -= 8
263 "bge 1b \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000264
265 // add 8 back to counter. if the result is 0 there are
266 // no residuals.
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000267 "adds %6, #8 \n"
268 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000269
270 // some residual, so between 1 and 7 lines left to transpose
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000271 "cmp %6, #2 \n"
272 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000273
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000274 "cmp %6, #4 \n"
275 "blt 2f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000276
277 //TODO(frkoenig) : clean this up
278 // 4x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000279 "mov r9, %0 \n"
280 "vld1.64 {d0}, [r9], %1 \n"
281 "vld1.64 {d1}, [r9], %1 \n"
282 "vld1.64 {d2}, [r9], %1 \n"
283 "vld1.64 {d3}, [r9], %1 \n"
284 "vld1.64 {d4}, [r9], %1 \n"
285 "vld1.64 {d5}, [r9], %1 \n"
286 "vld1.64 {d6}, [r9], %1 \n"
287 "vld1.64 {d7}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000288
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000289 "vld1.8 {q15}, [%7] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000290
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000291 "vtrn.8 q0, q1 \n"
292 "vtrn.8 q2, q3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000293
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000294 "vtbl.8 d16, {d0, d1}, d30 \n"
295 "vtbl.8 d17, {d0, d1}, d31 \n"
296 "vtbl.8 d18, {d2, d3}, d30 \n"
297 "vtbl.8 d19, {d2, d3}, d31 \n"
298 "vtbl.8 d20, {d4, d5}, d30 \n"
299 "vtbl.8 d21, {d4, d5}, d31 \n"
300 "vtbl.8 d22, {d6, d7}, d30 \n"
301 "vtbl.8 d23, {d6, d7}, d31 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000302
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000303 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000304
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000305 "vst1.32 {d16[0]}, [r9], %3 \n"
306 "vst1.32 {d16[1]}, [r9], %3 \n"
307 "vst1.32 {d17[0]}, [r9], %3 \n"
308 "vst1.32 {d17[1]}, [r9], %3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000309
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000310 "add r9, %2, #4 \n"
311 "vst1.32 {d20[0]}, [r9], %3 \n"
312 "vst1.32 {d20[1]}, [r9], %3 \n"
313 "vst1.32 {d21[0]}, [r9], %3 \n"
314 "vst1.32 {d21[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000315
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000316 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000317
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000318 "vst1.32 {d18[0]}, [r9], %5 \n"
319 "vst1.32 {d18[1]}, [r9], %5 \n"
320 "vst1.32 {d19[0]}, [r9], %5 \n"
321 "vst1.32 {d19[1]}, [r9], %5 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000322
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000323 "add r9, %4, #4 \n"
324 "vst1.32 {d22[0]}, [r9], %5 \n"
325 "vst1.32 {d22[1]}, [r9], %5 \n"
326 "vst1.32 {d23[0]}, [r9], %5 \n"
327 "vst1.32 {d23[1]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000328
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000329 "add %0, #4*2 \n" // src += 4 * 2
330 "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
331 "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
332 "subs %6, #4 \n" // w -= 4
333 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000334
335 // some residual, check to see if it includes a 2x8 block,
336 // or less
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000337 "cmp %6, #2 \n"
338 "blt 3f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000339
340 // 2x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000341 "2: \n"
342 "mov r9, %0 \n"
343 "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
344 "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
345 "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
346 "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
347 "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
348 "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
349 "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
350 "vld2.16 {d1[3], d3[3]}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000351
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000352 "vtrn.8 d0, d1 \n"
353 "vtrn.8 d2, d3 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000354
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000355 "mov r9, %2 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000356
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000357 "vst1.64 {d0}, [r9], %3 \n"
358 "vst1.64 {d2}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000359
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000360 "mov r9, %4 \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000361
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000362 "vst1.64 {d1}, [r9], %5 \n"
363 "vst1.64 {d3}, [r9] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000364
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000365 "add %0, #2*2 \n" // src += 2 * 2
366 "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
367 "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
368 "subs %6, #2 \n" // w -= 2
369 "beq 4f \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000370
371 // 1x8 block
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000372 "3: \n"
373 "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
374 "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
375 "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
376 "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
377 "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
378 "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
379 "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
380 "vld2.8 {d0[7], d1[7]}, [%0] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000381
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000382 "vst1.64 {d0}, [%2] \n"
383 "vst1.64 {d1}, [%4] \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000384
fbarchard@google.comf7a50482011-11-10 22:41:20 +0000385 "4: \n"
frkoenig@google.comf7e74a12011-11-03 22:41:59 +0000386
387 : "+r"(src), // %0
388 "+r"(src_stride), // %1
389 "+r"(dst_a), // %2
390 "+r"(dst_stride_a), // %3
391 "+r"(dst_b), // %4
392 "+r"(dst_stride_b), // %5
393 "+r"(width) // %6
394 : "r"(vtbl_4x4_transpose_di)// %7
395 : "memory", "cc", "r9",
396 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
397 );
398}
399#endif
fbarchard@google.comfe5ff7e2011-12-10 07:45:58 +0000400
401#ifdef __cplusplus
402} // extern "C"
403} // namespace libyuv
404#endif