/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

namespace libyuv {

#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)

void ReverseLine_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile(
    // compute where to start writing the destination
    "add %1, %2\n"

    // work on segments that are multiples of 16
    "lsrs r3, %2, #4\n"

    // the output is written in two blocks: 8 bytes followed
    // by another 8. reading is done sequentially, from left to
    // right. writing is done from right to left in 8-byte blocks.
    // %1, the destination pointer, is incremented after writing
    // the first of the two blocks, so subtract that 8 off
    // along with 16 to get the next location.
    "mov r3, #-24\n"

    "beq 2f\n"

    // back off the destination by the size of the register that
    // is going to be reversed
    "sub %1, #16\n"

    // the loop needs to run on blocks of 16. what will be left
    // over is either a negative number, the residuals that need
    // to be done, or 0. if this isn't subtracted off here the
    // loop will run one extra time.
    "sub %2, #16\n"

    "1:\n"
    "vld1.8 {q0}, [%0]!\n"  // src += 16

    // reverse the bytes in the 64 bit segments. unable to reverse
    // the bytes in the entire 128 bits in one go.
    "vrev64.8 q0, q0\n"

    // because the entire 128 bits cannot be reversed in one go,
    // reverse the order in which the two 64 bit segments are written.
    "vst1.8 {d1}, [%1]!\n"
    "vst1.8 {d0}, [%1], r3\n"  // dst -= 16

    "subs %2, #16\n"
    "bge 1b\n"

    // add 16 back to the counter. if the result is 0 there are no
    // residuals, so jump past them.
    "adds %2, #16\n"
    "beq 5f\n"

    "add %1, #16\n"

    "2:\n"

    "mov r3, #-3\n"

    "sub %1, #2\n"
    "subs %2, #2\n"
    // check for 16*n+1 scenarios where segments_of_2 should not
    // be run, but there is something left over.
    "blt 4f\n"

    // do this in neon registers as per
    // http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
    "3:\n"
    "vld2.8 {d0[0], d1[0]}, [%0]!\n"  // src += 2

    "vst1.8 {d1[0]}, [%1]!\n"
    "vst1.8 {d0[0]}, [%1], r3\n"  // dst -= 2

    "subs %2, #2\n"
    "bge 3b\n"

    "adds %2, #2\n"
    "beq 5f\n"

    "4:\n"
    "add %1, #1\n"
    "vld1.8 {d0[0]}, [%0]\n"
    "vst1.8 {d0[0]}, [%1]\n"

    "5:\n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    :
    : "memory", "cc", "r3", "q0"
  );
}

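// A scalar sketch of what ReverseLine_NEON computes, for reference only; the
// name below is illustrative and not part of libyuv's API. Each output byte
// dst[width - 1 - i] is the input byte src[i], i.e. the line is mirrored.
static void ReverseLine_C_Sketch(const uint8* src, uint8* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[width - 1 - i] = src[i];
  }
}
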
static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) =
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
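// When used with vtbl, the table above treats the 16 bytes of a d-register
// pair as a row-major 4x4 block (four 32-bit rows) and gathers its columns,
// producing a 4x4 byte transpose.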

void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  asm volatile(
    // loops are on blocks of 8. the loop will stop when the
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this.
    "sub %4, #8\n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:\n"
    "mov r9, %0\n"

    "vld1.8 {d0}, [r9], %1\n"
    "vld1.8 {d1}, [r9], %1\n"
    "vld1.8 {d2}, [r9], %1\n"
    "vld1.8 {d3}, [r9], %1\n"
    "vld1.8 {d4}, [r9], %1\n"
    "vld1.8 {d5}, [r9], %1\n"
    "vld1.8 {d6}, [r9], %1\n"
    "vld1.8 {d7}, [r9]\n"

    "vtrn.8 d1, d0\n"
    "vtrn.8 d3, d2\n"
    "vtrn.8 d5, d4\n"
    "vtrn.8 d7, d6\n"

    "vtrn.16 d1, d3\n"
    "vtrn.16 d0, d2\n"
    "vtrn.16 d5, d7\n"
    "vtrn.16 d4, d6\n"

    "vtrn.32 d1, d5\n"
    "vtrn.32 d0, d4\n"
    "vtrn.32 d3, d7\n"
    "vtrn.32 d2, d6\n"

    "vrev16.8 q0, q0\n"
    "vrev16.8 q1, q1\n"
    "vrev16.8 q2, q2\n"
    "vrev16.8 q3, q3\n"

    "mov r9, %2\n"

    "vst1.8 {d1}, [r9], %3\n"
    "vst1.8 {d0}, [r9], %3\n"
    "vst1.8 {d3}, [r9], %3\n"
    "vst1.8 {d2}, [r9], %3\n"
    "vst1.8 {d5}, [r9], %3\n"
    "vst1.8 {d4}, [r9], %3\n"
    "vst1.8 {d7}, [r9], %3\n"
    "vst1.8 {d6}, [r9]\n"

    "add %0, #8\n"          // src += 8
    "add %2, %3, lsl #3\n"  // dst += 8 * dst_stride
    "subs %4, #8\n"         // w -= 8
    "bge 1b\n"

    // add 8 back to the counter. if the result is 0 there are
    // no residuals.
    "adds %4, #8\n"
    "beq 4f\n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp %4, #2\n"
    "blt 3f\n"

    "cmp %4, #4\n"
    "blt 2f\n"

    // 4x8 block
    "mov r9, %0\n"
    "vld1.32 {d0[0]}, [r9], %1\n"
    "vld1.32 {d0[1]}, [r9], %1\n"
    "vld1.32 {d1[0]}, [r9], %1\n"
    "vld1.32 {d1[1]}, [r9], %1\n"
    "vld1.32 {d2[0]}, [r9], %1\n"
    "vld1.32 {d2[1]}, [r9], %1\n"
    "vld1.32 {d3[0]}, [r9], %1\n"
    "vld1.32 {d3[1]}, [r9]\n"

    "mov r9, %2\n"

    "vld1.8 {q3}, [%5]\n"

    "vtbl.8 d4, {d0, d1}, d6\n"
    "vtbl.8 d5, {d0, d1}, d7\n"
    "vtbl.8 d0, {d2, d3}, d6\n"
    "vtbl.8 d1, {d2, d3}, d7\n"

    // TODO: rework the shuffle above to write
    // out with 4 instead of 8 writes.
    "vst1.32 {d4[0]}, [r9], %3\n"
    "vst1.32 {d4[1]}, [r9], %3\n"
    "vst1.32 {d5[0]}, [r9], %3\n"
    "vst1.32 {d5[1]}, [r9]\n"

    "add r9, %2, #4\n"
    "vst1.32 {d0[0]}, [r9], %3\n"
    "vst1.32 {d0[1]}, [r9], %3\n"
    "vst1.32 {d1[0]}, [r9], %3\n"
    "vst1.32 {d1[1]}, [r9]\n"

    "add %0, #4\n"          // src += 4
    "add %2, %3, lsl #2\n"  // dst += 4 * dst_stride
    "subs %4, #4\n"         // w -= 4
    "beq 4f\n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp %4, #2\n"
    "blt 3f\n"

    // 2x8 block
    "2:\n"
    "mov r9, %0\n"
    "vld1.16 {d0[0]}, [r9], %1\n"
    "vld1.16 {d1[0]}, [r9], %1\n"
    "vld1.16 {d0[1]}, [r9], %1\n"
    "vld1.16 {d1[1]}, [r9], %1\n"
    "vld1.16 {d0[2]}, [r9], %1\n"
    "vld1.16 {d1[2]}, [r9], %1\n"
    "vld1.16 {d0[3]}, [r9], %1\n"
    "vld1.16 {d1[3]}, [r9]\n"

    "vtrn.8 d0, d1\n"

    "mov r9, %2\n"

    "vst1.64 {d0}, [r9], %3\n"
    "vst1.64 {d1}, [r9]\n"

    "add %0, #2\n"          // src += 2
    "add %2, %3, lsl #1\n"  // dst += 2 * dst_stride
    "subs %4, #2\n"         // w -= 2
    "beq 4f\n"

    // 1x8 block
    "3:\n"
    "vld1.8 {d0[0]}, [%0], %1\n"
    "vld1.8 {d0[1]}, [%0], %1\n"
    "vld1.8 {d0[2]}, [%0], %1\n"
    "vld1.8 {d0[3]}, [%0], %1\n"
    "vld1.8 {d0[4]}, [%0], %1\n"
    "vld1.8 {d0[5]}, [%0], %1\n"
    "vld1.8 {d0[6]}, [%0], %1\n"
    "vld1.8 {d0[7]}, [%0]\n"

    "vst1.64 {d0}, [%2]\n"

    "4:\n"

    : "+r"(src),         // %0
      "+r"(src_stride),  // %1
      "+r"(dst),         // %2
      "+r"(dst_stride),  // %3
      "+r"(width)        // %4
    : "r"(vtbl_4x4_transpose)  // %5
    : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
  );
}

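// A scalar sketch of the transpose performed above, for reference only; the
// name is illustrative and not part of libyuv's API. The kernel reads an
// 8-row by width-column block and writes its transpose, so that
// dst[x * dst_stride + y] == src[y * src_stride + x].
static void TransposeWx8_C_Sketch(const uint8* src, int src_stride,
                                  uint8* dst, int dst_stride, int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}
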
void ReverseLineUV_NEON(const uint8* src,
                        uint8* dst_a, uint8* dst_b,
                        int width) {
  asm volatile(
    // compute where to start writing the destinations
    "add %1, %3\n"  // dst_a + width
    "add %2, %3\n"  // dst_b + width

    // work on input segments that are multiples of 16, but the
    // width that has been passed counts output segments, which
    // are half the size of the input.
    "lsrs r12, %3, #3\n"

    "beq 2f\n"

    // the output is written into two blocks.
    "mov r12, #-8\n"

    // back off the destinations by the size of the register that
    // is going to be reversed
    "sub %1, #8\n"
    "sub %2, #8\n"

    // the loop needs to run on blocks of 8. what will be left
    // over is either a negative number, the residuals that need
    // to be done, or 0. if this isn't subtracted off here the
    // loop will run one extra time.
    "sub %3, #8\n"

    "1:\n"
    "vld2.8 {d0, d1}, [%0]!\n"  // src += 16

    // reverse the bytes in the 64 bit segments
    "vrev64.8 q0, q0\n"

    "vst1.8 {d0}, [%1], r12\n"  // dst_a -= 8
    "vst1.8 {d1}, [%2], r12\n"  // dst_b -= 8

    "subs %3, #8\n"
    "bge 1b\n"

    // add 8 back to the counter. if the result is 0 there are no
    // residuals, so return.
    "adds %3, #8\n"
    "beq 4f\n"

    "add %1, #8\n"
    "add %2, #8\n"

    "2:\n"

    "mov r12, #-1\n"

    "sub %1, #1\n"
    "sub %2, #1\n"

    "3:\n"
    "vld2.8 {d0[0], d1[0]}, [%0]!\n"  // src += 2

    "vst1.8 {d0[0]}, [%1], r12\n"  // dst_a -= 1
    "vst1.8 {d1[0]}, [%2], r12\n"  // dst_b -= 1

    "subs %3, %3, #1\n"
    "bgt 3b\n"
    "4:\n"
    : "+r"(src),    // %0
      "+r"(dst_a),  // %1
      "+r"(dst_b),  // %2
      "+r"(width)   // %3
    :
    : "memory", "cc", "r12", "q0"
  );
}

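// A scalar sketch of what ReverseLineUV_NEON computes, for reference only;
// the name is illustrative and not part of libyuv's API. src holds width
// interleaved UV pairs; the U and V bytes are split into dst_a and dst_b and
// each output line is written mirrored.
static void ReverseLineUV_C_Sketch(const uint8* src,
                                   uint8* dst_a, uint8* dst_b, int width) {
  for (int i = 0; i < width; ++i) {
    dst_a[width - 1 - i] = src[2 * i + 0];
    dst_b[width - 1 - i] = src[2 * i + 1];
  }
}
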
static const uint8 vtbl_4x4_transpose_di[16] __attribute__((vector_size(16))) =
  { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
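// When used with vtbl, the table above zips a d-register pair byte by byte:
// its first half (indices 0, 8, 1, 9, ...) interleaves the low halves of the
// pair and its second half interleaves the high halves.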

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  asm volatile(
    // loops are on blocks of 8. the loop will stop when the
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this.
    "sub %6, #8\n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:\n"
    "mov r9, %0\n"

    "vld2.8 {d0, d1}, [r9], %1\n"
    "vld2.8 {d2, d3}, [r9], %1\n"
    "vld2.8 {d4, d5}, [r9], %1\n"
    "vld2.8 {d6, d7}, [r9], %1\n"
    "vld2.8 {d16, d17}, [r9], %1\n"
    "vld2.8 {d18, d19}, [r9], %1\n"
    "vld2.8 {d20, d21}, [r9], %1\n"
    "vld2.8 {d22, d23}, [r9]\n"

    "vtrn.8 q1, q0\n"
    "vtrn.8 q3, q2\n"
    "vtrn.8 q9, q8\n"
    "vtrn.8 q11, q10\n"

    "vtrn.16 q1, q3\n"
    "vtrn.16 q0, q2\n"
    "vtrn.16 q9, q11\n"
    "vtrn.16 q8, q10\n"

    "vtrn.32 q1, q9\n"
    "vtrn.32 q0, q8\n"
    "vtrn.32 q3, q11\n"
    "vtrn.32 q2, q10\n"

    "vrev16.8 q0, q0\n"
    "vrev16.8 q1, q1\n"
    "vrev16.8 q2, q2\n"
    "vrev16.8 q3, q3\n"
    "vrev16.8 q8, q8\n"
    "vrev16.8 q9, q9\n"
    "vrev16.8 q10, q10\n"
    "vrev16.8 q11, q11\n"

    "mov r9, %2\n"

    "vst1.8 {d2}, [r9], %3\n"
    "vst1.8 {d0}, [r9], %3\n"
    "vst1.8 {d6}, [r9], %3\n"
    "vst1.8 {d4}, [r9], %3\n"
    "vst1.8 {d18}, [r9], %3\n"
    "vst1.8 {d16}, [r9], %3\n"
    "vst1.8 {d22}, [r9], %3\n"
    "vst1.8 {d20}, [r9]\n"

    "mov r9, %4\n"

    "vst1.8 {d3}, [r9], %5\n"
    "vst1.8 {d1}, [r9], %5\n"
    "vst1.8 {d7}, [r9], %5\n"
    "vst1.8 {d5}, [r9], %5\n"
    "vst1.8 {d19}, [r9], %5\n"
    "vst1.8 {d17}, [r9], %5\n"
    "vst1.8 {d23}, [r9], %5\n"
    "vst1.8 {d21}, [r9]\n"

    "add %0, #8*2\n"        // src += 8*2
    "add %2, %3, lsl #3\n"  // dst_a += 8 * dst_stride_a
    "add %4, %5, lsl #3\n"  // dst_b += 8 * dst_stride_b
    "subs %6, #8\n"         // w -= 8
    "bge 1b\n"

    // add 8 back to the counter. if the result is 0 there are
    // no residuals.
    "adds %6, #8\n"
    "beq 4f\n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp %6, #2\n"
    "blt 3f\n"

    "cmp %6, #4\n"
    "blt 2f\n"

    // TODO(frkoenig): clean this up
    // 4x8 block
    "mov r9, %0\n"
    "vld1.64 {d0}, [r9], %1\n"
    "vld1.64 {d1}, [r9], %1\n"
    "vld1.64 {d2}, [r9], %1\n"
    "vld1.64 {d3}, [r9], %1\n"
    "vld1.64 {d4}, [r9], %1\n"
    "vld1.64 {d5}, [r9], %1\n"
    "vld1.64 {d6}, [r9], %1\n"
    "vld1.64 {d7}, [r9]\n"

    "vld1.8 {q15}, [%7]\n"

    "vtrn.8 q0, q1\n"
    "vtrn.8 q2, q3\n"

    "vtbl.8 d16, {d0, d1}, d30\n"
    "vtbl.8 d17, {d0, d1}, d31\n"
    "vtbl.8 d18, {d2, d3}, d30\n"
    "vtbl.8 d19, {d2, d3}, d31\n"
    "vtbl.8 d20, {d4, d5}, d30\n"
    "vtbl.8 d21, {d4, d5}, d31\n"
    "vtbl.8 d22, {d6, d7}, d30\n"
    "vtbl.8 d23, {d6, d7}, d31\n"

    "mov r9, %2\n"

    "vst1.32 {d16[0]}, [r9], %3\n"
    "vst1.32 {d16[1]}, [r9], %3\n"
    "vst1.32 {d17[0]}, [r9], %3\n"
    "vst1.32 {d17[1]}, [r9], %3\n"

    "add r9, %2, #4\n"
    "vst1.32 {d20[0]}, [r9], %3\n"
    "vst1.32 {d20[1]}, [r9], %3\n"
    "vst1.32 {d21[0]}, [r9], %3\n"
    "vst1.32 {d21[1]}, [r9]\n"

    "mov r9, %4\n"

    "vst1.32 {d18[0]}, [r9], %5\n"
    "vst1.32 {d18[1]}, [r9], %5\n"
    "vst1.32 {d19[0]}, [r9], %5\n"
    "vst1.32 {d19[1]}, [r9], %5\n"

    "add r9, %4, #4\n"
    "vst1.32 {d22[0]}, [r9], %5\n"
    "vst1.32 {d22[1]}, [r9], %5\n"
    "vst1.32 {d23[0]}, [r9], %5\n"
    "vst1.32 {d23[1]}, [r9]\n"

    "add %0, #4*2\n"        // src += 4 * 2
    "add %2, %3, lsl #2\n"  // dst_a += 4 * dst_stride_a
    "add %4, %5, lsl #2\n"  // dst_b += 4 * dst_stride_b
    "subs %6, #4\n"         // w -= 4
    "beq 4f\n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp %6, #2\n"
    "blt 3f\n"

    // 2x8 block
    "2:\n"
    "mov r9, %0\n"
    "vld2.16 {d0[0], d2[0]}, [r9], %1\n"
    "vld2.16 {d1[0], d3[0]}, [r9], %1\n"
    "vld2.16 {d0[1], d2[1]}, [r9], %1\n"
    "vld2.16 {d1[1], d3[1]}, [r9], %1\n"
    "vld2.16 {d0[2], d2[2]}, [r9], %1\n"
    "vld2.16 {d1[2], d3[2]}, [r9], %1\n"
    "vld2.16 {d0[3], d2[3]}, [r9], %1\n"
    "vld2.16 {d1[3], d3[3]}, [r9]\n"

    "vtrn.8 d0, d1\n"
    "vtrn.8 d2, d3\n"

    "mov r9, %2\n"

    "vst1.64 {d0}, [r9], %3\n"
    "vst1.64 {d2}, [r9]\n"

    "mov r9, %4\n"

    "vst1.64 {d1}, [r9], %5\n"
    "vst1.64 {d3}, [r9]\n"

    "add %0, #2*2\n"        // src += 2 * 2
    "add %2, %3, lsl #1\n"  // dst_a += 2 * dst_stride_a
    "add %4, %5, lsl #1\n"  // dst_b += 2 * dst_stride_b
    "subs %6, #2\n"         // w -= 2
    "beq 4f\n"

    // 1x8 block
    "3:\n"
    "vld2.8 {d0[0], d1[0]}, [%0], %1\n"
    "vld2.8 {d0[1], d1[1]}, [%0], %1\n"
    "vld2.8 {d0[2], d1[2]}, [%0], %1\n"
    "vld2.8 {d0[3], d1[3]}, [%0], %1\n"
    "vld2.8 {d0[4], d1[4]}, [%0], %1\n"
    "vld2.8 {d0[5], d1[5]}, [%0], %1\n"
    "vld2.8 {d0[6], d1[6]}, [%0], %1\n"
    "vld2.8 {d0[7], d1[7]}, [%0]\n"

    "vst1.64 {d0}, [%2]\n"
    "vst1.64 {d1}, [%4]\n"

    "4:\n"

    : "+r"(src),            // %0
      "+r"(src_stride),     // %1
      "+r"(dst_a),          // %2
      "+r"(dst_stride_a),   // %3
      "+r"(dst_b),          // %4
      "+r"(dst_stride_b),   // %5
      "+r"(width)           // %6
    : "r"(vtbl_4x4_transpose_di)  // %7
    : "memory", "cc", "r9",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q15"
  );
}
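
// A scalar sketch of the UV transpose above, for reference only; the name is
// illustrative and not part of libyuv's API. src holds 8 rows of width
// interleaved UV pairs; the U bytes are transposed into dst_a and the V bytes
// into dst_b.
static void TransposeUVWx8_C_Sketch(const uint8* src, int src_stride,
                                    uint8* dst_a, int dst_stride_a,
                                    uint8* dst_b, int dst_stride_b,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst_a[x * dst_stride_a + y] = src[y * src_stride + 2 * x + 0];
      dst_b[x * dst_stride_b + y] = src[y * src_stride + 2 * x + 1];
    }
  }
}
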
#endif  // defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
}  // namespace libyuv