frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 1 | /* |
fbarchard@google.com | b0c9797 | 2012-08-08 19:04:24 +0000 | [diff] [blame] | 2 | * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license |
| 5 | * that can be found in the LICENSE file in the root of the source |
| 6 | * tree. An additional intellectual property rights grant can be found |
| 7 | * in the file PATENTS. All contributing project authors may |
| 8 | * be found in the AUTHORS file in the root of the source tree. |
| 9 | */ |
| 10 | |
fbarchard@google.com | 142f6c4 | 2012-09-18 20:56:51 +0000 | [diff] [blame] | 11 | #include "libyuv/row.h" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 12 | |
fbarchard@google.com | 17f198c | 2012-01-04 02:21:05 +0000 | [diff] [blame] | 13 | #include "libyuv/basic_types.h" |
| 14 | |
fbarchard@google.com | fe5ff7e | 2011-12-10 07:45:58 +0000 | [diff] [blame] | 15 | #ifdef __cplusplus |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 16 | namespace libyuv { |
fbarchard@google.com | fe5ff7e | 2011-12-10 07:45:58 +0000 | [diff] [blame] | 17 | extern "C" { |
| 18 | #endif |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 19 | |
fbarchard@google.com | d2f4413 | 2012-04-04 21:53:27 +0000 | [diff] [blame] | 20 | #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 21 | |
fbarchard@google.com | 5e642d0 | 2012-04-21 01:04:46 +0000 | [diff] [blame] | 22 | static const uvec8 kVTbl4x4Transpose = |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 23 | { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; |
| 24 | |
| 25 | void TransposeWx8_NEON(const uint8* src, int src_stride, |
| 26 | uint8* dst, int dst_stride, |
| 27 | int width) { |
fbarchard@google.com | 5b22506 | 2012-03-29 02:19:26 +0000 | [diff] [blame] | 28 | asm volatile ( |
fbarchard@google.com | 64ce0ab | 2012-10-09 00:05:29 +0000 | [diff] [blame] | 29 | // loops are on blocks of 8. loop will stop when |
| 30 | // counter gets to or below 0. starting the counter |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 31 | // at w-8 allow for this |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 32 | "sub %4, #8 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 33 | |
fbarchard@google.com | 64ce0ab | 2012-10-09 00:05:29 +0000 | [diff] [blame] | 34 | // handle 8x8 blocks. this should be the majority of the plane |
fbarchard@google.com | 5bf29b5 | 2012-05-02 00:10:16 +0000 | [diff] [blame] | 35 | ".p2align 4 \n" |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 36 | "1: \n" |
| 37 | "mov r9, %0 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 38 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 39 | "vld1.8 {d0}, [r9], %1 \n" |
| 40 | "vld1.8 {d1}, [r9], %1 \n" |
| 41 | "vld1.8 {d2}, [r9], %1 \n" |
| 42 | "vld1.8 {d3}, [r9], %1 \n" |
| 43 | "vld1.8 {d4}, [r9], %1 \n" |
| 44 | "vld1.8 {d5}, [r9], %1 \n" |
| 45 | "vld1.8 {d6}, [r9], %1 \n" |
| 46 | "vld1.8 {d7}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 47 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 48 | "vtrn.8 d1, d0 \n" |
| 49 | "vtrn.8 d3, d2 \n" |
| 50 | "vtrn.8 d5, d4 \n" |
| 51 | "vtrn.8 d7, d6 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 52 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 53 | "vtrn.16 d1, d3 \n" |
| 54 | "vtrn.16 d0, d2 \n" |
| 55 | "vtrn.16 d5, d7 \n" |
| 56 | "vtrn.16 d4, d6 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 57 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 58 | "vtrn.32 d1, d5 \n" |
| 59 | "vtrn.32 d0, d4 \n" |
| 60 | "vtrn.32 d3, d7 \n" |
| 61 | "vtrn.32 d2, d6 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 62 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 63 | "vrev16.8 q0, q0 \n" |
| 64 | "vrev16.8 q1, q1 \n" |
| 65 | "vrev16.8 q2, q2 \n" |
| 66 | "vrev16.8 q3, q3 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 67 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 68 | "mov r9, %2 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 69 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 70 | "vst1.8 {d1}, [r9], %3 \n" |
| 71 | "vst1.8 {d0}, [r9], %3 \n" |
| 72 | "vst1.8 {d3}, [r9], %3 \n" |
| 73 | "vst1.8 {d2}, [r9], %3 \n" |
| 74 | "vst1.8 {d5}, [r9], %3 \n" |
| 75 | "vst1.8 {d4}, [r9], %3 \n" |
| 76 | "vst1.8 {d7}, [r9], %3 \n" |
| 77 | "vst1.8 {d6}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 78 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 79 | "add %0, #8 \n" // src += 8 |
| 80 | "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride |
| 81 | "subs %4, #8 \n" // w -= 8 |
| 82 | "bge 1b \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 83 | |
fbarchard@google.com | 64ce0ab | 2012-10-09 00:05:29 +0000 | [diff] [blame] | 84 | // add 8 back to counter. if the result is 0 there are |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 85 | // no residuals. |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 86 | "adds %4, #8 \n" |
| 87 | "beq 4f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 88 | |
| 89 | // some residual, so between 1 and 7 lines left to transpose |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 90 | "cmp %4, #2 \n" |
| 91 | "blt 3f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 92 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 93 | "cmp %4, #4 \n" |
| 94 | "blt 2f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 95 | |
| 96 | // 4x8 block |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 97 | "mov r9, %0 \n" |
| 98 | "vld1.32 {d0[0]}, [r9], %1 \n" |
| 99 | "vld1.32 {d0[1]}, [r9], %1 \n" |
| 100 | "vld1.32 {d1[0]}, [r9], %1 \n" |
| 101 | "vld1.32 {d1[1]}, [r9], %1 \n" |
| 102 | "vld1.32 {d2[0]}, [r9], %1 \n" |
| 103 | "vld1.32 {d2[1]}, [r9], %1 \n" |
| 104 | "vld1.32 {d3[0]}, [r9], %1 \n" |
| 105 | "vld1.32 {d3[1]}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 106 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 107 | "mov r9, %2 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 108 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 109 | "vld1.8 {q3}, [%5] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 110 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 111 | "vtbl.8 d4, {d0, d1}, d6 \n" |
| 112 | "vtbl.8 d5, {d0, d1}, d7 \n" |
| 113 | "vtbl.8 d0, {d2, d3}, d6 \n" |
| 114 | "vtbl.8 d1, {d2, d3}, d7 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 115 | |
fbarchard@google.com | 1d160cb | 2012-11-28 20:02:55 +0000 | [diff] [blame] | 116 | // TODO(frkoenig): Rework shuffle above to |
| 117 | // write out with 4 instead of 8 writes. |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 118 | "vst1.32 {d4[0]}, [r9], %3 \n" |
| 119 | "vst1.32 {d4[1]}, [r9], %3 \n" |
| 120 | "vst1.32 {d5[0]}, [r9], %3 \n" |
| 121 | "vst1.32 {d5[1]}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 122 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 123 | "add r9, %2, #4 \n" |
| 124 | "vst1.32 {d0[0]}, [r9], %3 \n" |
| 125 | "vst1.32 {d0[1]}, [r9], %3 \n" |
| 126 | "vst1.32 {d1[0]}, [r9], %3 \n" |
| 127 | "vst1.32 {d1[1]}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 128 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 129 | "add %0, #4 \n" // src += 4 |
| 130 | "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride |
| 131 | "subs %4, #4 \n" // w -= 4 |
| 132 | "beq 4f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 133 | |
| 134 | // some residual, check to see if it includes a 2x8 block, |
| 135 | // or less |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 136 | "cmp %4, #2 \n" |
| 137 | "blt 3f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 138 | |
| 139 | // 2x8 block |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 140 | "2: \n" |
| 141 | "mov r9, %0 \n" |
| 142 | "vld1.16 {d0[0]}, [r9], %1 \n" |
| 143 | "vld1.16 {d1[0]}, [r9], %1 \n" |
| 144 | "vld1.16 {d0[1]}, [r9], %1 \n" |
| 145 | "vld1.16 {d1[1]}, [r9], %1 \n" |
| 146 | "vld1.16 {d0[2]}, [r9], %1 \n" |
| 147 | "vld1.16 {d1[2]}, [r9], %1 \n" |
| 148 | "vld1.16 {d0[3]}, [r9], %1 \n" |
| 149 | "vld1.16 {d1[3]}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 150 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 151 | "vtrn.8 d0, d1 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 152 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 153 | "mov r9, %2 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 154 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 155 | "vst1.64 {d0}, [r9], %3 \n" |
| 156 | "vst1.64 {d1}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 157 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 158 | "add %0, #2 \n" // src += 2 |
| 159 | "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride |
| 160 | "subs %4, #2 \n" // w -= 2 |
| 161 | "beq 4f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 162 | |
| 163 | // 1x8 block |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 164 | "3: \n" |
| 165 | "vld1.8 {d0[0]}, [%0], %1 \n" |
| 166 | "vld1.8 {d0[1]}, [%0], %1 \n" |
| 167 | "vld1.8 {d0[2]}, [%0], %1 \n" |
| 168 | "vld1.8 {d0[3]}, [%0], %1 \n" |
| 169 | "vld1.8 {d0[4]}, [%0], %1 \n" |
| 170 | "vld1.8 {d0[5]}, [%0], %1 \n" |
| 171 | "vld1.8 {d0[6]}, [%0], %1 \n" |
| 172 | "vld1.8 {d0[7]}, [%0] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 173 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 174 | "vst1.64 {d0}, [%2] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 175 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 176 | "4: \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 177 | |
fbarchard@google.com | c428470 | 2012-04-22 15:00:50 +0000 | [diff] [blame] | 178 | : "+r"(src), // %0 |
| 179 | "+r"(src_stride), // %1 |
| 180 | "+r"(dst), // %2 |
| 181 | "+r"(dst_stride), // %3 |
| 182 | "+r"(width) // %4 |
| 183 | : "r"(&kVTbl4x4Transpose) // %5 |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 184 | : "memory", "cc", "r9", "q0", "q1", "q2", "q3" |
| 185 | ); |
| 186 | } |
| 187 | |
fbarchard@google.com | 5e642d0 | 2012-04-21 01:04:46 +0000 | [diff] [blame] | 188 | static const uvec8 kVTbl4x4TransposeDi = |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 189 | { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; |
| 190 | |
| 191 | void TransposeUVWx8_NEON(const uint8* src, int src_stride, |
| 192 | uint8* dst_a, int dst_stride_a, |
| 193 | uint8* dst_b, int dst_stride_b, |
| 194 | int width) { |
fbarchard@google.com | 5b22506 | 2012-03-29 02:19:26 +0000 | [diff] [blame] | 195 | asm volatile ( |
fbarchard@google.com | 64ce0ab | 2012-10-09 00:05:29 +0000 | [diff] [blame] | 196 | // loops are on blocks of 8. loop will stop when |
| 197 | // counter gets to or below 0. starting the counter |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 198 | // at w-8 allow for this |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 199 | "sub %6, #8 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 200 | |
fbarchard@google.com | 64ce0ab | 2012-10-09 00:05:29 +0000 | [diff] [blame] | 201 | // handle 8x8 blocks. this should be the majority of the plane |
fbarchard@google.com | 5bf29b5 | 2012-05-02 00:10:16 +0000 | [diff] [blame] | 202 | ".p2align 4 \n" |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 203 | "1: \n" |
| 204 | "mov r9, %0 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 205 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 206 | "vld2.8 {d0, d1}, [r9], %1 \n" |
| 207 | "vld2.8 {d2, d3}, [r9], %1 \n" |
| 208 | "vld2.8 {d4, d5}, [r9], %1 \n" |
| 209 | "vld2.8 {d6, d7}, [r9], %1 \n" |
| 210 | "vld2.8 {d16, d17}, [r9], %1 \n" |
| 211 | "vld2.8 {d18, d19}, [r9], %1 \n" |
| 212 | "vld2.8 {d20, d21}, [r9], %1 \n" |
| 213 | "vld2.8 {d22, d23}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 214 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 215 | "vtrn.8 q1, q0 \n" |
| 216 | "vtrn.8 q3, q2 \n" |
| 217 | "vtrn.8 q9, q8 \n" |
| 218 | "vtrn.8 q11, q10 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 219 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 220 | "vtrn.16 q1, q3 \n" |
| 221 | "vtrn.16 q0, q2 \n" |
| 222 | "vtrn.16 q9, q11 \n" |
| 223 | "vtrn.16 q8, q10 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 224 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 225 | "vtrn.32 q1, q9 \n" |
| 226 | "vtrn.32 q0, q8 \n" |
| 227 | "vtrn.32 q3, q11 \n" |
| 228 | "vtrn.32 q2, q10 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 229 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 230 | "vrev16.8 q0, q0 \n" |
| 231 | "vrev16.8 q1, q1 \n" |
| 232 | "vrev16.8 q2, q2 \n" |
| 233 | "vrev16.8 q3, q3 \n" |
| 234 | "vrev16.8 q8, q8 \n" |
| 235 | "vrev16.8 q9, q9 \n" |
| 236 | "vrev16.8 q10, q10 \n" |
| 237 | "vrev16.8 q11, q11 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 238 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 239 | "mov r9, %2 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 240 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 241 | "vst1.8 {d2}, [r9], %3 \n" |
| 242 | "vst1.8 {d0}, [r9], %3 \n" |
| 243 | "vst1.8 {d6}, [r9], %3 \n" |
| 244 | "vst1.8 {d4}, [r9], %3 \n" |
| 245 | "vst1.8 {d18}, [r9], %3 \n" |
| 246 | "vst1.8 {d16}, [r9], %3 \n" |
| 247 | "vst1.8 {d22}, [r9], %3 \n" |
| 248 | "vst1.8 {d20}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 249 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 250 | "mov r9, %4 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 251 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 252 | "vst1.8 {d3}, [r9], %5 \n" |
| 253 | "vst1.8 {d1}, [r9], %5 \n" |
| 254 | "vst1.8 {d7}, [r9], %5 \n" |
| 255 | "vst1.8 {d5}, [r9], %5 \n" |
| 256 | "vst1.8 {d19}, [r9], %5 \n" |
| 257 | "vst1.8 {d17}, [r9], %5 \n" |
| 258 | "vst1.8 {d23}, [r9], %5 \n" |
| 259 | "vst1.8 {d21}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 260 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 261 | "add %0, #8*2 \n" // src += 8*2 |
| 262 | "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a |
| 263 | "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b |
| 264 | "subs %6, #8 \n" // w -= 8 |
| 265 | "bge 1b \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 266 | |
fbarchard@google.com | 64ce0ab | 2012-10-09 00:05:29 +0000 | [diff] [blame] | 267 | // add 8 back to counter. if the result is 0 there are |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 268 | // no residuals. |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 269 | "adds %6, #8 \n" |
| 270 | "beq 4f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 271 | |
| 272 | // some residual, so between 1 and 7 lines left to transpose |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 273 | "cmp %6, #2 \n" |
| 274 | "blt 3f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 275 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 276 | "cmp %6, #4 \n" |
| 277 | "blt 2f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 278 | |
fbarchard@google.com | 1d160cb | 2012-11-28 20:02:55 +0000 | [diff] [blame] | 279 | //TODO(frkoenig): Clean this up |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 280 | // 4x8 block |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 281 | "mov r9, %0 \n" |
| 282 | "vld1.64 {d0}, [r9], %1 \n" |
| 283 | "vld1.64 {d1}, [r9], %1 \n" |
| 284 | "vld1.64 {d2}, [r9], %1 \n" |
| 285 | "vld1.64 {d3}, [r9], %1 \n" |
| 286 | "vld1.64 {d4}, [r9], %1 \n" |
| 287 | "vld1.64 {d5}, [r9], %1 \n" |
| 288 | "vld1.64 {d6}, [r9], %1 \n" |
| 289 | "vld1.64 {d7}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 290 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 291 | "vld1.8 {q15}, [%7] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 292 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 293 | "vtrn.8 q0, q1 \n" |
| 294 | "vtrn.8 q2, q3 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 295 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 296 | "vtbl.8 d16, {d0, d1}, d30 \n" |
| 297 | "vtbl.8 d17, {d0, d1}, d31 \n" |
| 298 | "vtbl.8 d18, {d2, d3}, d30 \n" |
| 299 | "vtbl.8 d19, {d2, d3}, d31 \n" |
| 300 | "vtbl.8 d20, {d4, d5}, d30 \n" |
| 301 | "vtbl.8 d21, {d4, d5}, d31 \n" |
| 302 | "vtbl.8 d22, {d6, d7}, d30 \n" |
| 303 | "vtbl.8 d23, {d6, d7}, d31 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 304 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 305 | "mov r9, %2 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 306 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 307 | "vst1.32 {d16[0]}, [r9], %3 \n" |
| 308 | "vst1.32 {d16[1]}, [r9], %3 \n" |
| 309 | "vst1.32 {d17[0]}, [r9], %3 \n" |
| 310 | "vst1.32 {d17[1]}, [r9], %3 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 311 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 312 | "add r9, %2, #4 \n" |
| 313 | "vst1.32 {d20[0]}, [r9], %3 \n" |
| 314 | "vst1.32 {d20[1]}, [r9], %3 \n" |
| 315 | "vst1.32 {d21[0]}, [r9], %3 \n" |
| 316 | "vst1.32 {d21[1]}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 317 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 318 | "mov r9, %4 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 319 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 320 | "vst1.32 {d18[0]}, [r9], %5 \n" |
| 321 | "vst1.32 {d18[1]}, [r9], %5 \n" |
| 322 | "vst1.32 {d19[0]}, [r9], %5 \n" |
| 323 | "vst1.32 {d19[1]}, [r9], %5 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 324 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 325 | "add r9, %4, #4 \n" |
| 326 | "vst1.32 {d22[0]}, [r9], %5 \n" |
| 327 | "vst1.32 {d22[1]}, [r9], %5 \n" |
| 328 | "vst1.32 {d23[0]}, [r9], %5 \n" |
| 329 | "vst1.32 {d23[1]}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 330 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 331 | "add %0, #4*2 \n" // src += 4 * 2 |
| 332 | "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a |
| 333 | "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b |
| 334 | "subs %6, #4 \n" // w -= 4 |
| 335 | "beq 4f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 336 | |
| 337 | // some residual, check to see if it includes a 2x8 block, |
| 338 | // or less |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 339 | "cmp %6, #2 \n" |
| 340 | "blt 3f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 341 | |
| 342 | // 2x8 block |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 343 | "2: \n" |
| 344 | "mov r9, %0 \n" |
| 345 | "vld2.16 {d0[0], d2[0]}, [r9], %1 \n" |
| 346 | "vld2.16 {d1[0], d3[0]}, [r9], %1 \n" |
| 347 | "vld2.16 {d0[1], d2[1]}, [r9], %1 \n" |
| 348 | "vld2.16 {d1[1], d3[1]}, [r9], %1 \n" |
| 349 | "vld2.16 {d0[2], d2[2]}, [r9], %1 \n" |
| 350 | "vld2.16 {d1[2], d3[2]}, [r9], %1 \n" |
| 351 | "vld2.16 {d0[3], d2[3]}, [r9], %1 \n" |
| 352 | "vld2.16 {d1[3], d3[3]}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 353 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 354 | "vtrn.8 d0, d1 \n" |
| 355 | "vtrn.8 d2, d3 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 356 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 357 | "mov r9, %2 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 358 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 359 | "vst1.64 {d0}, [r9], %3 \n" |
| 360 | "vst1.64 {d2}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 361 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 362 | "mov r9, %4 \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 363 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 364 | "vst1.64 {d1}, [r9], %5 \n" |
| 365 | "vst1.64 {d3}, [r9] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 366 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 367 | "add %0, #2*2 \n" // src += 2 * 2 |
| 368 | "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a |
| 369 | "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b |
| 370 | "subs %6, #2 \n" // w -= 2 |
| 371 | "beq 4f \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 372 | |
| 373 | // 1x8 block |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 374 | "3: \n" |
| 375 | "vld2.8 {d0[0], d1[0]}, [%0], %1 \n" |
| 376 | "vld2.8 {d0[1], d1[1]}, [%0], %1 \n" |
| 377 | "vld2.8 {d0[2], d1[2]}, [%0], %1 \n" |
| 378 | "vld2.8 {d0[3], d1[3]}, [%0], %1 \n" |
| 379 | "vld2.8 {d0[4], d1[4]}, [%0], %1 \n" |
| 380 | "vld2.8 {d0[5], d1[5]}, [%0], %1 \n" |
| 381 | "vld2.8 {d0[6], d1[6]}, [%0], %1 \n" |
| 382 | "vld2.8 {d0[7], d1[7]}, [%0] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 383 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 384 | "vst1.64 {d0}, [%2] \n" |
| 385 | "vst1.64 {d1}, [%4] \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 386 | |
fbarchard@google.com | f7a5048 | 2011-11-10 22:41:20 +0000 | [diff] [blame] | 387 | "4: \n" |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 388 | |
fbarchard@google.com | 5e642d0 | 2012-04-21 01:04:46 +0000 | [diff] [blame] | 389 | : "+r"(src), // %0 |
| 390 | "+r"(src_stride), // %1 |
| 391 | "+r"(dst_a), // %2 |
| 392 | "+r"(dst_stride_a), // %3 |
| 393 | "+r"(dst_b), // %4 |
| 394 | "+r"(dst_stride_b), // %5 |
| 395 | "+r"(width) // %6 |
| 396 | : "r"(&kVTbl4x4TransposeDi) // %7 |
frkoenig@google.com | f7e74a1 | 2011-11-03 22:41:59 +0000 | [diff] [blame] | 397 | : "memory", "cc", "r9", |
| 398 | "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" |
| 399 | ); |
| 400 | } |
| 401 | #endif |
fbarchard@google.com | fe5ff7e | 2011-12-10 07:45:58 +0000 | [diff] [blame] | 402 | |
| 403 | #ifdef __cplusplus |
| 404 | } // extern "C" |
| 405 | } // namespace libyuv |
| 406 | #endif |