fbarchard@google.com | ca41005 | 2012-10-14 06:01:19 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license |
| 5 | * that can be found in the LICENSE file in the root of the source |
| 6 | * tree. An additional intellectual property rights grant can be found |
| 7 | * in the file PATENTS. All contributing project authors may |
| 8 | * be found in the AUTHORS file in the root of the source tree. |
| 9 | */ |
| 10 | |
| 11 | #include "libyuv/row.h" |
| 12 | #ifdef __cplusplus |
| 13 | namespace libyuv { |
| 14 | extern "C" { |
| 15 | #endif |
| 16 | |
| 17 | #if !defined(YUV_DISABLE_ASM) && defined(__mips__) |
| 18 | #ifdef HAS_SPLITUV_MIPS_DSPR2 |
| 19 | void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { |
| 20 | |
| 21 | __asm__ __volatile__( |
| 22 | ".set push \n\t" |
| 23 | ".set noreorder \n\t" |
| 24 | |
| 25 | "srl $t4, %[width], 4 \n\t" // how many multiplies of 16 8bits |
| 26 | "blez $t4, 2f \n\t" |
| 27 | " andi %[width], %[width], 0xf \n\t" // residual |
| 28 | "andi $t0, %[src_uv], 0x3 \n\t" |
| 29 | "andi $t1, %[dst_u], 0x3 \n\t" |
| 30 | "andi $t2, %[dst_v], 0x3 \n\t" |
| 31 | "or $t0, $t0, $t1 \n\t" |
| 32 | "or $t0, $t0, $t2 \n\t" |
| 33 | |
| 34 | "beqz $t0, 12f \n\t" // if src and dsts are aligned |
| 35 | " nop \n\t" |
| 36 | |
| 37 | // src and dst are unaligned |
| 38 | "1: \n\t" |
| 39 | "addiu $t4, $t4, -1 \n\t" |
| 40 | "lwr $t0, 0(%[src_uv]) \n\t" |
| 41 | "lwl $t0, 3(%[src_uv]) \n\t" // t0 = V1 | U1 | V0 | U0 |
| 42 | "lwr $t1, 4(%[src_uv]) \n\t" |
| 43 | "lwl $t1, 7(%[src_uv]) \n\t" // t1 = V3 | U3 | V2 | U2 |
| 44 | "lwr $t2, 8(%[src_uv]) \n\t" |
| 45 | "lwl $t2, 11(%[src_uv]) \n\t" // t2 = V5 | U5 | V4 | U4 |
| 46 | "lwr $t3, 12(%[src_uv]) \n\t" |
| 47 | "lwl $t3, 15(%[src_uv]) \n\t" // t3 = V7 | U7 | V6 | U6 |
| 48 | "lwr $t5, 16(%[src_uv]) \n\t" |
| 49 | "lwl $t5, 19(%[src_uv]) \n\t" // t5 = V9 | U9 | V8 | U8 |
| 50 | "lwr $t6, 20(%[src_uv]) \n\t" |
| 51 | "lwl $t6, 23(%[src_uv]) \n\t" // t6 = V11 | U11 | V10 | U10 |
| 52 | "lwr $t7, 24(%[src_uv]) \n\t" |
| 53 | "lwl $t7, 27(%[src_uv]) \n\t" // t7 = V13 | U13 | V12 | U12 |
| 54 | "lwr $t8, 28(%[src_uv]) \n\t" |
| 55 | "lwl $t8, 31(%[src_uv]) \n\t" // t8 = V15 | U15 | V14 | U14 |
| 56 | |
| 57 | "precrq.qb.ph $t9, $t1, $t0 \n\t" // t9 = V3 | V2 | V1 | V0 |
| 58 | "precr.qb.ph $t0, $t1, $t0 \n\t" // t0 = U3 | U2 | U1 | U0 |
| 59 | "precrq.qb.ph $t1, $t3, $t2 \n\t" // t1 = V7 | V6 | V5 | V4 |
| 60 | "precr.qb.ph $t2, $t3, $t2 \n\t" // t2 = U7 | U6 | U5 | U4 |
| 61 | "precrq.qb.ph $t3, $t6, $t5 \n\t" // t3 = V11 | V10 | V9 | V8 |
| 62 | "precr.qb.ph $t5, $t6, $t5 \n\t" // t5 = U11 | U10 | U9 | U8 |
| 63 | "precrq.qb.ph $t6, $t8, $t7 \n\t" // t6 = V15 | V14 | V13 | V12 |
| 64 | "precr.qb.ph $t7, $t8, $t7 \n\t" // t7 = U15 | U14 | U13 | U12 |
| 65 | "addiu %[src_uv], %[src_uv], 32 \n\t" |
| 66 | |
| 67 | "swr $t9, 0(%[dst_v]) \n\t" |
| 68 | "swl $t9, 3(%[dst_v]) \n\t" |
| 69 | "swr $t0, 0(%[dst_u]) \n\t" |
| 70 | "swl $t0, 3(%[dst_u]) \n\t" |
| 71 | "swr $t1, 4(%[dst_v]) \n\t" |
| 72 | "swl $t1, 7(%[dst_v]) \n\t" |
| 73 | "swr $t2, 4(%[dst_u]) \n\t" |
| 74 | "swl $t2, 7(%[dst_u]) \n\t" |
| 75 | "swr $t3, 8(%[dst_v]) \n\t" |
| 76 | "swl $t3, 11(%[dst_v]) \n\t" |
| 77 | "swr $t5, 8(%[dst_u]) \n\t" |
| 78 | "swl $t5, 11(%[dst_u]) \n\t" |
| 79 | "swr $t6, 12(%[dst_v]) \n\t" |
| 80 | "swl $t6, 15(%[dst_v]) \n\t" |
| 81 | "swr $t7, 12(%[dst_u]) \n\t" |
| 82 | "swl $t7, 15(%[dst_u]) \n\t" |
| 83 | "addiu %[dst_u], %[dst_u], 16 \n\t" |
| 84 | "bgtz $t4, 1b \n\t" |
| 85 | " addiu %[dst_v], %[dst_v], 16 \n\t" |
| 86 | |
| 87 | "beqz %[width], 3f \n\t" |
| 88 | " nop \n\t" |
| 89 | "b 2f \n\t" |
| 90 | " nop \n\t" |
| 91 | |
| 92 | // src and dst are aligned |
| 93 | "12: \n\t" |
| 94 | "addiu $t4, $t4, -1 \n\t" |
| 95 | "lw $t0, 0(%[src_uv]) \n\t" // t0 = V1 | U1 | V0 | U0 |
| 96 | "lw $t1, 4(%[src_uv]) \n\t" // t1 = V3 | U3 | V2 | U2 |
| 97 | "lw $t2, 8(%[src_uv]) \n\t" // t2 = V5 | U5 | V4 | U4 |
| 98 | "lw $t3, 12(%[src_uv]) \n\t" // t3 = V7 | U7 | V6 | U6 |
| 99 | "lw $t5, 16(%[src_uv]) \n\t" // t5 = V9 | U9 | V8 | U8 |
| 100 | "lw $t6, 20(%[src_uv]) \n\t" // t6 = V11 | U11 | V10 | U10 |
| 101 | "lw $t7, 24(%[src_uv]) \n\t" // t7 = V13 | U13 | V12 | U12 |
| 102 | "lw $t8, 28(%[src_uv]) \n\t" // t8 = V15 | U15 | V14 | U14 |
| 103 | |
| 104 | "addiu %[src_uv], %[src_uv], 32 \n\t" |
| 105 | "precrq.qb.ph $t9, $t1, $t0 \n\t" // t9 = V3 | V2 | V1 | V0 |
| 106 | "precr.qb.ph $t0, $t1, $t0 \n\t" // t0 = U3 | U2 | U1 | U0 |
| 107 | "precrq.qb.ph $t1, $t3, $t2 \n\t" // t1 = V7 | V6 | V5 | V4 |
| 108 | "precr.qb.ph $t2, $t3, $t2 \n\t" // t2 = U7 | U6 | U5 | U4 |
| 109 | "precrq.qb.ph $t3, $t6, $t5 \n\t" // t3 = V11 | V10 | V9 | V8 |
| 110 | "precr.qb.ph $t5, $t6, $t5 \n\t" // t5 = U11 | U10 | U9 | U8 |
| 111 | "precrq.qb.ph $t6, $t8, $t7 \n\t" // t6 = V15 | V14 | V13 | V12 |
| 112 | "precr.qb.ph $t7, $t8, $t7 \n\t" // t7 = U15 | U14 | U13 | U12 |
| 113 | |
| 114 | "sw $t9, 0(%[dst_v]) \n\t" |
| 115 | "sw $t0, 0(%[dst_u]) \n\t" |
| 116 | "sw $t1, 4(%[dst_v]) \n\t" |
| 117 | "sw $t2, 4(%[dst_u]) \n\t" |
| 118 | "sw $t3, 8(%[dst_v]) \n\t" |
| 119 | "sw $t5, 8(%[dst_u]) \n\t" |
| 120 | "sw $t6, 12(%[dst_v]) \n\t" |
| 121 | "sw $t7, 12(%[dst_u]) \n\t" |
| 122 | "addiu %[dst_v], %[dst_v], 16 \n\t" |
| 123 | "bgtz $t4, 12b \n\t" |
| 124 | " addiu %[dst_u], %[dst_u], 16 \n\t" |
| 125 | |
| 126 | "beqz %[width], 3f \n\t" |
| 127 | " nop \n\t" |
| 128 | |
| 129 | "2: \n\t" |
| 130 | "lbu $t0, 0(%[src_uv]) \n\t" |
| 131 | "lbu $t1, 1(%[src_uv]) \n\t" |
| 132 | "addiu %[src_uv], %[src_uv], 2 \n\t" |
| 133 | "addiu %[width], %[width], -1 \n\t" |
| 134 | "sb $t0, 0(%[dst_u]) \n\t" |
| 135 | "sb $t1, 0(%[dst_v]) \n\t" |
| 136 | "addiu %[dst_u], %[dst_u], 1 \n\t" |
| 137 | "bgtz %[width], 2b \n\t" |
| 138 | " addiu %[dst_v], %[dst_v], 1 \n\t" |
| 139 | |
| 140 | "3: \n\t" |
| 141 | ".set pop \n\t" |
| 142 | : [src_uv] "+r" (src_uv), [width] "+r" (width), |
| 143 | [dst_u] "+r" (dst_u), [dst_v] "+r" (dst_v) |
| 144 | : |
| 145 | : "t0", "t1","t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" |
| 146 | ); |
| 147 | } |
| 148 | #endif // HAS_SPLITUV_MIPS_DSPR2 |
| 149 | |
| 150 | |
| 151 | #ifdef HAS_SPLITUV_MIPS_DSPR2 |
| 152 | // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v |
| 153 | // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels. |
| 154 | void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { |
| 155 | asm volatile ( |
| 156 | ".p2align 2 \n" |
| 157 | "1: \n" |
| 158 | "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV |
| 159 | "subs %3, %3, #16 \n" // 16 processed per loop |
| 160 | "vst1.u8 {q0}, [%1]! \n" // store U |
| 161 | "vst1.u8 {q1}, [%2]! \n" // Store V |
| 162 | "bgt 1b \n" |
| 163 | : "+r"(src_uv), // %0 |
| 164 | "+r"(dst_u), // %1 |
| 165 | "+r"(dst_v), // %2 |
| 166 | "+r"(width) // %3 // Output registers |
| 167 | : // Input registers |
| 168 | : "memory", "cc", "q0", "q1" // Clobber List |
| 169 | ); |
| 170 | } |
| 171 | #endif // HAS_SPLITUV_MIPS_DSPR2 |
| 172 | |
| 173 | #ifdef HAS_SPLITUV_MIPS_DSPR2 |
| 174 | // Reads 4 pairs of UV and write even values to dst_u and odd to dst_v |
| 175 | // Alignment requirement: 4 bytes for pointers, and multiple of 4 pixels. |
| 176 | void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
| 177 | int width) { |
| 178 | asm volatile ( |
| 179 | ".set push \n" |
| 180 | ".set noreorder \n" |
| 181 | ".p2align 2 \n" |
| 182 | "1: \n" |
| 183 | "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 |
| 184 | "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2 |
| 185 | "addiu %[width], %[width], -4 \n" |
| 186 | "addiu %[src_uv], %[src_uv], 8 \n" |
| 187 | "precr.qb.ph $t2, $t1, $t0 \n" // U3 | U2 | U1 | U0 |
| 188 | "precrq.qb.ph $t3, $t1, $t0 \n" // V3 | V2 | V1 | V0 |
| 189 | "sw $t2, 0(%[dst_u]) \n" |
| 190 | "sw $t3, 0(%[dst_v]) \n" |
| 191 | "addiu %[dst_u], %[dst_u], 4 \n" |
| 192 | "bgtz %[width], 1b \n" |
| 193 | " addiu %[dst_v], %[dst_v], 4 \n" |
| 194 | ".set pop \n" |
| 195 | : [src_uv] "+r" (src_uv), |
| 196 | [width] "+r" (width), |
| 197 | [dst_u] "+r" (dst_u), |
| 198 | [dst_v] "+r" (dst_v) |
| 199 | : |
| 200 | : "t0", "t1","t2", "t3", |
| 201 | ); |
| 202 | } |
| 203 | #endif // HAS_SPLITUV_MIPS_DSPR2 |
| 204 | #endif // __mips__ |
| 205 | |
| 206 | #ifdef __cplusplus |
| 207 | } // extern "C" |
| 208 | } // namespace libyuv |
| 209 | #endif |