@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_chroma_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@**
@
@**
@*******************************************************************************
@*
@* @brief
@*  Inter prediction chroma filter
@*
@* @par Description:
@*  Applies filtering to chroma samples as mentioned in
@*  sec 8.4.2.2.2 titled "chroma sample interpolation process"
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] u1_dx
@*  dx value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] u1_dy
@*  dy value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
@                             UWORD8 *pu1_dst,
@                             WORD32 src_strd,
@                             WORD32 dst_strd,
@                             UWORD8 u1_dx,
@                             UWORD8 u1_dy,
@                             WORD32 ht,
@                             WORD32 wd)
@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r4 =>  u1_dx
@   r5 =>  u1_dy
@   r6 =>  height
@   r7 =>  width
@
.text
.p2align 2

    .global ih264_inter_pred_chroma_a9q

@-------------------------------------------------------------------------------
@ ih264_inter_pred_chroma_a9q
@
@ Bilinear chroma interpolation on interleaved U/V samples (ARM + NEON).
@ Every output byte is
@
@     ( A*(8-dx)*(8-dy) + B*dx*(8-dy) + C*(8-dx)*dy + D*dx*dy + 32 ) >> 6
@
@ where A = src[x], B = src[x+2] (the next sample of the same plane, because
@ U and V alternate), and C / D are the same two positions one row below.
@
@ In:    r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd
@        stack args (in order): u1_dx, u1_dy, ht, wd
@ Out:   none
@ Saves: r4-r12, lr and d8-d15 are pushed on entry and restored on exit.
@
@ Width dispatch: wd == 2 -> loop_2, wd == 4 -> loop_4, anything else falls
@ through to loop_8. Each output row is 2*wd bytes (4, 8 or 16).
@
@ Notes derived from the code below:
@   * loop_8 / loop_4 pre-decrement the row counter and loop until it reaches
@     zero, so they need ht >= 2.
@   * loop_2 produces two rows per pass and tests for exactly zero, so it
@     needs ht to be a positive multiple of 2.
@   * Loads are wider than what the filter consumes: 24 bytes per row in
@     loop_8 (18 used), 16 in loop_4 (10 used), 8 in loop_2 (6 used).
@     The source buffer must be readable that far past each row start.
@-------------------------------------------------------------------------------
ih264_inter_pred_chroma_a9q:

    stmfd         sp!, {r4-r12, r14}    @ push 10 core registers (40 bytes)
    vstmdb        sp!, {d8-d15}         @ push 8 NEON d-registers (64 bytes)
    ldr           r4, [sp, #104]        @ r4 = u1_dx; first stack arg sits above the 40 + 64 bytes just pushed
    ldr           r5, [sp, #108]        @ r5 = u1_dy
    ldr           r6, [sp, #112]        @ r6 = ht (row counter)
    ldr           r7, [sp, #116]        @ r7 = wd

    rsb           r8, r4, #8            @ r8 = 8 - u1_dx
    rsb           r9, r5, #8            @ r9 = 8 - u1_dy
    mul           r10, r8, r9           @ (8-dx)*(8-dy): weight of top-left sample
    mul           r11, r4, r9           @ dx*(8-dy):     weight of top-right sample

    vdup.u8       d28, r10              @ d28 = top-left weight in every lane
    vdup.u8       d29, r11              @ d29 = top-right weight in every lane

    mul           r10, r8, r5           @ (8-dx)*dy: weight of bottom-left sample
    mul           r11, r4, r5           @ dx*dy:     weight of bottom-right sample

    vdup.u8       d30, r10              @ d30 = bottom-left weight in every lane
    vdup.u8       d31, r11              @ d31 = bottom-right weight in every lane

    subs          r12, r7, #2           @ wd == 2 ? -> loop_2
    beq           loop_2
    subs          r12, r7, #4           @ wd == 4 ? -> loop_4
    beq           loop_4
                                        @ otherwise fall through to loop_8

@ ---- 16 output bytes per row; software-pipelined over rows -------------------
loop_8:
    sub           r6, #1                @ the last row is finished after the loop
    vld1.8        {d0, d1, d2}, [r0], r2 @ row0: 24 bytes, then advance by src_strd
    vld1.8        {d5, d6, d7}, [r0], r2 @ row1: 24 bytes, then advance by src_strd
    vext.8        d3, d0, d1, #2        @ row0 bytes 2..9   (right neighbours, low half)
    vext.8        d8, d5, d6, #2        @ row1 bytes 2..9

    vmull.u8      q5, d0, d28           @ q5 = weighted sum for output bytes 0..7
    vmlal.u8      q5, d5, d30
    vmlal.u8      q5, d3, d29
    vmlal.u8      q5, d8, d31
    vext.8        d9, d6, d7, #2        @ row1 bytes 10..17 (right neighbours, high half)
    vext.8        d4, d1, d2, #2        @ row0 bytes 10..17

inner_loop_8:
    vmull.u8      q6, d6, d30           @ q6 = weighted sum for output bytes 8..15
    vmlal.u8      q6, d1, d28
    vmlal.u8      q6, d9, d31
    vmlal.u8      q6, d4, d29
    vmov          d0, d5                @ current bottom row becomes next top row (low half)
    vmov          d3, d8

    vqrshrun.s16  d14, q5, #6           @ round, >> 6, saturate to u8: output bytes 0..7
    vmov          d1, d6                @ ... and high half
    vmov          d4, d9

    vld1.8        {d5, d6, d7}, [r0], r2 @ load the next bottom row
    vqrshrun.s16  d15, q6, #6           @ output bytes 8..15

    vext.8        d8, d5, d6, #2        @ right neighbours of the new bottom row
    subs          r6, #1                @ sets the flags consumed by the bne below
    vext.8        d9, d6, d7, #2
    vst1.8        {q7}, [r1], r3        @ store 16 bytes (d14:d15), advance by dst_strd

    vmull.u8      q5, d0, d28           @ start the low half of the next output row
    vmlal.u8      q5, d5, d30
    vmlal.u8      q5, d3, d29
    vmlal.u8      q5, d8, d31
    bne           inner_loop_8          @ NEON ops above leave the flags untouched

    vmull.u8      q6, d6, d30           @ finish the high half of the last row
    vmlal.u8      q6, d1, d28
    vmlal.u8      q6, d9, d31
    vmlal.u8      q6, d4, d29

    vqrshrun.s16  d14, q5, #6
    vqrshrun.s16  d15, q6, #6

    vst1.8        {q7}, [r1], r3        @ store the last output row

    b             end_func

@ ---- 8 output bytes per row --------------------------------------------------
loop_4:
    sub           r6, #1                @ the last row is finished after the loop
    vld1.8        {d0, d1}, [r0], r2    @ row0: 16 bytes
    vld1.8        {d2, d3}, [r0], r2    @ row1: 16 bytes
    vext.8        d1, d0, d1, #2        @ d1 = row0 bytes 2..9 (right neighbours)
    vext.8        d3, d2, d3, #2        @ d3 = row1 bytes 2..9

    vmull.u8      q2, d2, d30           @ q2 = weighted sum for the first output row
    vmlal.u8      q2, d0, d28
    vmlal.u8      q2, d3, d31
    vmlal.u8      q2, d1, d29

inner_loop_4:
    subs          r6, #1                @ sets the flags consumed by the bne below
    vmov          d0, d2                @ current bottom row becomes next top row
    vmov          d1, d3

    vld1.8        {d2, d3}, [r0], r2    @ load the next bottom row
    vqrshrun.s16  d6, q2, #6            @ round, >> 6, saturate to u8

    vext.8        d3, d2, d3, #2        @ right neighbours of the new bottom row
    vst1.8        {d6}, [r1], r3        @ store 8 bytes, advance by dst_strd

    vmull.u8      q2, d0, d28           @ weighted sum for the next output row
    vmlal.u8      q2, d2, d30
    vmlal.u8      q2, d1, d29
    vmlal.u8      q2, d3, d31
    bne           inner_loop_4

    vqrshrun.s16  d6, q2, #6
    vst1.8        {d6}, [r1], r3        @ store the last output row

    b             end_func

@ ---- 4 output bytes per row, two rows per pass -------------------------------
loop_2:
    vld1.8        {d0}, [r0], r2        @ row0: 8 bytes
    vext.8        d1, d0, d0, #2        @ row0 bytes 2..7 (lanes 6-7 wrap; only lanes 0-3 are stored)
    vld1.8        {d2}, [r0], r2        @ row1
    vext.8        d3, d2, d2, #2
    vmull.u8      q2, d0, d28           @ weighted sum for output row0
    vmlal.u8      q2, d1, d29
    vmlal.u8      q2, d2, d30
    vmlal.u8      q2, d3, d31
    vld1.8        {d6}, [r0]            @ row2, no post-increment: it is row0 of the next pass
    vqrshrun.s16  d4, q2, #6
    vext.8        d7, d6, d6, #2
    vst1.32       d4[0], [r1], r3       @ store 4 bytes of output row0
    vmull.u8      q4, d2, d28           @ weighted sum for output row1 (row1 on top, row2 below)
    vmlal.u8      q4, d3, d29
    vmlal.u8      q4, d6, d30
    vmlal.u8      q4, d7, d31
    subs          r6, #2                @ two rows produced this pass
    vqrshrun.s16  d8, q4, #6
    vst1.32       d8[0], [r1], r3       @ store 4 bytes of output row1
    bne           loop_2                @ repeat while rows remain

end_func:
    vldmia        sp!, {d8-d15}         @ restore the saved NEON registers
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers; loading pc returns to the caller
