Harish Mahendrakar | 0d8951c | 2014-05-16 10:31:13 -0700 | [diff] [blame^] | 1 | @/***************************************************************************** |
| 2 | @* |
| 3 | @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| 4 | @* |
| 5 | @* Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | @* you may not use this file except in compliance with the License. |
| 7 | @* You may obtain a copy of the License at: |
| 8 | @* |
| 9 | @* http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | @* |
| 11 | @* Unless required by applicable law or agreed to in writing, software |
| 12 | @* distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | @* See the License for the specific language governing permissions and |
| 15 | @* limitations under the License. |
| 16 | @* |
| 17 | @*****************************************************************************/ |
| 18 | @/** |
| 19 | @******************************************************************************* |
| 20 | @* @file |
| 21 | @* ihevc_sao_band_offset_chroma.s |
| 22 | @* |
| 23 | @* @brief |
| 24 | @* Contains the function definition for the SAO band offset filter applied to |
| 25 | @* interleaved chroma (U/V) samples. The function is written in ARM NEON |
| 26 | @* assembly. |
| 27 | @* |
| 28 | @* @author |
| 29 | @* Parthiban V |
| 30 | @* |
| 31 | @* @par List of Functions: |
| 32 | @* - ihevc_sao_band_offset_chroma_a9q() |
| 33 | @* |
| 34 | @* @remarks |
| 35 | @* None |
| 36 | @* |
| 37 | @******************************************************************************* |
| 38 | @*/ |
| 39 | @void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src, |
| 40 | @ WORD32 src_strd, |
| 41 | @ UWORD8 *pu1_src_left, |
| 42 | @ UWORD8 *pu1_src_top, |
| 43 | @ UWORD8 *pu1_src_top_left, |
| 44 | @ WORD32 sao_band_pos_u, |
| 45 | @ WORD32 sao_band_pos_v, |
| 46 | @ WORD8 *pi1_sao_offset_u, |
| 47 | @ WORD8 *pi1_sao_offset_v, |
| 48 | @ WORD32 wd, |
| 49 | @ WORD32 ht) |
| 50 | @ |
| 51 | @**************Variables Vs Registers***************************************** |
| 52 | @r0 => *pu1_src |
| 53 | @r1 => src_strd |
| 54 | @r2 => *pu1_src_left |
| 55 | @r3 => *pu1_src_top |
| 56 | @r4 => *pu1_src_top_left |
| 57 | @r5 => sao_band_pos_u |
| 58 | @r6 => sao_band_pos_v |
| 59 | @r7 => *pi1_sao_offset_u |
| 60 | @r8 => *pi1_sao_offset_v |
| 61 | @r9 => wd |
| 62 | @r10=> ht |
| 63 | |
| 64 | .text |
| 65 | .p2align 2 |
| 66 | |
| 67 | .extern gu1_table_band_idx |
| 68 | .globl ihevc_sao_band_offset_chroma_a9q |
| 69 | |
| 70 | gu1_table_band_idx_addr_1: |
| 71 | .long gu1_table_band_idx - ulbl1 - 8 |
| 72 | |
| 73 | gu1_table_band_idx_addr_2: |
| 74 | .long gu1_table_band_idx - ulbl2 - 8 |
| 75 | |
| 76 | ihevc_sao_band_offset_chroma_a9q: |
| 77 | |
| 78 | STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments |
| 79 | LDR r4,[sp,#40] @Loads pu1_src_top_left |
| 80 | LDR r10,[sp,#64] @Loads ht |
| 81 | |
| 82 | LDR r9,[sp,#60] @Loads wd |
| 83 | MOV r11,r10 @Move the ht to r9 for loop counter |
| 84 | |
| 85 | ADD r12,r0,r9 @pu1_src[row * src_strd + (wd)] |
| 86 | LDR r14, gu1_table_band_idx_addr_1 |
| 87 | ulbl1: |
| 88 | add r14,r14,pc |
| 89 | SUB r12,r12,#2 @wd-2 |
| 90 | |
| 91 | SRC_LEFT_LOOP: |
| 92 | LDRH r5,[r12],r1 @Load the value |
| 93 | SUBS r11,r11,#1 @Decrement the loop counter |
| 94 | STRH r5,[r2],#2 @Store the value in pu1_src_left pointer |
| 95 | BNE SRC_LEFT_LOOP |
| 96 | |
| 97 | LDR r5,[sp,#44] @Loads sao_band_pos_u |
| 98 | VLD1.8 D1,[r14]! @band_table_u.val[0] |
| 99 | ADD r12,r3,r9 @pu1_src_top[wd] |
| 100 | |
| 101 | LDRH r11,[r12,#-2] |
| 102 | VLD1.8 D2,[r14]! @band_table_u.val[1] |
| 103 | LSL r6,r5,#3 @sao_band_pos_u |
| 104 | |
| 105 | STRH r11,[r4] @store to pu1_src_top_left[0] |
| 106 | VLD1.8 D3,[r14]! @band_table_u.val[2] |
| 107 | LDR r7,[sp,#52] @Loads pi1_sao_offset_u |
| 108 | |
| 109 | SUB r4,r10,#1 @ht-1 |
| 110 | VDUP.8 D31,r6 @band_pos_u |
| 111 | MUL r4,r4,r1 @ht-1 * src_strd |
| 112 | |
| 113 | ADD r4,r4,r0 @pu1_src[(ht - 1) * src_strd] |
| 114 | VLD1.8 D4,[r14]! @band_table_u.val[3] |
| 115 | MOV r11,r9 @Move the wd to r9 for loop counter |
| 116 | |
| 117 | SRC_TOP_LOOP: @wd is always multiple of 8 |
| 118 | VLD1.8 D0,[r4]! @Load pu1_src[(ht - 1) * src_strd + col] |
| 119 | SUBS r11,r11,#8 @Decrement the loop counter by 8 |
| 120 | VST1.8 D0,[r3]! @Store to pu1_src_top[col] |
| 121 | BNE SRC_TOP_LOOP |
| 122 | |
| 123 | VLD1.8 D30,[r7] @pi1_sao_offset_u load |
| 124 | VADD.I8 D5,D1,D31 @band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u) |
| 125 | |
| 126 | VDUP.8 D29,D30[1] @vdup_n_u8(pi1_sao_offset_u[1]) |
| 127 | VADD.I8 D6,D2,D31 @band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u) |
| 128 | |
| 129 | VDUP.8 D28,D30[2] @vdup_n_u8(pi1_sao_offset_u[2]) |
| 130 | VADD.I8 D7,D3,D31 @band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u) |
| 131 | |
| 132 | VDUP.8 D27,D30[3] @vdup_n_u8(pi1_sao_offset_u[3]) |
| 133 | VADD.I8 D8,D4,D31 @band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u) |
| 134 | |
| 135 | CMP r5,#28 |
| 136 | VDUP.8 D26,D30[4] @vdup_n_u8(pi1_sao_offset_u[4]) |
| 137 | LDR r14, gu1_table_band_idx_addr_2 |
| 138 | ulbl2: |
| 139 | add r14,r14,pc |
| 140 | |
| 141 | VMOV.I8 D30,#16 @vdup_n_u8(16) |
| 142 | VADD.I8 D1,D5,D29 @band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1])) |
| 143 | |
| 144 | VLD1.8 D9,[r14]! @band_table_v.val[0] |
| 145 | VADD.I8 D2,D6,D28 @band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2])) |
| 146 | |
| 147 | VLD1.8 D10,[r14]! @band_table_v.val[1] |
| 148 | VADD.I8 D3,D7,D27 @band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3])) |
| 149 | |
| 150 | LDR r6,[sp,#48] @Loads sao_band_pos_v |
| 151 | VADD.I8 D4,D8,D26 @band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4])) |
| 152 | LSL r11,r6,#3 @sao_band_pos_v |
| 153 | |
| 154 | BLT SAO_BAND_POS_U_0 |
| 155 | |
| 156 | SAO_BAND_POS_U_28: @case 28 |
| 157 | VCLE.U8 D13,D4,D30 @vcle_u8(band_table.val[3], vdup_n_u8(16)) |
| 158 | BNE SAO_BAND_POS_U_29 |
| 159 | |
| 160 | VORR.U8 D4,D4,D13 @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp) |
| 161 | B SWITCH_BREAK_U |
| 162 | |
| 163 | SAO_BAND_POS_U_29: @case 29 |
| 164 | CMP r5,#29 |
| 165 | |
| 166 | VCLE.U8 D14,D3,D30 @vcle_u8(band_table.val[2], vdup_n_u8(16)) |
| 167 | BNE SAO_BAND_POS_U_30 |
| 168 | VORR.U8 D3,D3,D14 @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp) |
| 169 | |
| 170 | VAND.U8 D4,D4,D13 @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp) |
| 171 | B SWITCH_BREAK_U |
| 172 | |
| 173 | SAO_BAND_POS_U_30: @case 30 |
| 174 | CMP r5,#30 |
| 175 | |
| 176 | VCLE.U8 D15,D2,D30 @vcle_u8(band_table.val[1], vdup_n_u8(16)) |
| 177 | BNE SAO_BAND_POS_U_31 |
| 178 | VORR.U8 D2,D2,D15 @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp) |
| 179 | |
| 180 | VAND.U8 D3,D3,D14 @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp) |
| 181 | |
| 182 | SAO_BAND_POS_U_31: @case 31 |
| 183 | CMP r5,#31 |
| 184 | BNE SWITCH_BREAK_U |
| 185 | |
| 186 | VCLE.U8 D16,D1,D30 @vcle_u8(band_table.val[0], vdup_n_u8(16)) |
| 187 | VORR.U8 D1,D1,D16 @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp) |
| 188 | |
| 189 | VAND.U8 D2,D2,D15 @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp) |
| 190 | B SWITCH_BREAK_U |
| 191 | |
| 192 | SAO_BAND_POS_U_0: |
| 193 | CMP r5,#0 @case 0 |
| 194 | BNE SWITCH_BREAK_U |
| 195 | |
| 196 | VCLE.U8 D16,D1,D30 @vcle_u8(band_table.val[0], vdup_n_u8(16)) |
| 197 | VAND.U8 D1,D1,D16 @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp) |
| 198 | |
| 199 | SWITCH_BREAK_U: |
| 200 | VDUP.8 D30,r11 @band_pos_v |
| 201 | LDR r8,[sp,#56] @Loads pi1_sao_offset_v |
| 202 | |
| 203 | VLD1.8 D11,[r14]! @band_table_v.val[2] |
| 204 | VADD.I8 D13,D9,D30 @band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v) |
| 205 | |
| 206 | VLD1.8 D12,[r14]! @band_table_v.val[3] |
| 207 | VADD.I8 D14,D10,D30 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v) |
| 208 | |
| 209 | VLD1.8 D25,[r8] @pi1_sao_offset_v load |
| 210 | VADD.I8 D15,D11,D30 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v) |
| 211 | |
| 212 | VDUP.8 D29,D25[1] @vdup_n_u8(pi1_sao_offset_v[1]) |
| 213 | VADD.I8 D16,D12,D30 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v) |
| 214 | |
| 215 | VDUP.8 D28,D25[2] @vdup_n_u8(pi1_sao_offset_v[2]) |
| 216 | VADD.I8 D9,D13,D29 @band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1])) |
| 217 | |
| 218 | VDUP.8 D27,D25[3] @vdup_n_u8(pi1_sao_offset_v[3]) |
| 219 | VADD.I8 D10,D14,D28 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2])) |
| 220 | |
| 221 | VDUP.8 D26,D25[4] @vdup_n_u8(pi1_sao_offset_v[4]) |
| 222 | VADD.I8 D11,D15,D27 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3])) |
| 223 | |
| 224 | VMOV.I8 D29,#16 @vdup_n_u8(16) |
| 225 | VADD.I8 D12,D16,D26 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4])) |
| 226 | AND r12,r9,#0xf |
| 227 | |
| 228 | CMP r6,#28 |
| 229 | BLT SAO_BAND_POS_V_0 |
| 230 | |
| 231 | SAO_BAND_POS_V_28: @case 28 |
| 232 | VCLE.U8 D17,D12,D29 @vcle_u8(band_table.val[3], vdup_n_u8(16)) |
| 233 | BNE SAO_BAND_POS_V_29 |
| 234 | VORR.U8 D12,D12,D17 @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp) |
| 235 | B SWITCH_BREAK_V |
| 236 | |
| 237 | SAO_BAND_POS_V_29: @case 29 |
| 238 | CMP r6,#29 |
| 239 | |
| 240 | VCLE.U8 D18,D11,D29 @vcle_u8(band_table.val[2], vdup_n_u8(16)) |
| 241 | BNE SAO_BAND_POS_V_30 |
| 242 | VORR.U8 D11,D11,D18 @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp) |
| 243 | |
| 244 | VAND.U8 D12,D12,D17 @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp) |
| 245 | B SWITCH_BREAK_V |
| 246 | |
| 247 | SAO_BAND_POS_V_30: @case 30 |
| 248 | CMP r6,#30 |
| 249 | |
| 250 | VCLE.U8 D19,D10,D29 @vcle_u8(band_table.val[1], vdup_n_u8(16)) |
| 251 | BNE SAO_BAND_POS_V_31 |
| 252 | VORR.U8 D10,D10,D19 @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp) |
| 253 | |
| 254 | VAND.U8 D11,D11,D18 @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp) |
| 255 | B SWITCH_BREAK_V |
| 256 | |
| 257 | SAO_BAND_POS_V_31: @case 31 |
| 258 | CMP r6,#31 |
| 259 | BNE SWITCH_BREAK_V |
| 260 | |
| 261 | VCLE.U8 D20,D9,D29 @vcle_u8(band_table.val[0], vdup_n_u8(16)) |
| 262 | VORR.U8 D9,D9,D20 @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp) |
| 263 | |
| 264 | VAND.U8 D10,D10,D19 @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp) |
| 265 | B SWITCH_BREAK_V |
| 266 | |
| 267 | SAO_BAND_POS_V_0: |
| 268 | CMP r6,#0 @case 0 |
| 269 | BNE SWITCH_BREAK_V |
| 270 | |
| 271 | VCLE.U8 D20,D9,D29 @vcle_u8(band_table.val[0], vdup_n_u8(16)) |
| 272 | VAND.U8 D9,D9,D20 @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp) |
| 273 | |
| 274 | SWITCH_BREAK_V: |
| 275 | CMP r9,#16 |
| 276 | MOV r4,r0 @pu1_src_cpy |
| 277 | BLT WIDTH_RESIDUE |
| 278 | |
| 279 | WIDTH_LOOP: @Width is assigned to be multiple of 16 |
| 280 | MOV r4,r0 @pu1_src_cpy |
| 281 | MOV r11,r10 @move ht |
| 282 | ADD r5,r4,r1 |
| 283 | |
| 284 | HEIGHT_LOOP: @unrolled for 4 rows |
| 285 | ADD r6,r5,r1 |
| 286 | VLD2.8 {D5,D6},[r4] @vld1q_u8(pu1_src_cpy) |
| 287 | ADD r7,r6,r1 |
| 288 | |
| 289 | VLD2.8 {D13,D14},[r5] @vld1q_u8(pu1_src_cpy) |
| 290 | VSUB.I8 D7,D5,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| 291 | |
| 292 | VLD2.8 {D17,D18},[r6] @vld1q_u8(pu1_src_cpy) |
| 293 | VSUB.I8 D8,D6,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| 294 | |
| 295 | VLD2.8 {D21,D22},[r7] @vld1q_u8(pu1_src_cpy) |
| 296 | VSUB.I8 D15,D13,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| 297 | |
| 298 | VTBX.8 D5,{D1-D4},D7 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| 299 | VSUB.I8 D16,D14,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| 300 | |
| 301 | VTBX.8 D6,{D9-D12},D8 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| 302 | VSUB.I8 D19,D17,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| 303 | |
| 304 | VTBX.8 D13,{D1-D4},D15 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| 305 | VSUB.I8 D20,D18,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| 306 | |
| 307 | VTBX.8 D14,{D9-D12},D16 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| 308 | VSUB.I8 D23,D21,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| 309 | |
| 310 | VST2.8 {D5,D6},[r4] @vst1q_u8(pu1_src_cpy, au1_cur_row) |
| 311 | VSUB.I8 D24,D22,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| 312 | |
| 313 | SUBS r11,r11,#4 @Decrement the ht loop count by 4 |
| 314 | VTBX.8 D17,{D1-D4},D19 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| 315 | |
| 316 | VST2.8 {D13,D14},[r5] @vst1q_u8(pu1_src_cpy, au1_cur_row) |
| 317 | |
| 318 | VTBX.8 D18,{D9-D12},D20 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| 319 | VTBX.8 D21,{D1-D4},D23 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| 320 | VTBX.8 D22,{D9-D12},D24 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| 321 | |
| 322 | VST2.8 {D17,D18},[r6],r1 @vst1q_u8(pu1_src_cpy, au1_cur_row) |
| 323 | |
| 324 | ADD r4,r6,r1 |
| 325 | VST2.8 {D21,D22},[r7] @vst1q_u8(pu1_src_cpy, au1_cur_row) |
| 326 | ADD r5,r4,r1 |
| 327 | |
| 328 | BNE HEIGHT_LOOP |
| 329 | |
| 330 | SUB r9,r9,#16 @Decrement the width loop by 16 |
| 331 | ADD r0,r0,#16 |
| 332 | CMP r9,#8 |
| 333 | BGT WIDTH_LOOP |
| 334 | BLT END_LOOP |
| 335 | MOV r4,r0 @pu1_src_cpy |
| 336 | |
| 337 | WIDTH_RESIDUE: @If width is not multiple of 16 |
| 338 | ADD r5,r4,r1 |
| 339 | VLD2.8 {D5,D6},[r4] @vld1q_u8(pu1_src_cpy) |
| 340 | ADD r6,r5,r1 |
| 341 | |
| 342 | ADD r7,r6,r1 |
| 343 | VLD2.8 {D13,D14},[r5] @vld1q_u8(pu1_src_cpy) |
| 344 | VSUB.I8 D7,D5,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| 345 | |
| 346 | VLD2.8 {D17,D18},[r6] @vld1q_u8(pu1_src_cpy) |
| 347 | VSUB.I8 D8,D6,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| 348 | |
| 349 | VTBX.8 D5,{D1-D4},D7 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| 350 | VSUB.I8 D15,D13,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| 351 | |
| 352 | VTBX.8 D6,{D9-D12},D8 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| 353 | VSUB.I8 D16,D14,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| 354 | |
| 355 | VLD2.8 {D21,D22},[r7] @vld1q_u8(pu1_src_cpy) |
| 356 | VSUB.I8 D19,D17,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| 357 | |
| 358 | VTBX.8 D13,{D1-D4},D15 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| 359 | VSUB.I8 D20,D18,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| 360 | |
| 361 | VTBX.8 D14,{D9-D12},D16 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| 362 | VZIP.8 D5,D6 |
| 363 | |
| 364 | VTBX.8 D17,{D1-D4},D19 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| 365 | VSUB.I8 D23,D21,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| 366 | |
| 367 | VST1.8 {D5},[r4] @vst1q_u8(pu1_src_cpy, au1_cur_row) |
| 368 | VZIP.8 D13,D14 |
| 369 | |
| 370 | VTBX.8 D18,{D9-D12},D20 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| 371 | VSUB.I8 D24,D22,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| 372 | |
| 373 | VST1.8 {D13},[r5] @vst1q_u8(pu1_src_cpy, au1_cur_row) |
| 374 | SUBS r10,r10,#4 @Decrement the ht loop count by 4 |
| 375 | |
| 376 | VTBX.8 D21,{D1-D4},D23 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| 377 | VZIP.8 D17,D18 |
| 378 | |
| 379 | VTBX.8 D22,{D9-D12},D24 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| 380 | VST1.8 {D17},[r6],r1 @vst1q_u8(pu1_src_cpy, au1_cur_row) |
| 381 | VZIP.8 D21,D22 |
| 382 | |
| 383 | ADD r4,r6,r1 |
| 384 | VST1.8 {D21},[r7] @vst1q_u8(pu1_src_cpy, au1_cur_row) |
| 385 | ADD r5,r4,r1 |
| 386 | |
| 387 | BNE WIDTH_RESIDUE |
| 388 | |
| 389 | END_LOOP: |
| 390 | LDMFD sp!,{r4-r12,r15} @Reload the registers from SP |
| 391 | |
| 392 | |
| 393 | |