@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_band_offset_chroma.s
@*
@* @brief
@*  Contains the function definition for SAO (sample adaptive offset) band
@*  offset filtering of interleaved chroma (U/V) samples.
@*  The function is hand-written ARM NEON assembly in GNU assembler syntax.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_band_offset_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
@                                  WORD32 src_strd,
@                                  UWORD8 *pu1_src_left,
@                                  UWORD8 *pu1_src_top,
@                                  UWORD8 *pu1_src_top_left,
@                                  WORD32 sao_band_pos_u,
@                                  WORD32 sao_band_pos_v,
@                                  WORD8 *pi1_sao_offset_u,
@                                  WORD8 *pi1_sao_offset_v,
@                                  WORD32 wd,
@                                  WORD32 ht)
@
@**************Variables Vs Registers*****************************************
@ r0-r3 arrive in registers; the remaining arguments are loaded from the stack.
@ Several of these registers are reused as scratch later in the function.
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  sao_band_pos_u
@r6 =>  sao_band_pos_v
@r7 =>  *pi1_sao_offset_u
@r8 =>  *pi1_sao_offset_v
@r9 =>  wd
@r10=>  ht

.text
.p2align 2

.extern gu1_table_band_idx
.globl ihevc_sao_band_offset_chroma_a9q

@-----------------------------------------------------------------------------
@ Offsets of the stack-passed arguments relative to sp after the prologue.
@ The prologue pushes {r4-r12, r14} (10 * 4 = 40 bytes) and {d8-d15}
@ (8 * 8 = 64 bytes), so the first stack argument sits at sp + 104.
@-----------------------------------------------------------------------------
.equ    PU1_SRC_TOP_LEFT_OFFSET,    104
.equ    SAO_BAND_POS_U_OFFSET,      108
.equ    SAO_BAND_POS_V_OFFSET,      112
.equ    SAO_OFFSET_U_OFFSET,        116
.equ    SAO_OFFSET_V_OFFSET,        120
.equ    WD_OFFSET,                  124
.equ    HT_OFFSET,                  128

@ PC-relative displacements to gu1_table_band_idx. Each word is added to the
@ pc value read at the matching ulblN label; the "- 8" cancels the ARM-state
@ pc read-ahead so the sum is the absolute address of the table.
gu1_table_band_idx_addr_1:
.long gu1_table_band_idx - ulbl1 - 8

gu1_table_band_idx_addr_2:
.long gu1_table_band_idx - ulbl2 - 8

@-----------------------------------------------------------------------------
@ void ihevc_sao_band_offset_chroma_a9q(UWORD8 *pu1_src, WORD32 src_strd,
@         UWORD8 *pu1_src_left, UWORD8 *pu1_src_top, UWORD8 *pu1_src_top_left,
@         WORD32 sao_band_pos_u, WORD32 sao_band_pos_v,
@         WORD8 *pi1_sao_offset_u, WORD8 *pi1_sao_offset_v,
@         WORD32 wd, WORD32 ht)
@
@ Steps:
@   1. Copies the last U,V pair of every row into pu1_src_left.
@   2. Copies pu1_src_top[wd-2 .. wd-1] into pu1_src_top_left[0 .. 1].
@   3. Copies the last row of the block into pu1_src_top.
@   4. Builds one 32-entry lookup table per plane from gu1_table_band_idx,
@      the band position (<< 3) and offsets [1..4] of that plane.
@   5. Remaps every U and V sample of the block in place through its table
@      (VTBX: samples whose index falls outside the table are left unchanged).
@
@ In:    r0 = pu1_src, r1 = src_strd, r2 = pu1_src_left, r3 = pu1_src_top,
@        remaining arguments on the stack (see the *_OFFSET constants).
@ Out:   nothing returned; pu1_src, pu1_src_left, pu1_src_top and
@        pu1_src_top_left are written.
@ Saves: r4-r12, lr and d8-d15 are preserved (d8-d15 are callee-saved under
@        AAPCS and are used below for the V table and scratch).
@ Clobb: r0-r3, d0-d7, d16-d31, flags.
@ Assumes: wd is a non-zero multiple of 8 and ht a non-zero multiple of 4
@        (the copy and filter loops count down by 8 / 4 and exit on zero);
@        ARM (not Thumb) state for the pc-relative table address.
@-----------------------------------------------------------------------------
ihevc_sao_band_offset_chroma_a9q:

    STMFD       sp!, {r4-r12, r14}          @ save core callee-saved registers and lr
    VPUSH       {d8-d15}                    @ save NEON callee-saved registers used below
    LDR         r4,[sp,#PU1_SRC_TOP_LEFT_OFFSET]    @ r4 = pu1_src_top_left
    LDR         r10,[sp,#HT_OFFSET]         @ r10 = ht

    LDR         r9,[sp,#WD_OFFSET]          @ r9 = wd
    MOV         r11,r10                     @ r11 = ht, row counter for SRC_LEFT_LOOP

    ADD         r12,r0,r9                   @ r12 = pu1_src + wd
    LDR         r14, gu1_table_band_idx_addr_1
ulbl1:
    ADD         r14,r14,pc                  @ r14 = &gu1_table_band_idx[0]
    SUB         r12,r12,#2                  @ r12 = &pu1_src[wd - 2]

SRC_LEFT_LOOP:
    LDRH        r5,[r12],r1                 @ last U,V pair of this row; advance one row
    SUBS        r11,r11,#1                  @ one row done
    STRH        r5,[r2],#2                  @ pu1_src_left[2*row], pu1_src_left[2*row + 1]
    BNE         SRC_LEFT_LOOP

    LDR         r5,[sp,#SAO_BAND_POS_U_OFFSET]      @ r5 = sao_band_pos_u
    VLD1.8      D1,[r14]!                   @ band_table_u.val[0]
    ADD         r12,r3,r9                   @ r12 = &pu1_src_top[wd]

    LDRH        r11,[r12,#-2]               @ r11 = pu1_src_top[wd-2], pu1_src_top[wd-1]
    VLD1.8      D2,[r14]!                   @ band_table_u.val[1]
    LSL         r6,r5,#3                    @ r6 = sao_band_pos_u << 3

    STRH        r11,[r4]                    @ pu1_src_top_left[0..1] = old top-right pair
    VLD1.8      D3,[r14]!                   @ band_table_u.val[2]
    LDR         r7,[sp,#SAO_OFFSET_U_OFFSET]        @ r7 = pi1_sao_offset_u

    SUB         r4,r10,#1                   @ ht - 1
    VDUP.8      D31,r6                      @ D31 = band_pos_u (sao_band_pos_u << 3 in every lane)
    MUL         r4,r4,r1                    @ (ht - 1) * src_strd

    ADD         r4,r4,r0                    @ r4 = &pu1_src[(ht - 1) * src_strd]
    VLD1.8      D4,[r14]!                   @ band_table_u.val[3]
    MOV         r11,r9                      @ r11 = wd, column counter for SRC_TOP_LOOP

SRC_TOP_LOOP:                               @ copies 8 bytes per pass; wd must be a multiple of 8
    VLD1.8      D0,[r4]!                    @ pu1_src[(ht - 1) * src_strd + col .. col+7]
    SUBS        r11,r11,#8
    VST1.8      D0,[r3]!                    @ pu1_src_top[col .. col+7]
    BNE         SRC_TOP_LOOP

    @ ---- build the U lookup table in D1-D4 -------------------------------
    VLD1.8      D30,[r7]                    @ 8 bytes of pi1_sao_offset_u (lanes 1..4 are used)
    VADD.I8     D5,D1,D31                   @ band_table_u.val[0] + band_pos_u

    VDUP.8      D29,D30[1]                  @ pi1_sao_offset_u[1]
    VADD.I8     D6,D2,D31                   @ band_table_u.val[1] + band_pos_u

    VDUP.8      D28,D30[2]                  @ pi1_sao_offset_u[2]
    VADD.I8     D7,D3,D31                   @ band_table_u.val[2] + band_pos_u

    VDUP.8      D27,D30[3]                  @ pi1_sao_offset_u[3]
    VADD.I8     D8,D4,D31                   @ band_table_u.val[3] + band_pos_u

    CMP         r5,#28                      @ flags stay live until BLT / BNE below
    VDUP.8      D26,D30[4]                  @ pi1_sao_offset_u[4]
    LDR         r14, gu1_table_band_idx_addr_2
ulbl2:
    ADD         r14,r14,pc                  @ r14 = &gu1_table_band_idx[0] again, for the V table

    VMOV.I8     D30,#16                     @ threshold 16 in every lane
    VADD.I8     D1,D5,D29                   @ band_table_u.val[0] += pi1_sao_offset_u[1]

    VLD1.8      D9,[r14]!                   @ band_table_v.val[0]
    VADD.I8     D2,D6,D28                   @ band_table_u.val[1] += pi1_sao_offset_u[2]

    VLD1.8      D10,[r14]!                  @ band_table_v.val[1]
    VADD.I8     D3,D7,D27                   @ band_table_u.val[2] += pi1_sao_offset_u[3]

    LDR         r6,[sp,#SAO_BAND_POS_V_OFFSET]      @ r6 = sao_band_pos_v
    VADD.I8     D4,D8,D26                   @ band_table_u.val[3] += pi1_sao_offset_u[4]
    LSL         r11,r6,#3                   @ r11 = sao_band_pos_v << 3

    BLT         SAO_BAND_POS_U_0            @ sao_band_pos_u < 28

    @ ---- switch (sao_band_pos_u): fix-ups for positions 28..31 and 0 -----
    @ Each VCLE yields 0xFF in lanes whose table entry is <= 16, else 0x00.
    @ VORR with that mask forces those entries to 0xFF; VAND keeps only
    @ those entries and zeroes the rest.
    @ NOTE(review): presumably this repairs the 8-bit wrap-around of the
    @ additions above at the ends of the band range -- confirm against the
    @ C reference.
SAO_BAND_POS_U_28:                          @ case 28
    VCLE.U8     D13,D4,D30                  @ mask = (band_table_u.val[3] <= 16)
    BNE         SAO_BAND_POS_U_29

    VORR.U8     D4,D4,D13                   @ band_table_u.val[3] |= mask
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_29:                          @ case 29
    CMP         r5,#29

    VCLE.U8     D14,D3,D30                  @ mask = (band_table_u.val[2] <= 16)
    BNE         SAO_BAND_POS_U_30
    VORR.U8     D3,D3,D14                   @ band_table_u.val[2] |= mask

    VAND.U8     D4,D4,D13                   @ band_table_u.val[3] &= mask computed in case 28
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_30:                          @ case 30
    CMP         r5,#30

    VCLE.U8     D15,D2,D30                  @ mask = (band_table_u.val[1] <= 16)
    BNE         SAO_BAND_POS_U_31
    VORR.U8     D2,D2,D15                   @ band_table_u.val[1] |= mask

    VAND.U8     D3,D3,D14                   @ band_table_u.val[2] &= mask computed in case 29
    B           SWITCH_BREAK_U              @ explicit break, mirroring the V switch

SAO_BAND_POS_U_31:                          @ case 31
    CMP         r5,#31
    BNE         SWITCH_BREAK_U

    VCLE.U8     D16,D1,D30                  @ mask = (band_table_u.val[0] <= 16)
    VORR.U8     D1,D1,D16                   @ band_table_u.val[0] |= mask

    VAND.U8     D2,D2,D15                   @ band_table_u.val[1] &= mask computed in case 30
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_0:
    CMP         r5,#0                       @ case 0
    BNE         SWITCH_BREAK_U

    VCLE.U8     D16,D1,D30                  @ mask = (band_table_u.val[0] <= 16)
    VAND.U8     D1,D1,D16                   @ band_table_u.val[0] &= mask

SWITCH_BREAK_U:
    @ ---- build the V lookup table in D9-D12 ------------------------------
    VDUP.8      D30,r11                     @ D30 = band_pos_v (sao_band_pos_v << 3 in every lane)
    LDR         r8,[sp,#SAO_OFFSET_V_OFFSET]        @ r8 = pi1_sao_offset_v

    VLD1.8      D11,[r14]!                  @ band_table_v.val[2]
    VADD.I8     D13,D9,D30                  @ band_table_v.val[0] + band_pos_v

    VLD1.8      D12,[r14]!                  @ band_table_v.val[3]
    VADD.I8     D14,D10,D30                 @ band_table_v.val[1] + band_pos_v

    VLD1.8      D25,[r8]                    @ 8 bytes of pi1_sao_offset_v (lanes 1..4 are used)
    VADD.I8     D15,D11,D30                 @ band_table_v.val[2] + band_pos_v

    VDUP.8      D29,D25[1]                  @ pi1_sao_offset_v[1]
    VADD.I8     D16,D12,D30                 @ band_table_v.val[3] + band_pos_v

    VDUP.8      D28,D25[2]                  @ pi1_sao_offset_v[2]
    VADD.I8     D9,D13,D29                  @ band_table_v.val[0] += pi1_sao_offset_v[1]

    VDUP.8      D27,D25[3]                  @ pi1_sao_offset_v[3]
    VADD.I8     D10,D14,D28                 @ band_table_v.val[1] += pi1_sao_offset_v[2]

    VDUP.8      D26,D25[4]                  @ pi1_sao_offset_v[4]
    VADD.I8     D11,D15,D27                 @ band_table_v.val[2] += pi1_sao_offset_v[3]

    VMOV.I8     D29,#16                     @ threshold 16 in every lane
    VADD.I8     D12,D16,D26                 @ band_table_v.val[3] += pi1_sao_offset_v[4]

    CMP         r6,#28
    BLT         SAO_BAND_POS_V_0            @ sao_band_pos_v < 28

    @ ---- switch (sao_band_pos_v): same fix-ups as for U ------------------
SAO_BAND_POS_V_28:                          @ case 28
    VCLE.U8     D17,D12,D29                 @ mask = (band_table_v.val[3] <= 16)
    BNE         SAO_BAND_POS_V_29
    VORR.U8     D12,D12,D17                 @ band_table_v.val[3] |= mask
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_29:                          @ case 29
    CMP         r6,#29

    VCLE.U8     D18,D11,D29                 @ mask = (band_table_v.val[2] <= 16)
    BNE         SAO_BAND_POS_V_30
    VORR.U8     D11,D11,D18                 @ band_table_v.val[2] |= mask

    VAND.U8     D12,D12,D17                 @ band_table_v.val[3] &= mask computed in case 28
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_30:                          @ case 30
    CMP         r6,#30

    VCLE.U8     D19,D10,D29                 @ mask = (band_table_v.val[1] <= 16)
    BNE         SAO_BAND_POS_V_31
    VORR.U8     D10,D10,D19                 @ band_table_v.val[1] |= mask

    VAND.U8     D11,D11,D18                 @ band_table_v.val[2] &= mask computed in case 29
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_31:                          @ case 31
    CMP         r6,#31
    BNE         SWITCH_BREAK_V

    VCLE.U8     D20,D9,D29                  @ mask = (band_table_v.val[0] <= 16)
    VORR.U8     D9,D9,D20                   @ band_table_v.val[0] |= mask

    VAND.U8     D10,D10,D19                 @ band_table_v.val[1] &= mask computed in case 30
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_0:
    CMP         r6,#0                       @ case 0
    BNE         SWITCH_BREAK_V

    VCLE.U8     D20,D9,D29                  @ mask = (band_table_v.val[0] <= 16)
    VAND.U8     D9,D9,D20                   @ band_table_v.val[0] &= mask

SWITCH_BREAK_V:
    CMP         r9,#16
    MOV         r4,r0                       @ r4 = pu1_src_cpy (row 0 pointer)
    BLT         WIDTH_RESIDUE               @ wd < 16: only the 8-byte residue column

    @ ---- main filter: 16-byte columns (8 U,V pairs), 4 rows per pass -----
    @ r4..r7 point at rows 0..3 of the current group.  VLD2 de-interleaves
    @ each row into a U register and a V register; (sample - band_pos) is the
    @ index into the 32-byte table, and VTBX leaves the sample untouched when
    @ that index is outside the table.
WIDTH_LOOP:
    MOV         r4,r0                       @ r4 = top of the current column
    MOV         r11,r10                     @ r11 = ht, row counter
    ADD         r5,r4,r1                    @ r5 = row 1

HEIGHT_LOOP:
    ADD         r6,r5,r1                    @ r6 = row 2
    VLD2.8      {D5,D6},[r4]                @ row 0: D5 = U, D6 = V
    ADD         r7,r6,r1                    @ r7 = row 3

    VLD2.8      {D13,D14},[r5]              @ row 1: D13 = U, D14 = V
    VSUB.I8     D7,D5,D31                   @ row 0 U index = U - band_pos_u

    VLD2.8      {D17,D18},[r6]              @ row 2: D17 = U, D18 = V
    VSUB.I8     D8,D6,D30                   @ row 0 V index = V - band_pos_v

    VLD2.8      {D21,D22},[r7]              @ row 3: D21 = U, D22 = V
    VSUB.I8     D15,D13,D31                 @ row 1 U index

    VTBX.8      D5,{D1-D4},D7               @ row 0 U = band_table_u[index] where in range
    VSUB.I8     D16,D14,D30                 @ row 1 V index

    VTBX.8      D6,{D9-D12},D8              @ row 0 V = band_table_v[index] where in range
    VSUB.I8     D19,D17,D31                 @ row 2 U index

    VTBX.8      D13,{D1-D4},D15             @ row 1 U
    VSUB.I8     D20,D18,D30                 @ row 2 V index

    VTBX.8      D14,{D9-D12},D16            @ row 1 V
    VSUB.I8     D23,D21,D31                 @ row 3 U index

    VST2.8      {D5,D6},[r4]                @ write row 0 back, re-interleaved
    VSUB.I8     D24,D22,D30                 @ row 3 V index

    SUBS        r11,r11,#4                  @ four rows done; flags consumed by BNE below
    VTBX.8      D17,{D1-D4},D19             @ row 2 U

    VST2.8      {D13,D14},[r5]              @ write row 1 back

    VTBX.8      D18,{D9-D12},D20            @ row 2 V
    VTBX.8      D21,{D1-D4},D23             @ row 3 U
    VTBX.8      D22,{D9-D12},D24            @ row 3 V

    VST2.8      {D17,D18},[r6],r1           @ write row 2 back; r6 now points at row 3

    ADD         r4,r6,r1                    @ r4 = row 0 of the next group
    VST2.8      {D21,D22},[r7]              @ write row 3 back
    ADD         r5,r4,r1                    @ r5 = row 1 of the next group

    BNE         HEIGHT_LOOP

    SUB         r9,r9,#16                   @ 16 bytes of width done
    ADD         r0,r0,#16                   @ next column
    CMP         r9,#8
    BGT         WIDTH_LOOP                  @ at least 16 bytes left
    BLT         END_LOOP                    @ nothing left
    MOV         r4,r0                       @ exactly 8 bytes left: r4 = pu1_src_cpy

    @ ---- residue: last 8-byte column (4 U,V pairs), 4 rows per pass ------
    @ NOTE(review): each VLD2 below still reads 16 bytes per row although only
    @ the first 8 (after VZIP) are stored, i.e. it reads 8 bytes past the
    @ block width; this relies on the source buffer being readable there.
WIDTH_RESIDUE:
    ADD         r5,r4,r1                    @ r5 = row 1
    VLD2.8      {D5,D6},[r4]                @ row 0: D5 = U, D6 = V
    ADD         r6,r5,r1                    @ r6 = row 2

    ADD         r7,r6,r1                    @ r7 = row 3
    VLD2.8      {D13,D14},[r5]              @ row 1: D13 = U, D14 = V
    VSUB.I8     D7,D5,D31                   @ row 0 U index = U - band_pos_u

    VLD2.8      {D17,D18},[r6]              @ row 2: D17 = U, D18 = V
    VSUB.I8     D8,D6,D30                   @ row 0 V index = V - band_pos_v

    VTBX.8      D5,{D1-D4},D7               @ row 0 U = band_table_u[index] where in range
    VSUB.I8     D15,D13,D31                 @ row 1 U index

    VTBX.8      D6,{D9-D12},D8              @ row 0 V = band_table_v[index] where in range
    VSUB.I8     D16,D14,D30                 @ row 1 V index

    VLD2.8      {D21,D22},[r7]              @ row 3: D21 = U, D22 = V
    VSUB.I8     D19,D17,D31                 @ row 2 U index

    VTBX.8      D13,{D1-D4},D15             @ row 1 U
    VSUB.I8     D20,D18,D30                 @ row 2 V index

    VTBX.8      D14,{D9-D12},D16            @ row 1 V
    VZIP.8      D5,D6                       @ row 0: re-interleave, D5 = first 4 U,V pairs

    VTBX.8      D17,{D1-D4},D19             @ row 2 U
    VSUB.I8     D23,D21,D31                 @ row 3 U index

    VST1.8      {D5},[r4]                   @ write 8 bytes of row 0 back
    VZIP.8      D13,D14                     @ row 1: re-interleave

    VTBX.8      D18,{D9-D12},D20            @ row 2 V
    VSUB.I8     D24,D22,D30                 @ row 3 V index

    VST1.8      {D13},[r5]                  @ write 8 bytes of row 1 back
    SUBS        r10,r10,#4                  @ four rows done; flags consumed by BNE below

    VTBX.8      D21,{D1-D4},D23             @ row 3 U
    VZIP.8      D17,D18                     @ row 2: re-interleave

    VTBX.8      D22,{D9-D12},D24            @ row 3 V
    VST1.8      {D17},[r6],r1               @ write 8 bytes of row 2 back; r6 now points at row 3
    VZIP.8      D21,D22                     @ row 3: re-interleave

    ADD         r4,r6,r1                    @ r4 = row 0 of the next group
    VST1.8      {D21},[r7]                  @ write 8 bytes of row 3 back
    ADD         r5,r4,r1                    @ r5 = row 1 of the next group

    BNE         WIDTH_RESIDUE

END_LOOP:
    VPOP        {d8-d15}                    @ restore NEON callee-saved registers
    LDMFD       sp!,{r4-r12,r15}            @ restore core registers and return (pc <- saved lr)


393