blob: 4014c4f0619fa18cb9841e8d54e35eb298c47246 [file] [log] [blame]
Hamsalekha S8d3d3032015-03-13 21:24:58 +05301//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
Hamsalekha S8d3d3032015-03-13 21:24:58 +053020
21///**
22//******************************************************************************
23//*
//* @brief : Evaluate the best intra chroma mode (among VERT, HORZ and DC)
//* and do the prediction.
26//*
27//* @par Description
//* This function evaluates the first three intra chroma modes, computes the
//* corresponding SADs and returns the buffer predicted with the best mode.
30//*
31//* @param[in] pu1_src
32//* UWORD8 pointer to the source
33//*
//* @param[in] pu1_ngbr_pels
35//* UWORD8 pointer to neighbouring pels
36//*
37//* @param[out] pu1_dst
38//* UWORD8 pointer to the destination
39//*
40//* @param[in] src_strd
41//* integer source stride
42//*
43//* @param[in] dst_strd
44//* integer destination stride
45//*
46//* @param[in] u4_n_avblty
47//* availability of neighbouring pixels
48//*
//* @param[out] u4_intra_mode
//* Pointer to the variable in which best mode is returned
//*
//* @param[out] pu4_sadmin
//* Pointer to the variable in which minimum sad is returned
54//*
55//* @param[in] u4_valid_intra_modes
56//* Says what all modes are valid
57//*
58//*
59//* @return none
60//*
61//******************************************************************************
62//*/
63//
64//void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
65// UWORD8 *pu1_ngbr_pels_i16,
66// UWORD8 *pu1_dst,
67// UWORD32 src_strd,
68// UWORD32 dst_strd,
69// WORD32 u4_n_avblty,
70// UWORD32 *u4_intra_mode,
71// WORD32 *pu4_sadmin,
72// UWORD32 u4_valid_intra_modes)
73//
.text
.p2align 2
.include "ih264_neon_macros.s"

.global ih264e_evaluate_intra_chroma_modes_av8

//------------------------------------------------------------------------------
// ih264e_evaluate_intra_chroma_modes_av8
//
// Computes the SAD of an 8-row x 16-byte source block against the DC,
// HORZ and VERT intra chroma predictions, stores the smallest SAD and the
// winning mode, and writes the winning prediction to the destination.
//
// Target : AArch64 (GNU as syntax), AAPCS64
//
// In:
//   x0         = pu1_src
//   x1         = pu1_ngbr_pels_i16 (16 bytes of left neighbours at offset 0,
//                16 bytes of top neighbours at offset 18)
//   x2         = pu1_dst
//   w3         = src_strd
//   w4         = dst_strd
//   w5         = u4_n_avblty (bit 0 and bit 2 are examined; value 1 selects
//                the left-only path, value 4 the top-only path)
//   x6         = u4_intra_mode (pointer, written: 0 = DC, 1 = HORZ, 2 = VERT)
//   x7         = pu4_sadmin    (pointer, written: minimum SAD)
//   [stack]    = u4_valid_intra_modes (bit 0 = DC, bit 1 = HORZ, bit 2 = VERT)
//
// Register roles inside the function:
//   x14, x15, x17 = saved copies of x6, x7, x4
//   w16           = u4_valid_intra_modes
//   x8, x9, x10   = VERT, HORZ, DC SAD
//   v28/v29       = DC prediction for rows 0-3 (low / high 8 bytes)
//   v30/v31       = DC prediction for rows 4-7 (low / high 8 bytes)
//
// x18 is deliberately left untouched: AAPCS64 reserves it as the platform
// register.
//------------------------------------------------------------------------------
ih264e_evaluate_intra_chroma_modes_av8:

    push_v_regs
    sxtw    x3, w3                          // strides are used as 64-bit post-increments
    sxtw    x4, w4
    stp     x19, x20, [sp, #-16]!

    // Ninth argument lives on the caller's stack. The offset skips the 16
    // bytes pushed for x19/x20 plus the area pushed by push_v_regs
    // (presumably 64 bytes - macro is defined in ih264_neon_macros.s).
    ldr     w16, [sp, #80]                  // w16 = u4_valid_intra_modes
    mov     x17, x4                         // keep dst_strd
    mov     x14, x6                         // keep u4_intra_mode pointer (x6 is reused as scratch)
    mov     x15, x7                         // keep pu4_sadmin pointer (w7 is reused as scratch)

    // Dispatch on the availability bits selected by mask 5 (bits 0 and 2).
    mov     w19, #5
    ands    w6, w5, w19
    beq     none_available
    cmp     w6, #1
    beq     left_only_available
    cmp     w6, #4
    beq     top_only_available

    //--------------------------------------------------------------------------
    // DC values when both neighbour sets are usable.
    // Each 16-bit lane is handled as one unit (presumably an interleaved
    // Cb/Cr byte pair): after widening bytes to halfwords, ADDP on .4s lanes
    // adds both halves of every lane in one go.
    //--------------------------------------------------------------------------
all_available:
    ld1     {v0.8b, v1.8b}, [x1]            // left neighbours
    add     x6, x1, #18
    ld1     {v2.8b, v3.8b}, [x6]            // top neighbours
    uxtl    v0.8h, v0.8b
    uxtl    v1.8h, v1.8b
    addp    v0.4s, v0.4s, v0.4s
    addp    v1.4s, v1.4s, v1.4s
    addp    v0.4s, v0.4s, v0.4s             // v0.s[0] = sum of left bytes 0-7 (per pair)
    addp    v1.4s, v1.4s, v1.4s             // v1.s[0] = sum of left bytes 8-15
    uxtl    v2.8h, v2.8b
    uxtl    v3.8h, v3.8b
    addp    v2.4s, v2.4s, v2.4s
    addp    v3.4s, v3.4s, v3.4s
    addp    v2.4s, v2.4s, v2.4s             // v2.s[0] = sum of top bytes 0-7
    addp    v3.4s, v3.4s, v3.4s             // v3.s[0] = sum of top bytes 8-15
    rshrn   v5.8b, v0.8h, #2                // rounded average of 4 samples
    dup     v21.8h, v5.h[0]
    rshrn   v6.8b, v3.8h, #2
    dup     v20.8h, v6.h[0]
    add     v1.8h, v1.8h, v2.8h
    rshrn   v1.8b, v1.8h, #3                // rounded average of 8 samples
    dup     v23.8h, v1.h[0]
    mov     v20.d[0], v23.d[0]
    add     v0.8h, v0.8h, v3.8h
    rshrn   v0.8b, v0.8h, #3
    dup     v23.8h, v0.h[0]
    mov     v31.d[0], v23.d[0]              // rows 4-7, high 8 bytes
    mov     v28.d[0], v20.d[0]              // rows 0-3, low 8 bytes
    mov     v29.d[0], v20.d[1]              // rows 0-3, high 8 bytes
    mov     v30.d[0], v21.d[0]              // rows 4-7, low 8 bytes
    b       sad_comp

    //--------------------------------------------------------------------------
    // DC values from the left neighbours only.
    //--------------------------------------------------------------------------
left_only_available:
    ld1     {v0.8b, v1.8b}, [x1]
    uxtl    v0.8h, v0.8b
    uxtl    v1.8h, v1.8b
    addp    v0.4s, v0.4s, v0.4s
    addp    v1.4s, v1.4s, v1.4s
    addp    v0.4s, v0.4s, v0.4s
    addp    v1.4s, v1.4s, v1.4s
    rshrn   v0.8b, v0.8h, #2
    rshrn   v1.8b, v1.8h, #2

    dup     v28.8h, v1.h[0]                 // rows 0-3 use left bytes 8-15
    dup     v29.8h, v1.h[0]
    dup     v30.8h, v0.h[0]                 // rows 4-7 use left bytes 0-7
    dup     v31.8h, v0.h[0]
    b       sad_comp

    //--------------------------------------------------------------------------
    // DC values from the top neighbours only.
    //--------------------------------------------------------------------------
top_only_available:
    add     x6, x1, #18
    ld1     {v0.8b, v1.8b}, [x6]
    uxtl    v0.8h, v0.8b
    uxtl    v1.8h, v1.8b
    addp    v0.4s, v0.4s, v0.4s
    addp    v1.4s, v1.4s, v1.4s
    addp    v0.4s, v0.4s, v0.4s
    addp    v1.4s, v1.4s, v1.4s
    rshrn   v0.8b, v0.8h, #2
    rshrn   v1.8b, v1.8h, #2
    dup     v28.8h, v0.h[0]                 // low 8 bytes use top bytes 0-7
    dup     v30.8h, v1.h[0]
    mov     v29.d[0], v30.d[1]              // high 8 bytes use top bytes 8-15
    mov     v30.d[0], v28.d[0]
    mov     v31.d[0], v30.d[1]
    b       sad_comp

    //--------------------------------------------------------------------------
    // No neighbours: every DC sample is 128.
    //--------------------------------------------------------------------------
none_available:
    mov     w20, #128
    dup     v28.16b, w20
    dup     v29.16b, w20
    dup     v30.16b, w20
    dup     v31.16b, w20

    //--------------------------------------------------------------------------
    // SAD accumulation over 8 source rows.
    //   v16/v18 : VERT SAD (low / high 8 bytes)
    //   v26/v14 : HORZ SAD
    //   v22/v24 : DC SAD
    // The HORZ value for row r is halfword lane (7 - r) of the left
    // neighbour vector v27.
    //--------------------------------------------------------------------------
sad_comp:
    add     x6, x1, #18
    ld1     {v10.8b, v11.8b}, [x6]          // VERT prediction row (top neighbours)

    ld1     {v27.8h}, [x1]                  // left neighbours, one halfword per row

    dup     v20.8h, v27.h[7]                // HORZ value, row 0
    dup     v21.8h, v27.h[7]

    ld1     {v0.8b, v1.8b}, [x0], x3        // source row 0

    // VERT row 0
    uabdl   v16.8h, v0.8b, v10.8b
    uabdl   v18.8h, v1.8b, v11.8b

    // HORZ row 0
    uabdl   v26.8h, v0.8b, v20.8b
    uabdl   v14.8h, v1.8b, v21.8b

    ld1     {v2.8b, v3.8b}, [x0], x3        // source row 1

    // DC row 0
    uabdl   v22.8h, v0.8b, v28.8b
    uabdl   v24.8h, v1.8b, v29.8b

    dup     v20.8h, v27.h[6]                // HORZ value, row 1
    dup     v21.8h, v27.h[6]

    // VERT row 1
    uabal   v16.8h, v2.8b, v10.8b
    uabal   v18.8h, v3.8b, v11.8b

    ld1     {v4.8b, v5.8b}, [x0], x3        // source row 2

    // HORZ row 1
    uabal   v26.8h, v2.8b, v20.8b
    uabal   v14.8h, v3.8b, v21.8b

    // DC row 1
    uabal   v22.8h, v2.8b, v28.8b
    uabal   v24.8h, v3.8b, v29.8b

    dup     v20.8h, v27.h[5]                // HORZ value, row 2
    dup     v21.8h, v27.h[5]

    // VERT row 2
    uabal   v16.8h, v4.8b, v10.8b
    uabal   v18.8h, v5.8b, v11.8b

    ld1     {v6.8b, v7.8b}, [x0], x3        // source row 3

    // HORZ row 2
    uabal   v26.8h, v4.8b, v20.8b
    uabal   v14.8h, v5.8b, v21.8b

    // DC row 2
    uabal   v22.8h, v4.8b, v28.8b
    uabal   v24.8h, v5.8b, v29.8b

    dup     v20.8h, v27.h[4]                // HORZ value, row 3
    dup     v21.8h, v27.h[4]

    // VERT row 3
    uabal   v16.8h, v6.8b, v10.8b
    uabal   v18.8h, v7.8b, v11.8b

    // HORZ row 3
    uabal   v26.8h, v6.8b, v20.8b
    uabal   v14.8h, v7.8b, v21.8b

    // DC row 3
    uabal   v22.8h, v6.8b, v28.8b
    uabal   v24.8h, v7.8b, v29.8b

    // ---- rows 4-7: DC prediction switches to v30/v31 ----
    ld1     {v0.8b, v1.8b}, [x0], x3        // source row 4

    dup     v20.8h, v27.h[3]                // HORZ value, row 4
    dup     v21.8h, v27.h[3]

    // VERT row 4
    uabal   v16.8h, v0.8b, v10.8b
    uabal   v18.8h, v1.8b, v11.8b

    // HORZ row 4
    uabal   v26.8h, v0.8b, v20.8b
    uabal   v14.8h, v1.8b, v21.8b

    ld1     {v2.8b, v3.8b}, [x0], x3        // source row 5

    // DC row 4
    uabal   v22.8h, v0.8b, v30.8b
    uabal   v24.8h, v1.8b, v31.8b

    dup     v20.8h, v27.h[2]                // HORZ value, row 5
    dup     v21.8h, v27.h[2]

    // VERT row 5
    uabal   v16.8h, v2.8b, v10.8b
    uabal   v18.8h, v3.8b, v11.8b

    // HORZ row 5
    uabal   v26.8h, v2.8b, v20.8b
    uabal   v14.8h, v3.8b, v21.8b

    ld1     {v4.8b, v5.8b}, [x0], x3        // source row 6

    // DC row 5
    uabal   v22.8h, v2.8b, v30.8b
    uabal   v24.8h, v3.8b, v31.8b

    dup     v20.8h, v27.h[1]                // HORZ value, row 6
    dup     v21.8h, v27.h[1]

    // VERT row 6
    uabal   v16.8h, v4.8b, v10.8b
    uabal   v18.8h, v5.8b, v11.8b

    // HORZ row 6
    uabal   v26.8h, v4.8b, v20.8b
    uabal   v14.8h, v5.8b, v21.8b

    ld1     {v6.8b, v7.8b}, [x0], x3        // source row 7

    // DC row 6
    uabal   v22.8h, v4.8b, v30.8b
    uabal   v24.8h, v5.8b, v31.8b

    dup     v20.8h, v27.h[0]                // HORZ value, row 7
    dup     v21.8h, v27.h[0]

    // VERT row 7
    uabal   v16.8h, v6.8b, v10.8b
    uabal   v18.8h, v7.8b, v11.8b

    // HORZ row 7
    uabal   v26.8h, v6.8b, v20.8b
    uabal   v14.8h, v7.8b, v21.8b

    // DC row 7
    uabal   v22.8h, v6.8b, v30.8b
    uabal   v24.8h, v7.8b, v31.8b

    //--------------------------------------------------------------------------
    // Horizontal reduction of the three accumulators to scalars.
    //--------------------------------------------------------------------------

    // VERT SAD -> x8
    add     v16.8h, v16.8h, v18.8h
    mov     v18.d[0], v16.d[1]
    add     v16.4h, v16.4h, v18.4h
    uaddlp  v16.2s, v16.4h
    addp    v16.2s, v16.2s, v16.2s
    smov    x8, v16.s[0]

    // HORZ SAD -> x9
    add     v26.8h, v26.8h, v14.8h
    mov     v14.d[0], v26.d[1]
    add     v26.4h, v26.4h, v14.4h
    uaddlp  v26.2s, v26.4h
    addp    v26.2s, v26.2s, v26.2s
    smov    x9, v26.s[0]

    // DC SAD -> x10
    add     v24.8h, v22.8h, v24.8h
    mov     v25.d[0], v24.d[1]
    add     v24.4h, v24.4h, v25.4h
    uaddlp  v24.2s, v24.4h
    addp    v24.2s, v24.2s, v24.2s
    smov    x10, v24.s[0]

    //--------------------------------------------------------------------------
    // Modes that are not enabled in u4_valid_intra_modes get a SAD of 1 << 30
    // so they lose every comparison against an enabled mode.
    //--------------------------------------------------------------------------
    mov     x11, #1
    mov     w0, w16                         // w0 = u4_valid_intra_modes (src pointer no longer needed)
    lsl     x11, x11, #30

    ands    w7, w0, #4                      // VERT enabled?
    csel    x8, x11, x8, eq

    ands    w6, w0, #2                      // HORZ enabled?
    csel    x9, x11, x9, eq

    ands    w6, w0, #1                      // DC enabled?
    csel    x10, x11, x10, eq

    // Restore the argument registers that were used as scratch.
    mov     x4, x17
    mov     x6, x14
    mov     x7, x15

    // Pick the minimum; ties resolve DC first, then HORZ, then VERT.
    cmp     x10, x9
    bgt     not_dc
    cmp     x10, x8
    bgt     do_vert

    // DC wins.
    str     w10, [x7]                       // *pu4_sadmin = DC SAD

    mov     w10, #0
    str     w10, [x6]                       // *u4_intra_mode = 0 (DC)

    b       do_dc_vert

not_dc:
    cmp     x9, x8
    bgt     do_vert

    // HORZ wins: each destination row is its left-neighbour halfword
    // replicated across 16 bytes.
    str     w9, [x7]                        // *pu4_sadmin = HORZ SAD

    mov     w10, #1
    str     w10, [x6]                       // *u4_intra_mode = 1 (HORZ)
    ld1     {v0.8h}, [x1]

    dup     v10.8h, v0.h[7]
    dup     v11.8h, v0.h[6]
    dup     v12.8h, v0.h[5]
    dup     v13.8h, v0.h[4]
    st1     {v10.8h}, [x2], x4              // row 0
    dup     v14.8h, v0.h[3]
    st1     {v11.8h}, [x2], x4              // row 1
    dup     v15.8h, v0.h[2]
    st1     {v12.8h}, [x2], x4              // row 2
    dup     v16.8h, v0.h[1]
    st1     {v13.8h}, [x2], x4              // row 3
    dup     v17.8h, v0.h[0]
    st1     {v14.8h}, [x2], x4              // row 4
    st1     {v15.8h}, [x2], x4              // row 5
    st1     {v16.8h}, [x2], x4              // row 6
    st1     {v17.8h}, [x2], x4              // row 7

    b       end_func

do_vert:
    // VERT wins: load the top row into both register pairs so the shared
    // store sequence below writes it to all 8 rows.
    str     w8, [x7]                        // *pu4_sadmin = VERT SAD
    mov     w8, #2
    str     w8, [x6]                        // *u4_intra_mode = 2 (VERT)
    add     x6, x1, #18
    ld1     {v28.8b, v29.8b}, [x6]
    ld1     {v30.8b, v31.8b}, [x6]

do_dc_vert:
    st1     {v28.2s, v29.2s}, [x2], x4      // row 0
    st1     {v28.2s, v29.2s}, [x2], x4      // row 1
    st1     {v28.2s, v29.2s}, [x2], x4      // row 2
    st1     {v28.2s, v29.2s}, [x2], x4      // row 3
    st1     {v30.2s, v31.2s}, [x2], x4      // row 4
    st1     {v30.2s, v31.2s}, [x2], x4      // row 5
    st1     {v30.2s, v31.2s}, [x2], x4      // row 6
    st1     {v30.2s, v31.2s}, [x2], x4      // row 7

end_func:
    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret
467
468