blob: 014facaf2619610869675b9069fad1bd78f5a0cf [file] [log] [blame]
Hamsalekha S8d3d3032015-03-13 21:24:58 +05301//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//* ih264_inter_pred_luma_vert_qpel_av8.s
24//*
25//* @brief
26//* Contains function definitions for inter prediction vertical quarter pel interpolation.
27//*
28//* @author
29//* Mohit
30//*
31//* @par List of Functions:
32//*
33//* - ih264_inter_pred_luma_vert_qpel_av8()
34//*
35//* @remarks
36//* None
37//*
38//*******************************************************************************
39//*/
40
41///* All the functions here are replicated from ih264_inter_pred_filters.c
42//
43
44///**
45///**
46//*******************************************************************************
47//*
48//* @brief
49//* Quarter pel interprediction luma filter for vertical input
50//*
51//* @par Description:
52//* Applies a 6 tap vertical filter. The output is clipped to 8 bits
53//* sec 8.4.2.2.1 titled "Luma sample interpolation process"
54//*
55//* @param[in] pu1_src
56//* UWORD8 pointer to the source
57//*
58//* @param[out] pu1_dst
59//* UWORD8 pointer to the destination
60//*
61//* @param[in] src_strd
62//* integer source stride
63//*
64//* @param[in] dst_strd
65//* integer destination stride
66//*
67//* @param[in] ht
68//* integer height of the array
69//*
70//* @param[in] wd
71//* integer width of the array
72//*
73//* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
74//*
75//* @param[in] dydx: x and y reference offset for qpel calculations.
76//* @returns
77//*
78// @remarks
79//* None
80//*
81//*******************************************************************************
82//*/
83
84//void ih264_inter_pred_luma_vert_qpel_av8 (
85// UWORD8 *pu1_src,
86// UWORD8 *pu1_dst,
87// WORD32 src_strd,
88// WORD32 dst_strd,
89// WORD32 ht,
90// WORD32 wd,
91// UWORD8* pu1_tmp,
92// UWORD32 dydx)
93
94//**************Variables Vs Registers*****************************************
95// x0 => *pu1_src
96// x1 => *pu1_dst
Martin Storsjod91f49a2016-09-05 16:15:02 +030097// w2 => src_strd
98// w3 => dst_strd
99// w4 => ht
100// w5 => wd
101// w7 => dydx
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530102
103.text
104.p2align 2
105.include "ih264_neon_macros.s"
106
107
108
109 .global ih264_inter_pred_luma_vert_qpel_av8
110
111ih264_inter_pred_luma_vert_qpel_av8:
// Entry point. Saves registers, widens the int arguments, derives the two
// source pointers and the loop-control value, loads the filter constants and
// then dispatches on wd (8 -> loop_8_start, 4 -> loop_4_start, otherwise the
// 16-wide path that starts right below).
// On leaving this block:
//   x0  = pu1_src - 2*src_strd, advanced by one row per source load
//   x7  = pu1_src + ((dydx & 12) >> 3) * src_strd, the row averaged with the
//         filtered output
//   x14 = ht - 16 (plus 1 on the 16-wide path), used only by the loop-exit tests
//   v22 = 20 and v24 = 5 in every 16-bit lane (filter multipliers)
112
113 push_v_regs // NOTE(review): macro from ih264_neon_macros.s, presumably saves the callee-saved NEON registers (v8-v15 are written below) - confirm
114 stp x19, x20, [sp, #-16]! // push x19/x20, popped again at end_func
Martin Storsjod91f49a2016-09-05 16:15:02 +0300115 sxtw x2, w2 // sign-extend src_strd to 64 bits for address arithmetic
116 sxtw x3, w3 // sign-extend dst_strd
117 sxtw x4, w4 // sign-extend ht
118 sxtw x5, w5 // sign-extend wd
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530119
120
121 and x7, x7, #12 // dydx & 12: keep bits 3:2 (y-offset)
122 lsr x7, x7, #3 // (dydx & 12) >> 3: result is 0 or 1
123 mul x7, x2, x7 // times src_strd
124 add x7, x0, x7 //pu1_src + (y_offset>>1)*src_strd
125 sub x14, x4, #16 // x14 = ht - 16, consumed by the loop-exit tests
126 movi v22.8h, #20 // Filter coeff 20 (0x14) into every halfword of v22
127 sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd
128 subs x12, x5, #8 //if wd=8 branch to loop_8 (flags are consumed by the beq two lines below)
129 movi v24.8h, #5 // Filter coeff 5 into every halfword of v24
130 beq loop_8_start
131
132 subs x12, x5, #4 //if wd=4 branch to loop_4
133 beq loop_4_start
134
135
// 16-wide path: preload six source rows, 16 bytes each. Even registers take
// bytes 0-7 of a row, odd registers bytes 8-15.
136 ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0]
137 ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0]
138 ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0]
139 ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0]
140 add x14, x14, #1 //for checking loop
141 ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0]
142 uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0]
143 ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
144
145loop_16: //when wd=16
// One pass writes 8 output rows of 16 pixels. For each row and each 8-byte
// half the code forms (tap0+tap5) + 20*(tap2+tap3) - 5*(tap1+tap4) in 16-bit
// lanes, narrows it with sqrshrun by 5 (rounding, saturating to unsigned 8
// bit), averages it with a row loaded from x7 (urhadd) and stores 16 bytes
// at x1.
// On entry v0..v11 hold six consecutive source rows (even register = bytes
// 0-7, odd register = bytes 8-15) and v12 already holds the tap2+tap3 sum of
// the low half. The arithmetic for neighbouring rows is interleaved, so the
// statement order matters.
// Only the low 8 bytes of v30 and v31 are meaningful after each urhadd; the
// st1 stores exactly those.
146
147 uaddl v14.8h, v0.8b, v10.8b // temp = src[0_0] + src[5_0]
148 uaddl v16.8h, v2.8b, v8.8b // temp2 = src[1_0] + src[4_0]
149 mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20
150 uaddl v20.8h, v1.8b, v11.8b // temp4 = src[0_8] + src[5_8]
151 uaddl v18.8h, v5.8b, v7.8b // temp3 = src[2_8] + src[3_8]
152 mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20
153 ld1 {v0.2s, v1.2s}, [x0], x2 // next source row replaces the oldest one
154 uaddl v26.8h, v3.8b, v9.8b // temp5 = src[1_8] + src[4_8]
155 uaddl v12.8h, v6.8b, v8.8b
156 mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5
157 uaddl v16.8h, v2.8b, v0.8b
158 uaddl v18.8h, v4.8b, v10.8b
159 mla v16.8h, v12.8h , v22.8h
160 mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
161 uaddl v26.8h, v5.8b, v11.8b
162 uaddl v12.8h, v7.8b, v9.8b
163 sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
164 uaddl v14.8h, v3.8b, v1.8b
165 ld1 {v2.2s, v3.2s}, [x0], x2 // next source row
166 mla v14.8h, v12.8h , v22.8h
167 mls v16.8h, v18.8h , v24.8h
168 sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
169 ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 0
170 urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value
171 urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value
172 uaddl v18.8h, v4.8b, v2.8b
173 uaddl v12.8h, v8.8b, v10.8b
174 st1 {v30.2s, v31.2s}, [x1], x3 // Vector store to dst[0_0]
175 mla v18.8h, v12.8h , v22.8h
176 uaddl v20.8h, v6.8b, v0.8b
177 mls v14.8h, v26.8h , v24.8h
178 sqrshrun v30.8b, v16.8h, #5
179 uaddl v12.8h, v9.8b, v11.8b
180 uaddl v16.8h, v5.8b, v3.8b
181 uaddl v26.8h, v7.8b, v1.8b
182 mla v16.8h, v12.8h , v22.8h
183 mls v18.8h, v20.8h , v24.8h
184 ld1 {v4.2s, v5.2s}, [x0], x2 // next source row
185 sqrshrun v31.8b, v14.8h, #5
186 ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 1
187 uaddl v12.8h, v10.8b, v0.8b
188 urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value
189 urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value
190 uaddl v14.8h, v6.8b, v4.8b
191 uaddl v20.8h, v8.8b, v2.8b
192 mla v14.8h, v12.8h , v22.8h
193 mls v16.8h, v26.8h , v24.8h
194 st1 {v30.2s, v31.2s}, [x1], x3 //store row 1
195 sqrshrun v30.8b, v18.8h, #5
196 uaddl v18.8h, v7.8b, v5.8b
197 uaddl v12.8h, v11.8b, v1.8b
198 mla v18.8h, v12.8h , v22.8h
199 uaddl v26.8h, v9.8b, v3.8b
200 mls v14.8h, v20.8h , v24.8h
201 ld1 {v6.2s, v7.2s}, [x0], x2 // next source row
202 sqrshrun v31.8b, v16.8h, #5
203 ld1 {v16.2s, v17.2s}, [x7], x2 // Load for interpolation row 2
204 mls v18.8h, v26.8h , v24.8h
205 urhadd v30.16b, v16.16b , v30.16b // Interpolation to obtain qpel value
206 urhadd v31.16b, v17.16b , v31.16b // Interpolation to obtain qpel value
207 uaddl v12.8h, v0.8b, v2.8b // temp1 = src[2_0] + src[3_0]
208 st1 {v30.2s, v31.2s}, [x1], x3 //store row 2
209 uaddl v16.8h, v10.8b, v4.8b // temp2 = src[1_0] + src[4_0]
210 uaddl v20.8h, v9.8b, v7.8b // temp4 = src[0_8] + src[5_8]
211 sqrshrun v30.8b, v14.8h, #5
212 uaddl v26.8h, v5.8b, v11.8b // temp5 = src[1_8] + src[4_8]
213 uaddl v14.8h, v8.8b, v6.8b // temp = src[0_0] + src[5_0]
214 sqrshrun v31.8b, v18.8h, #5
215 ld1 {v18.2s, v19.2s}, [x7], x2 // Load for interpolation row 3
216 mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20
217 urhadd v30.16b, v18.16b , v30.16b // Interpolation to obtain qpel value
218 urhadd v31.16b, v19.16b , v31.16b // Interpolation to obtain qpel value
219 uaddl v18.8h, v1.8b, v3.8b // temp3 = src[2_8] + src[3_8]
220 st1 {v30.2s, v31.2s}, [x1], x3 //store row 3
221 // 4 rows processed
222 mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20
223 ld1 {v8.2s, v9.2s}, [x0], x2 // next source row
224 uaddl v12.8h, v2.8b, v4.8b
225 uaddl v18.8h, v3.8b, v5.8b
226 mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5
227 uaddl v28.8h, v9.8b, v11.8b
228 uaddl v16.8h, v6.8b, v0.8b
229 mla v28.8h, v18.8h , v22.8h // temp4 += temp3 * 20
230 mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
231 uaddl v26.8h, v1.8b, v7.8b
232 uaddl v18.8h, v5.8b, v7.8b
233 sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
234 uaddl v14.8h, v8.8b, v10.8b
235 sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
236 ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 4
237 ld1 {v10.2s, v11.2s}, [x0], x2 // next source row
238 urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value
239 urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value
240 mls v28.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
241 st1 {v30.2s, v31.2s}, [x1], x3 // store row 4
242 mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20
243 uaddl v20.8h, v11.8b, v1.8b
244 uaddl v26.8h, v3.8b, v9.8b
245 mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20
246 uaddl v12.8h, v6.8b, v4.8b
247 uaddl v18.8h, v7.8b, v9.8b
248 sqrshrun v31.8b, v28.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
249 mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5
250 uaddl v16.8h, v8.8b, v2.8b
251 sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
252 ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 5
253 mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
254 urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value
255 urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value
256 uaddl v14.8h, v10.8b, v0.8b
257 st1 {v30.2s, v31.2s}, [x1], x3 // store row 5
258 mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20
259 ld1 {v0.2s, v1.2s}, [x0], x2 // next source row
260 uaddl v26.8h, v5.8b, v11.8b
261 uaddl v12.8h, v8.8b, v6.8b
262 uaddl v28.8h, v0.8b, v2.8b
263 sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
264 mla v28.8h, v12.8h , v22.8h // temp += temp1 * 20
265 uaddl v20.8h, v1.8b, v3.8b
266 mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5
267 mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20
268 uaddl v16.8h, v10.8b, v4.8b
269 sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
270 ld1 {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 6
// From here on the mov/swp instructions rotate the retained source rows back
// into ascending order in v0..v9, ready for a possible second pass.
// NOTE(review): swp on vector registers is presumably a macro from
// ih264_neon_macros.s that exchanges its two operands - confirm.
271 mov v2.8b, v6.8b
272 mov v3.8b, v7.8b
273 urhadd v30.16b, v14.16b , v30.16b // Interpolation to obtain qpel value
274 urhadd v31.16b, v15.16b , v31.16b // Interpolation to obtain qpel value
275
276 mls v28.8h, v16.8h , v24.8h // temp -= temp2 * 5
277 st1 {v30.2s, v31.2s}, [x1], x3 // store row 6
278 sqrshrun v30.8b, v28.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
279 swp v0.8b, v4.8b // swapping registers to put it in order
280 swp v1.8b, v5.8b // swapping registers to put it in order
281
282 mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5
283 mov v6.8b, v10.8b
284 mov v7.8b, v11.8b
285 subs x12, x14, #1 // if height==16 - looping (sets the flags tested by the bne below; nothing in between alters them)
286 swp v4.8b, v8.8b
287 swp v5.8b, v9.8b
288 sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
289 ld1 {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 7
290 urhadd v30.16b, v20.16b , v30.16b // Interpolation to obtain qpel value
291 urhadd v31.16b, v21.16b , v31.16b // Interpolation to obtain qpel value
292 st1 {v30.2s, v31.2s}, [x1], x3 // store row 7
293 bne end_func //if height =8 end function
// Second pass (ht = 16): bump x14 so the next exit test fails, then redo the
// preload tail that the code before loop_16 performed.
294 add x14, x14, #1 //for checking loop
295 ld1 {v10.2s, v11.2s}, [x0], x2 // sixth source row for the next pass
296 uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0]
297
298 b loop_16 // looping if height =16
299
300loop_8_start:
// 8-pixel-wide path. Preloads six source rows (8 bytes each) into v0..v5 and
// then falls into loop_8, which writes four rows, exits if ht is 4, and
// otherwise writes four more. Each output row is
//   (tap0+tap5) + 20*(tap2+tap3) - 5*(tap1+tap4)
// narrowed with sqrshrun by 5 and averaged (urhadd) with a row read from x7.
// Only the low 8 bytes of each urhadd result are stored.
301//// Processing row0 and row1
302
303 ld1 {v0.2s}, [x0], x2 // Vector load from src[0_0]
304 ld1 {v1.2s}, [x0], x2 // Vector load from src[1_0]
305 ld1 {v2.2s}, [x0], x2 // Vector load from src[2_0]
306 ld1 {v3.2s}, [x0], x2 // Vector load from src[3_0]
307 add x14, x14, #1 //for checking loop
308 ld1 {v4.2s}, [x0], x2 // Vector load from src[4_0]
309 ld1 {v5.2s}, [x0], x2 // Vector load from src[5_0]
310
311loop_8:
312 //for checking loop
313 uaddl v6.8h, v2.8b, v3.8b // temp1 = src[2_0] + src[3_0]
314 uaddl v8.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0]
315 uaddl v10.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0]
316 mla v8.8h, v6.8h , v22.8h // temp += temp1 * 20
317 ld1 {v6.2s}, [x0], x2 // next source row (v6 is free once the mla above has used it)
318 uaddl v14.8h, v3.8b, v4.8b
319 uaddl v16.8h, v1.8b, v6.8b
320 uaddl v18.8h, v2.8b, v5.8b
321 mls v8.8h, v10.8h , v24.8h // temp -= temp2 * 5
322 mla v16.8h, v14.8h , v22.8h
323 ld1 {v7.2s}, [x0], x2 // next source row
324 uaddl v20.8h, v4.8b, v5.8b
325 uaddl v12.8h, v2.8b, v7.8b
326 uaddl v10.8h, v3.8b, v6.8b
327 mls v16.8h, v18.8h , v24.8h
328 sqrshrun v26.8b, v8.8h, #5 // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
329 mla v12.8h, v20.8h , v22.8h
330 ld1 {v8.2s}, [x7], x2 //Load value for interpolation (row0)
331 ld1 {v9.2s}, [x7], x2 //Load value for interpolation (row1)
332 ld1 {v0.2s}, [x0], x2 // next source row
333 uaddl v14.8h, v5.8b, v6.8b
334 sqrshrun v27.8b, v16.8h, #5
335 urhadd v26.16b, v8.16b , v26.16b // Interpolation step for qpel calculation
336 urhadd v27.16b, v9.16b , v27.16b // Interpolation step for qpel calculation
337
338 uaddl v20.8h, v3.8b, v0.8b
339 mls v12.8h, v10.8h , v24.8h
340 st1 {v26.2s}, [x1], x3 // Vector store to dst[0_0]
341 uaddl v18.8h, v4.8b, v7.8b
342 mla v20.8h, v14.8h , v22.8h
343 st1 {v27.2s}, [x1], x3 // Vector store to dst[1_0]
344 sqrshrun v28.8b, v12.8h, #5
345 mls v20.8h, v18.8h , v24.8h
346 ld1 {v12.2s}, [x7], x2 //Load value for interpolation (row2)
347 ld1 {v13.2s}, [x7], x2 //Load value for interpolation (row3)
348 ld1 {v1.2s}, [x0], x2 // next source row
349 sqrshrun v29.8b, v20.8h, #5
350 subs x9, x4, #4 // ht - 4, sets the flags for the beq after the stores
351 urhadd v28.16b, v12.16b , v28.16b
352 urhadd v29.16b, v13.16b , v29.16b
353 st1 {v28.2s}, [x1], x3 //store row 2
354 st1 {v29.2s}, [x1], x3 //store row 3
355 beq end_func // Branch if height==4
// Rows 4 and 5. The six taps now sit in v4, v5, v6, v7, v0, v1 (oldest first).
356 uaddl v14.8h, v6.8b, v7.8b // temp1 = src[2_0] + src[3_0]
357 uaddl v16.8h, v0.8b, v5.8b // temp2 = src[1_0] + src[4_0]
358 uaddl v18.8h, v1.8b, v4.8b // temp = src[0_0] + src[5_0]
359 mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20
360 ld1 {v2.2s}, [x0], x2 // next source row
361 mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5
362 uaddl v8.8h, v0.8b, v7.8b
363 uaddl v10.8h, v1.8b, v6.8b
364 uaddl v12.8h, v2.8b, v5.8b
365 sqrshrun v26.8b, v18.8h, #5
366 mla v12.8h, v8.8h , v22.8h
367 ld1 {v18.2s}, [x7], x2 //Load value for interpolation (row4)
368 ld1 {v19.2s}, [x7], x2 //Load value for interpolation (row5)
369 ld1 {v3.2s}, [x0], x2 // next source row
370 mls v12.8h, v10.8h , v24.8h
371 sqrshrun v27.8b, v12.8h, #5
372 urhadd v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation
373 urhadd v27.16b, v19.16b , v27.16b // Interpolation step for qpel calculation
374
375 st1 {v26.2s}, [x1], x3 // store row 4
376 st1 {v27.2s}, [x1], x3 // store row 5
// Rows 6 and 7. The six taps now sit in v6, v7, v0, v1, v2, v3 (oldest first).
377 uaddl v14.8h, v0.8b, v1.8b // temp1 = src[2_0] + src[3_0]
378 uaddl v16.8h, v2.8b, v7.8b // temp2 = src[1_0] + src[4_0]
379 uaddl v18.8h, v3.8b, v6.8b // temp = src[0_0] + src[5_0]
380 mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20
381 ld1 {v4.2s}, [x0], x2 // next source row
382 mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5
383 uaddl v8.8h, v2.8b, v1.8b
384 uaddl v10.8h, v3.8b, v0.8b
385 uaddl v12.8h, v4.8b, v7.8b
386 sqrshrun v26.8b, v18.8h, #5
387 mla v12.8h, v8.8h , v22.8h
388 ld1 {v18.2s}, [x7], x2 //Load value for interpolation (row6)
389 ld1 {v19.2s}, [x7], x2 //Load value for interpolation (row7)
390 ld1 {v5.2s}, [x0], x2 // next source row, v0..v5 are again six consecutive rows in order
391 mls v12.8h, v10.8h , v24.8h
392 sqrshrun v27.8b, v12.8h, #5
393 urhadd v26.16b, v18.16b , v26.16b // Interpolation step for qpel calculation
394 urhadd v27.16b, v19.16b , v27.16b // Interpolation step for qpel calculation
395
396 subs x12, x14, #1 // zero only on the first pass when ht is 16
397 st1 {v26.2s}, [x1], x3 // store row 6
398 st1 {v27.2s}, [x1], x3 // store row 7
399 add x14, x14, #1 // add leaves the flags alone, so the beq below still sees the subs result
400 beq loop_8 //looping if height ==16
401
402 b end_func
403
404
405loop_4_start:
// 4-pixel-wide path, straight-line code for ht = 4 or ht = 8.
// Every source row and every interpolation row is read with a 4-byte lane-0
// load, and every store writes lane 0 only. Lanes above lane 0 therefore
// carry stale data throughout, and that data never reaches memory.
// Each output row is
//   (tap0+tap5) + 20*(tap2+tap3) - 5*(tap1+tap4)
// narrowed with sqrshrun by 5 (rounding, saturating to unsigned 8 bit) and
// then averaged (urhadd) with the matching row read from x7.
// Uses x0 (source rows), x7 (interpolation rows), x1 (destination),
// x2/x3 (strides), x4 (ht), v22 = 20 and v24 = 5.
406//// Processing row0 and row1
407
408
409 ld1 {v0.s}[0], [x0], x2 // Vector load from src[0_0]
410 ld1 {v1.s}[0], [x0], x2 // Vector load from src[1_0]
411 ld1 {v2.s}[0], [x0], x2 // Vector load from src[2_0]
412 ld1 {v3.s}[0], [x0], x2 // Vector load from src[3_0]
413 ld1 {v4.s}[0], [x0], x2 // Vector load from src[4_0]
414 ld1 {v5.s}[0], [x0], x2 // Vector load from src[5_0]
415
416 uaddl v6.8h, v2.8b, v3.8b // temp1 = src[2_0] + src[3_0]
417 uaddl v8.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0]
418 uaddl v10.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0]
419 mla v8.8h, v6.8h , v22.8h // temp += temp1 * 20
// Lane-0 load like every other row in this path, so that no more than the
// 4 bytes belonging to the block are read from the source row.
420 ld1 {v6.s}[0], [x0], x2 // next source row
421 uaddl v14.8h, v3.8b, v4.8b
422 uaddl v16.8h, v1.8b, v6.8b
423 uaddl v18.8h, v2.8b, v5.8b
424 mls v8.8h, v10.8h , v24.8h // temp -= temp2 * 5
425 ld1 {v7.s}[0], [x0], x2 // next source row
426 mla v16.8h, v14.8h , v22.8h
427 uaddl v20.8h, v4.8b, v5.8b
428 uaddl v12.8h, v2.8b, v7.8b
429 uaddl v10.8h, v3.8b, v6.8b
430 mls v16.8h, v18.8h , v24.8h
431 sqrshrun v26.8b, v8.8h, #5 // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
432 ld1 {v8.s}[0], [x7], x2 //Load value for interpolation - row 0
433 ld1 {v9.s}[0], [x7], x2 //Load value for interpolation - row 1
434 mla v12.8h, v20.8h , v22.8h
435 ld1 {v0.s}[0], [x0], x2 // next source row
436 uaddl v14.8h, v5.8b, v6.8b
437 sqrshrun v27.8b, v16.8h, #5
438 uaddl v20.8h, v3.8b, v0.8b
439 urhadd v26.16b, v26.16b , v8.16b //Interpolation step for qpel calculation
440 urhadd v27.16b, v27.16b , v9.16b //Interpolation step for qpel calculation
441
442 mls v12.8h, v10.8h , v24.8h
443 st1 {v26.s}[0], [x1], x3 // Vector store to dst[0_0]
444 uaddl v18.8h, v4.8b, v7.8b
445 mla v20.8h, v14.8h , v22.8h
446 st1 {v27.s}[0], [x1], x3 // store row 1
447 sqrshrun v28.8b, v12.8h, #5
448 ld1 {v12.s}[0], [x7], x2 //Load value for interpolation - row 2
449 ld1 {v13.s}[0], [x7], x2 //Load value for interpolation - row 3
450
451 mls v20.8h, v18.8h , v24.8h
452 ld1 {v1.s}[0], [x0], x2 // next source row
453 sqrshrun v29.8b, v20.8h, #5
454 urhadd v28.16b, v12.16b , v28.16b //Interpolation step for qpel calculation
455 urhadd v29.16b, v13.16b , v29.16b //Interpolation step for qpel calculation
456
457 st1 {v28.s}[0], [x1], x3 //store row 2
458 st1 {v29.s}[0], [x1], x3 //store row 3
459
460 subs x9, x4, #4 // ht - 4
461 beq end_func // Branch if height==4
462
463
// Rows 4 and 5. The six taps now sit in v4, v5, v6, v7, v0, v1 (oldest first).
464 uaddl v14.8h, v6.8b, v7.8b // temp1 = src[2_0] + src[3_0]
465 uaddl v16.8h, v0.8b, v5.8b // temp2 = src[1_0] + src[4_0]
466 uaddl v18.8h, v1.8b, v4.8b // temp = src[0_0] + src[5_0]
467 mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20
468 ld1 {v2.s}[0], [x0], x2 // next source row
469 mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5
470 uaddl v8.8h, v0.8b, v7.8b
471 uaddl v10.8h, v1.8b, v6.8b
472 uaddl v12.8h, v2.8b, v5.8b
473 sqrshrun v26.8b, v18.8h, #5
474 ld1 {v18.s}[0], [x7], x2 //Load value for interpolation - row 4
475 ld1 {v19.s}[0], [x7], x2 //Load value for interpolation - row 5
476 mla v12.8h, v8.8h , v22.8h
477 ld1 {v3.s}[0], [x0], x2 // next source row
478 mls v12.8h, v10.8h , v24.8h
479 sqrshrun v27.8b, v12.8h, #5
480 urhadd v26.16b, v18.16b , v26.16b //Interpolation step for qpel calculation
481 urhadd v27.16b, v27.16b , v19.16b //Interpolation step for qpel calculation
482
483 st1 {v26.s}[0], [x1], x3 //store row 4
484 st1 {v27.s}[0], [x1], x3 // store row 5
// Rows 6 and 7. The six taps now sit in v6, v7, v0, v1, v2, v3 (oldest first).
485 uaddl v14.8h, v0.8b, v1.8b // temp1 = src[2_0] + src[3_0]
486 uaddl v16.8h, v2.8b, v7.8b // temp2 = src[1_0] + src[4_0]
487 uaddl v18.8h, v3.8b, v6.8b // temp = src[0_0] + src[5_0]
488 mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20
489 ld1 {v4.s}[0], [x0], x2 // next source row
490 mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5
491 uaddl v8.8h, v2.8b, v1.8b
492 uaddl v10.8h, v3.8b, v0.8b
493 uaddl v12.8h, v4.8b, v7.8b
494 sqrshrun v26.8b, v18.8h, #5
495 ld1 {v18.s}[0], [x7], x2 //Load value for interpolation - row 6
496 ld1 {v19.s}[0], [x7], x2 //Load value for interpolation - row 7
497 mla v12.8h, v8.8h , v22.8h
498 ld1 {v5.s}[0], [x0], x2 // one more source row is read here, no later instruction uses it
499 mls v12.8h, v10.8h , v24.8h
500 sqrshrun v27.8b, v12.8h, #5
501 urhadd v26.16b, v18.16b , v26.16b //Interpolation step for qpel calculation
502 urhadd v27.16b, v19.16b , v27.16b //Interpolation step for qpel calculation
503
504 st1 {v26.s}[0], [x1], x3 // store row 6
505 st1 {v27.s}[0], [x1], x3 // store row 7
506
507
508end_func:
509 // Common exit for all three width paths: undo the prologue in reverse order and return.
510 ldp x19, x20, [sp], #16 // pop the x19/x20 pair pushed at function entry
511 pop_v_regs // NOTE(review): macro from ih264_neon_macros.s, presumably the counterpart of push_v_regs - confirm
512 ret
513
514
515