; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013 Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; Code section, shared build options, and the entry points exported for each
; optional instruction-set variant.
  AREA  |.text|, CODE, READONLY

  ; Presumably defines the OPUS_ARM_MAY_HAVE_* symbols tested below.
  GET    celt/arm/armopts.s

IF OPUS_ARM_MAY_HAVE_EDSP
  EXPORT celt_pitch_xcorr_edsp
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON
  EXPORT celt_pitch_xcorr_neon
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON

; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
  ; File-private helper with its own register interface (it is not exported and
  ; does not follow the normal procedure-call convention). It uses no stack and
  ; returns with MOV pc, lr.
  ; input:
  ;   r3     = int         len (must be > 0)
  ;   r4     = opus_val16 *x
  ;   r5     = opus_val16 *y (y[0...len+2] are read)
  ;   q0     = opus_val32  sum[4]
  ; output:
  ;   q0     = opus_val32  sum[4]
  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
  ; clobbered: r4, r5 (both post-incremented past the data read), r12, the
  ;            condition flags, d3, q2, q3, q8
  ; internal usage:
  ;   r12 = int j
  ;   d3  = y_3|y_2|y_1|y_0
  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  ;   q8  = scratch
  ;
  ; Load y[0...3]
  ; This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16      {d5}, [r5]!
  SUBS         r12, r3, #8
  BLE          xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
  ; - 2 cycles of ARM instructions,
  ; - 10 cycles of load/store/byte permute instructions, and
  ; - 9 cycles of data processing instructions.
  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  ; latter two categories, meaning the whole loop should run in 10 cycles per
  ; iteration, barring cache misses.
  ;
  ; Load x[0...7]
  VLD1.16      {d6, d7}, [r4]!
  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND         d3, d5, d5
  ; The flags set here are consumed by the BGT at the bottom of the loop; none
  ; of the NEON instructions in between modify them.
  SUBS         r12, r12, #8
  ; Load y[4...11]
  VLD1.16      {d4, d5}, [r5]!
  VMLAL.S16    q0, d3, d6[0]
  VEXT.16      d16, d3, d4, #1
  VMLAL.S16    q0, d4, d7[0]
  VEXT.16      d17, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d3, d4, #2
  VMLAL.S16    q0, d17, d7[1]
  VEXT.16      d17, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d3, d4, #3
  VMLAL.S16    q0, d17, d7[2]
  VEXT.16      d17, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
  VMLAL.S16    q0, d17, d7[3]
  BGT          xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
  ADDS         r12, r12, #4
  BLE          xcorr_kernel_neon_process2
  ; Load x[0...3]
  VLD1.16      d6, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #4
  ; Load y[4...7]
  VLD1.16      d5, [r5]!
  VMLAL.S16    q0, d4, d6[0]
  VEXT.16      d16, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
  ADDS         r12, r12, #2
  BLE          xcorr_kernel_neon_process1
  ; Load x[0...1], replicating x_0 across d6 and x_1 across d7.
  VLD2.16      {d6[],d7[]}, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #2
  ; Load y[4...5] (the 32-bit pair is replicated into both halves of d5)
  VLD1.32      {d5[]}, [r5]!
  VMLAL.S16    q0, d4, d6
  VEXT.16      d16, d4, d5, #1
  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
  ; instead of VEXT, since it's a data-processing instruction.
  VSRI.64      d5, d4, #32
  VMLAL.S16    q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
  ; Load next *x
  VLD1.16      {d6[]}, [r4]!
  ADDS         r12, r12, #1
  ; y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16    q0, d5, d6
  ; Return now if that was the last sample.
  MOVLE        pc, lr
; Now process 1 last sample, not reading ahead.
  ; Load last *y
  VLD1.16      {d4[]}, [r5]!
  ; d4 = y_new|y_3|y_2|y_1: shift d5 down one element and keep the new y on top.
  VSRI.64      d4, d5, #16
  ; Load last *x
  VLD1.16      {d6[]}, [r4]!
  VMLAL.S16    q0, d4, d6
  MOV          pc, lr
  ENDP

; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
;  opus_val32 *xcorr, int len, int max_pitch)
; Stores xcorr[i]=sum(_x[j]*_y[j+i],j=0...len-1) for i=0...max_pitch-1 and
; returns the largest value stored, or 1 if none of them exceeds 1.
celt_pitch_xcorr_neon PROC
  ; input:
  ;   r0   = opus_val16 *_x
  ;   r1   = opus_val16 *_y
  ;   r2   = opus_val32 *xcorr
  ;   r3   = int         len
  ;   [sp] = int         max_pitch (fifth argument, passed on the stack)
  ; output:
  ;   r0  = int         maxcorr
  ; internal usage:
  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
  ;   r6  = int         max_pitch
  ;   r12 = int         j
  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
  STMFD        sp!, {r4-r6, lr}
  ; Four registers were just pushed, so the caller's [sp] is now at [sp, #16].
  LDR          r6, [sp, #16]
  ; maxcorr[0...3] = 1
  VMOV.S32     q15, #1
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  SUBS         r6, r6, #4
  BLT          celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
  ; xcorr_kernel_neon parameters:
  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV          r4, r0
  MOV          r5, r1
  VEOR         q0, q0, q0
  ; xcorr_kernel_neon only modifies r4, r5, r12, the flags, q0, d3, q2, q3,
  ; and q8. Nothing live here is held in those, so we don't save/restore any
  ; other registers.
  BL           xcorr_kernel_neon
  SUBS         r6, r6, #4
  VST1.32      {q0}, [r2]!
  ; _y += 4
  ADD          r1, r1, #8
  VMAX.S32     q15, q15, q0
  ; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4
  BGE          celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
  ADDS         r6, r6, #4
  ; Reduce maxcorr to a single value
  VMAX.S32     d30, d30, d31
  VPMAX.S32    d30, d30, d30
  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE          celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
  MOV          r4, r0
  MOV          r5, r1
  VMOV.I32     q0, #0
  SUBS         r12, r3, #8
  BLT          celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
  ; Load x[0...7]
  VLD1.16      {q1}, [r4]!
  ; Load y[0...7]
  VLD1.16      {q2}, [r5]!
  SUBS         r12, r12, #8
  VMLAL.S16    q0, d4, d2
  VMLAL.S16    q0, d5, d3
  BGE          celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
  ADDS         r12, r12, #4
  BLT          celt_pitch_xcorr_neon_process_remaining4_done
  ; Load x[0...3]
  VLD1.16      {d2}, [r4]!
  ; Load y[0...3]
  VLD1.16      {d3}, [r5]!
  SUB          r12, r12, #4
  VMLAL.S16    q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
  ; Reduce the sum to a single value.
  ; After this only d0[0] is meaningful: it is the only lane that is stored to
  ; xcorr and the only lane of maxcorr that is returned.
  VADD.S32     d0, d0, d1
  VPADDL.S32   d0, d0
  ADDS         r12, r12, #4
  BLE          celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
  VLD1.16      {d2[]}, [r4]!
  VLD1.16      {d3[]}, [r5]!
  SUBS         r12, r12, #1
  VMLAL.S16    q0, d2, d3
  BGT          celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
  ; xcorr[i] = sum
  VST1.32      {d0[0]}, [r2]!
  ; maxcorr = max(maxcorr, sum)
  VMAX.S32     d30, d30, d0
  SUBS         r6, r6, #1
  ; _y++
  ADD          r1, r1, #2
  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT          celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
  VMOV.32      r0, d30[0]
  LDMFD        sp!, {r4-r6, pc}
  ENDP

ENDIF ; OPUS_ARM_MAY_HAVE_NEON

IF OPUS_ARM_MAY_HAVE_EDSP

; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
; Accumulates sum[k]+=sum(x[j]*y[j+k],j=0...len-1), k=0...3.
xcorr_kernel_edsp PROC
  ; File-private helper with its own register interface (it is not exported).
  ; input:
  ;   r3      = int         len
  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
  ;   r6...r9 = opus_val32  sum[4]
  ; output:
  ;   r6...r9 = opus_val32  sum[4]
  ; preserved: r0-r5 (r2, r4 and r5 are saved on entry and restored on exit)
  ; clobbered: r10, r11, r12, r14, the condition flags
  ; internal usage
  ;   r2      = int         j
  ;   r12,r14 = opus_val16  x[4]
  ;   r10,r11 = opus_val16  y[4]
  STMFD        sp!, {r2,r4,r5,lr}
  LDR          r10, [r5], #4      ; Load y[0...1]
  SUBS         r2, r3, #4         ; j = len-4
  LDR          r11, [r5], #4      ; Load y[2...3]
  BLE          xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      ; Load x[0...1]
  ; Stall
xcorr_kernel_edsp_process4
  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
  ; other. Every other instruction here dual-issues with a multiply, and is
  ; thus "free". There should be no stalls in the body of the loop.
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR          r14, [r4], #4      ; Load x[2...3]
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS         r2, r2, #4         ; j-=4
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR          r10, [r5], #4      ; Load y[4...5]
  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT        r12, [r4], #4      ; Load x[0...1]
  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR          r11, [r5], #4      ; Load y[6...7]
  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT          xcorr_kernel_edsp_process4
; Handle the last 1...4 samples one at a time. y_0...y_3 are already in
; r10 and r11; later y values are loaded as they are needed.
xcorr_kernel_edsp_process4_done
  ADDS         r2, r2, #4
  BLE          xcorr_kernel_edsp_done
  LDRH         r12, [r4], #2      ; r12 = *x++
  SUBS         r2, r2, #1         ; j--
  ; Stall
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
  LDRGTH       r14, [r4], #2      ; r14 = *x++
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
  BLE          xcorr_kernel_edsp_done
  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS         r2, r2, #1         ; j--
  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
  LDRGTH       r12, [r4], #2      ; r12 = *x++
  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
  BLE          xcorr_kernel_edsp_done
  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
  ; This is a compare only (j is not decremented): r2 is about to be reused
  ; to hold y_5, so the "one more sample left" test has to happen first.
  CMP          r2, #1             ; j > 1?
  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRGTH       r14, [r4]          ; r14 = *x
  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE          xcorr_kernel_edsp_done
  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH         r11, [r5]          ; r11 = y_6 = *y
  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
  ; Restores r2, r4 and r5 to their values on entry, and returns.
  LDMFD        sp!, {r2,r4,r5,pc}
  ENDP

; Stores xcorr[i]=sum(_x[j]*_y[j+i],j=0...len-1) for i=0...max_pitch-1 and
; returns the largest value stored, or 1 if none of them exceeds 1.
celt_pitch_xcorr_edsp PROC
  ; input:
  ;   r0   = opus_val16 *_x (must be 32-bit aligned)
  ;   r1   = opus_val16 *_y (only needs to be 16-bit aligned)
  ;   r2   = opus_val32 *xcorr
  ;   r3   = int         len
  ;   [sp] = int         max_pitch (fifth argument, passed on the stack)
  ; output:
  ;   r0  = maxcorr
  ; internal usage
  ;   r4  = opus_val16 *x
  ;   r5  = opus_val16 *y
  ;   r6  = opus_val32  sum0
  ;   r7  = opus_val32  sum1
  ;   r8  = opus_val32  sum2
  ;   r9  = opus_val32  sum3
  ;   r1  = int         max_pitch
  ;   r12 = int         j
  STMFD        sp!, {r4-r11, lr}
  MOV          r5, r1
  ; Nine registers were just pushed, so the caller's [sp] is now at [sp, #36].
  LDR          r1, [sp, #36]
  MOV          r4, r0
  TST          r5, #3
  ; maxcorr = 1
  MOV          r0, #1
  BEQ          celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  LDRH         r8, [r5], #2
  BLE          celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR          r6, [r4], #4
  ; Keep y_0 in the top half of r8, where the loop below expects it.
  MOV          r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
  LDR          r9, [r5], #4
  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  LDR          r7, [r4], #4
  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDR          r8, [r5], #4
  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  SUBS         r12, r12, #4         ; j-=4
  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGT        r6, [r4], #4
  BGT          celt_pitch_xcorr_edsp_process1u_loop4
  ; Move the next y value down to the bottom half for the 1-at-a-time loop.
  MOV          r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
  ADDS         r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
  LDRGEH       r6, [r4], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
  SUBGES       r12, r12, #1
  LDRGTH       r8, [r5], #2
  BGT          celt_pitch_xcorr_edsp_process1u_loop1
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ADD          r5, r5, #2
  MOVLT        r0, r14
  SUBS         r1, r1, #1
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  BLE          celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS         r1, r1, #4
  BLT          celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
  ; xcorr_kernel_edsp parameters:
  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV          r6, #0
  MOV          r7, #0
  MOV          r8, #0
  MOV          r9, #0
  BL           xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP          r0, r6
  ; _y+=4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
  MOVLT        r0, r7
  CMP          r0, r8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4
  BGE          celt_pitch_xcorr_edsp_process4
; Compute two sums at once if at least two remain.
celt_pitch_xcorr_edsp_process2
  ADDS         r1, r1, #2
  BLT          celt_pitch_xcorr_edsp_process1a
  SUBS         r12, r3, #4
  ; {r10, r11} = {sum0, sum1} = {0, 0}
  MOV          r10, #0
  MOV          r11, #0
  LDR          r8, [r5], #4
  BLE          celt_pitch_xcorr_edsp_process2_loop_done
  LDR          r6, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r7, [r4], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  LDR          r8, [r5], #4
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT        r6, [r4], #4
  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT        r9, [r5], #4
  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
  BGT          celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
  ADDS         r12, r12, #2
  BLE          celt_pitch_xcorr_edsp_process2_1
  LDR          r6, [r4], #4
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r9, [r5], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUB          r12, r12, #2
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  MOV          r8, r9
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
; One or two samples are left at this point.
celt_pitch_xcorr_edsp_process2_1
  LDRH         r6, [r4], #2
  ADDS         r12, r12, #1
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDRGTH       r7, [r4], #2
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  BLE          celt_pitch_xcorr_edsp_process2_done
  LDRH         r9, [r5], #2
  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_done
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  ; (the code above read len+1 values of y, so this nets out to _y+=2)
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum0)
  CMP          r0, r10
  ADD          r5, r5, #2
  MOVLT        r0, r10
  SUB          r1, r1, #2
  ; maxcorr = max(maxcorr, sum1)
  CMP          r0, r11
  ; xcorr[i] = sum
  STR          r10, [r2], #4
  MOVLT        r0, r11
  STR          r11, [r2], #4
; Compute one last sum (with y 32-bit aligned) if one remains.
celt_pitch_xcorr_edsp_process1a
  ADDS         r1, r1, #1
  BLT          celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  BLT          celt_pitch_xcorr_edsp_process1a_loop_done
  LDR          r6, [r4], #4
  LDR          r8, [r5], #4
  LDR          r7, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  LDRGE        r8, [r5], #4
  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
  BGE          celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
  ; Two more samples, if at least two are left.
  ADDS         r12, r12, #2
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBGE        r12, r12, #2
  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  ; One more sample, if one is left.
  ADDS         r12, r12, #1
  LDRGEH       r6, [r4], #2
  LDRGEH       r8, [r5], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  MOVLT        r0, r14
celt_pitch_xcorr_edsp_done
  LDMFD        sp!, {r4-r11, pc}
  ENDP

ENDIF ; OPUS_ARM_MAY_HAVE_EDSP

END