@ Tremolo library
@-----------------------------------------------------------------------
@ Copyright (C) 2002-2009, Xiph.org Foundation
@ Copyright (C) 2010, Robin Watts for Pinknoise Productions Ltd
@ All rights reserved.

@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions
@ are met:

@ * Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@ * Redistributions in binary form must reproduce the above
@ copyright notice, this list of conditions and the following disclaimer
@ in the documentation and/or other materials provided with the
@ distribution.
@ * Neither the names of the Xiph.org Foundation nor Pinknoise
@ Productions Ltd nor the names of its contributors may be used to
@ endorse or promote products derived from this software without
@ specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
@ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
@ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@ ----------------------------------------------------------------------

    .text

    @ full accuracy version

    .global mdct_backwardARM
    .global mdct_shift_right
    .global mdct_unroll_prelap
    .global mdct_unroll_part2
    .global mdct_unroll_part3
    .global mdct_unroll_postlap

    .type mdct_backwardARM, %function
    .type mdct_shift_right, %function
    .type mdct_unroll_prelap, %function
    .type mdct_unroll_part2, %function
    .type mdct_unroll_part3, %function
    .type mdct_unroll_postlap, %function

    .extern sincos_lookup0
    .extern sincos_lookup1
    .hidden sincos_lookup0
    .hidden sincos_lookup1

    @ clang doesn't support ADRL.
    @ Workaround based on that at https://bugs.llvm.org/show_bug.cgi?id=24350.
    .macro ADRL reg:req, label:req
    add \reg, pc, #((\label - .L_adrl_\@) & 0xff00)
    add \reg, \reg, #((\label - .L_adrl_\@) - ((\label - .L_adrl_\@) & 0xff00))
.L_adrl_\@:
    .endm
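    @ Rough sketch of how the macro expands (hypothetical label and offset,
    @ for illustration only): reading PC gives the current instruction
    @ address plus 8, and .L_adrl_\@ sits exactly 8 bytes after the first
    @ ADD, so the offset is taken relative to .L_adrl_\@.  For an offset of,
    @ say, 0x1234 the pair would assemble roughly as:
    @     add r7, pc, #0x1200          @ (offset & 0xff00)
    @     add r7, r7, #0x34            @ remaining low bits
    @ .L_adrl_0:
    @ Each piece fits an ARM 8-bit rotated immediate, which is why the
    @ offset is split this way.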
65
Gloria Wang79130732010-02-08 14:41:04 -080066mdct_unroll_prelap:
67 @ r0 = out
68 @ r1 = post
69 @ r2 = r
70 @ r3 = step
71 STMFD r13!,{r4-r7,r14}
72 MVN r4, #0x8000
73 MOV r3, r3, LSL #1
74 SUB r1, r2, r1 @ r1 = r - post
75 SUBS r1, r1, #16 @ r1 = r - post - 16
76 BLT unroll_over
77unroll_loop:
78 LDMDB r2!,{r5,r6,r7,r12}
79
80 MOV r5, r5, ASR #9 @ r5 = (*--r)>>9
81 MOV r6, r6, ASR #9 @ r6 = (*--r)>>9
82 MOV r7, r7, ASR #9 @ r7 = (*--r)>>9
83 MOV r12,r12,ASR #9 @ r12= (*--r)>>9
84
85 MOV r14,r12,ASR #15
86 TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
87 EORNE r12,r4, r14,ASR #31
88 STRH r12,[r0], r3
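    @ The four instructions above are the 16-bit saturation idiom used
    @ throughout this file.  r4 holds ~0x8000; EORing it with the sign of an
    @ out-of-range value leaves 0x7fff or 0x8000 in the low halfword, which
    @ is all that STRH keeps.  A loose C sketch of the idea (the helper name
    @ is illustrative only, not defined in this file):
    @     static inline int clip_to_15(int x) {
    @         if (x != (short)x)          /* does not fit in 16 bits */
    @             x = (x >> 31) ^ 0x7fff; /* +ve -> 0x7fff, -ve -> -0x8000 */
    @         return x;
    @     }
    @ The TEQ of (x>>15) against (x>>31) is a branch-free test for
    @ "x fits in a signed 16-bit halfword".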

    MOV r14,r7, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r7, r4, r14,ASR #31
    STRH r7, [r0], r3

    MOV r14,r6, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r6, r4, r14,ASR #31
    STRH r6, [r0], r3

    MOV r14,r5, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r5, r4, r14,ASR #31
    STRH r5, [r0], r3

    SUBS r1, r1, #16
    BGE unroll_loop

unroll_over:
    ADDS r1, r1, #16
    BLE unroll_end
unroll_loop2:
    LDR r5,[r2,#-4]!
    @ stall
    @ stall (Xscale)
    MOV r5, r5, ASR #9 @ r5 = (*--r)>>9
    MOV r14,r5, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r5, r4, r14,ASR #31
    STRH r5, [r0], r3
    SUBS r1, r1, #4
    BGT unroll_loop2
unroll_end:
    LDMFD r13!,{r4-r7,PC}

mdct_unroll_postlap:
    @ r0 = out
    @ r1 = post
    @ r2 = l
    @ r3 = step
    STMFD r13!,{r4-r7,r14}
    MVN r4, #0x8000
    MOV r3, r3, LSL #1
    SUB r1, r1, r2 @ r1 = post - l
    MOV r1, r1, ASR #1 @ r1 = (post - l)>>1
    SUBS r1, r1, #16 @ r1 = ((post - l)>>1) - 16
    BLT unroll_over3
unroll_loop3:
    LDR r12,[r2],#8
    LDR r7, [r2],#8
    LDR r6, [r2],#8
    LDR r5, [r2],#8

    RSB r12,r12,#0
    RSB r5, r5, #0
    RSB r6, r6, #0
    RSB r7, r7, #0

    MOV r12, r12,ASR #9 @ r12= (-*l)>>9
    MOV r5, r5, ASR #9 @ r5 = (-*l)>>9
    MOV r6, r6, ASR #9 @ r6 = (-*l)>>9
    MOV r7, r7, ASR #9 @ r7 = (-*l)>>9

    MOV r14,r12,ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r12,r4, r14,ASR #31
    STRH r12,[r0], r3

    MOV r14,r7, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r7, r4, r14,ASR #31
    STRH r7, [r0], r3

    MOV r14,r6, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r6, r4, r14,ASR #31
    STRH r6, [r0], r3

    MOV r14,r5, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r5, r4, r14,ASR #31
    STRH r5, [r0], r3

    SUBS r1, r1, #16
    BGE unroll_loop3

unroll_over3:
    ADDS r1, r1, #16
    BLE unroll_over4
unroll_loop4:
    LDR r5,[r2], #8
    @ stall
    @ stall (Xscale)
    RSB r5, r5, #0
    MOV r5, r5, ASR #9 @ r5 = (-*l)>>9
    MOV r14,r5, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r5, r4, r14,ASR #31
    STRH r5, [r0], r3
    SUBS r1, r1, #4
    BGT unroll_loop4
unroll_over4:
    LDMFD r13!,{r4-r7,PC}

mdct_unroll_part2:
    @ r0 = out
    @ r1 = post
    @ r2 = l
    @ r3 = r
    @ <> = step
    @ <> = wL
    @ <> = wR
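    @ "<>" marks arguments passed on the stack (picked up into r8-r10 below).
    @ Loosely, in C terms (identifier names here are illustrative only, not
    @ taken from this file), each iteration emits one windowed overlap-add
    @ sample:
    @     l -= 2;  r--;  wR--;
    @     a = (int)(((int64)*l * *wL++) >> 32);
    @     b = (int)(((int64)*r * *wR)   >> 32);
    @     *out = clip_to_15((a + b) >> 8);
    @     out += step;
    @ mdct_unroll_part3 below has the same shape but steps l and r forwards
    @ and subtracts the l*wL term instead of adding it.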
    MOV r12,r13
    STMFD r13!,{r4,r6-r11,r14}
    LDMFD r12,{r8,r9,r10} @ r8 = step
    @ r9 = wL
    @ r10= wR
    MVN r4, #0x8000
    MOV r8, r8, LSL #1
    SUBS r1, r3, r1 @ r1 = (r - post)
    BLE unroll_over5
unroll_loop5:
    LDR r12,[r2, #-8]! @ r12= *l (but l -= 2 first)
    LDR r11,[r9],#4 @ r11= *wL++
    LDR r7, [r3, #-4]! @ r7 = *--r
    LDR r6, [r10,#-4]! @ r6 = *--wR

    @ Can save a cycle here, at the cost of 1bit errors in rounding
    SMULL r14,r11,r12,r11 @ (r14,r11) = *l * *wL++
    SMULL r14,r6, r7, r6 @ (r14,r6) = *--r * *--wR
    ADD r6, r6, r11
    MOV r6, r6, ASR #8
    MOV r14,r6, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r6, r4, r14,ASR #31
    STRH r6, [r0], r8

    SUBS r1, r1, #4
    BGT unroll_loop5

unroll_over5:
    LDMFD r13!,{r4,r6-r11,PC}

mdct_unroll_part3:
    @ r0 = out
    @ r1 = post
    @ r2 = l
    @ r3 = r
    @ <> = step
    @ <> = wL
    @ <> = wR
    MOV r12,r13
    STMFD r13!,{r4,r6-r11,r14}
    LDMFD r12,{r8,r9,r10} @ r8 = step
    @ r9 = wL
    @ r10= wR
    MVN r4, #0x8000
    MOV r8, r8, LSL #1
    SUBS r1, r1, r3 @ r1 = (post - r)
    BLE unroll_over6
unroll_loop6:
    LDR r12,[r2],#8 @ r12= *l (but l += 2 first)
    LDR r11,[r9],#4 @ r11= *wL++
    LDR r7, [r3],#4 @ r7 = *r++
    LDR r6, [r10,#-4]! @ r6 = *--wR

    @ Can save a cycle here, at the cost of 1bit errors in rounding
    SMULL r14,r11,r12,r11 @ (r14,r11) = *l * *wL++
    SMULL r14,r6, r7, r6 @ (r14,r6) = *--r * *--wR
    SUB r6, r6, r11
    MOV r6, r6, ASR #8
    MOV r14,r6, ASR #15
    TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
    EORNE r6, r4, r14,ASR #31
    STRH r6, [r0], r8

    SUBS r1, r1, #4
    BGT unroll_loop6

unroll_over6:
    LDMFD r13!,{r4,r6-r11,PC}

mdct_shift_right:
    @ r0 = n
    @ r1 = in
    @ r2 = right
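    @ Roughly, in C (a sketch; variable names are illustrative only):
    @     n >>= 2;
    @     in += 1;                     /* start at the first odd-indexed word */
    @     for (i = 0; i < n; i++)
    @         right[i] = in[i << 1];   /* copy every other word */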
    STMFD r13!,{r4-r11,r14}

    MOV r0, r0, LSR #2 @ n >>= 2
    ADD r1, r1, #4

    SUBS r0, r0, #8
    BLT sr_less_than_8
sr_loop:
    LDR r3, [r1], #8
    LDR r4, [r1], #8
    LDR r5, [r1], #8
    LDR r6, [r1], #8
    LDR r7, [r1], #8
    LDR r8, [r1], #8
    LDR r12,[r1], #8
    LDR r14,[r1], #8
    SUBS r0, r0, #8
    STMIA r2!,{r3,r4,r5,r6,r7,r8,r12,r14}
    BGE sr_loop
sr_less_than_8:
    ADDS r0, r0, #8
    BEQ sr_end
sr_loop2:
    LDR r3, [r1], #8
    SUBS r0, r0, #1
    STR r3, [r2], #4
    BGT sr_loop2
sr_end:
    LDMFD r13!,{r4-r11,PC}

mdct_backwardARM:
    @ r0 = n
    @ r1 = in
    STMFD r13!,{r4-r11,r14}

    MOV r2,#1<<4 @ r2 = 1<<shift
    MOV r3,#13-4 @ r3 = 13-shift
find_shift_loop:
    TST r0,r2 @ if (n & (1<<shift)) == 0
    MOV r2,r2,LSL #1
    SUBEQ r3,r3,#1 @ shift--
    BEQ find_shift_loop
    MOV r2,#2
    MOV r2,r2,LSL r3 @ r2 = step = 2<<shift
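    @ The loop above just locates the set bit of n, so at this point
    @ (sketching in C, for the power-of-two sizes the MDCT uses):
    @     shift = 13 - log2(n);   /* r3 */
    @     step  = 2 << shift;     /* r2 */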

    @ presymmetry
    @ r0 = n (a multiple of 4)
    @ r1 = in
    @ r2 = step
    @ r3 = shift

    ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1)
    ADD r14,r1, r0 @ r14= in+(n>>2)
    SUB r4, r4, #3*4 @ r4 = aX = in+n2-3
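    @ .Lsincos_lookup (at the end of this file) holds PC-relative offsets to
    @ the external sincos_lookup0/1 tables rather than absolute addresses, so
    @ the loads below stay position-independent: ADRL gets the address of the
    @ literal, LDR fetches the stored offset, and the ADD turns it back into
    @ the table's absolute address.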
    ADRL r7, .Lsincos_lookup
    LDR r5, [r7] @ r5 = T=sincos_lookup0
    ADD r5, r7

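    @ The XPROD31/XNPROD31 comments below refer to the fixed-point multiply
    @ helpers used by the C reference code.  As a rough C sketch (Q31 maths,
    @ with MULT31(a,b) ~= (int)(((int64)a * b) >> 31)):
    @     XPROD31 (a, b, t, v, x, y): *x = MULT31(a,t) + MULT31(b,v);
    @                                 *y = MULT31(b,t) - MULT31(a,v);
    @     XNPROD31(a, b, t, v, x, y): *x = MULT31(a,t) - MULT31(b,v);
    @                                 *y = MULT31(b,t) + MULT31(a,v);
    @ The SMULL/SMLAL pairs keep the full 64-bit product and take the high
    @ word; the following LSL #1 turns that >>32 into the >>31 above.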
presymmetry_loop1:
    LDR r7, [r4,#8] @ r7 = s2 = aX[2]
    LDR r11,[r5,#4] @ r11= T[1]
    LDR r6, [r4] @ r6 = s0 = aX[0]
    LDR r10,[r5],r2,LSL #2 @ r10= T[0] T += step

    @ XPROD31(s0, s2, T[0], T[1], &aX[0], &aX[2])
    SMULL r8, r9, r7, r11 @ (r8, r9) = s2*T[1]
    @ stall
    @ stall ?
    SMLAL r8, r9, r6, r10 @ (r8, r9) += s0*T[0]
    RSB r6, r6, #0
    @ stall ?
    SMULL r8, r12,r7, r10 @ (r8, r12) = s2*T[0]
    MOV r9, r9, LSL #1
    @ stall ?
    SMLAL r8, r12,r6, r11 @ (r8, r12) -= s0*T[1]
    STR r9, [r4],#-16 @ aX[0] = r9
    CMP r4,r14
    MOV r12,r12,LSL #1
    STR r12,[r4,#8+16] @ aX[2] = r12

    BGE presymmetry_loop1 @ while (aX >= in+n4)

presymmetry_loop2:
    LDR r6,[r4] @ r6 = s0 = aX[0]
    LDR r10,[r5,#4] @ r10= T[1]
    LDR r7,[r4,#8] @ r7 = s2 = aX[2]
    LDR r11,[r5],-r2,LSL #2 @ r11= T[0] T -= step

    @ XPROD31(s0, s2, T[1], T[0], &aX[0], &aX[2])
    SMULL r8, r9, r6, r10 @ (r8, r9) = s0*T[1]
    @ stall
    @ stall ?
    SMLAL r8, r9, r7, r11 @ (r8, r9) += s2*T[0]
    RSB r6, r6, #0
    @ stall ?
    SMULL r8, r12,r7, r10 @ (r8, r12) = s2*T[1]
    MOV r9, r9, LSL #1
    @ stall ?
    SMLAL r8, r12,r6, r11 @ (r8, r12) -= s0*T[0]
    STR r9, [r4],#-16 @ aX[0] = r9
    CMP r4,r1
    MOV r12,r12,LSL #1
    STR r12,[r4,#8+16] @ aX[2] = r12

    BGE presymmetry_loop2 @ while (aX >= in)

    @ r0 = n
    @ r1 = in
    @ r2 = step
    @ r3 = shift
    STMFD r13!,{r3}
    ADRL r4, .Lsincos_lookup
    LDR r5, [r4] @ r5 = T=sincos_lookup0
    ADD r5, r4
    ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1)
    SUB r4, r4, #4*4 @ r4 = aX = in+(n>>1)-4
    LDR r11,[r5,#4] @ r11= T[1]
    LDR r10,[r5],r2, LSL #2 @ r10= T[0] T += step
presymmetry_loop3:
    LDR r8,[r1],#16 @ r8 = ro0 = bX[0]
    LDR r9,[r1,#8-16] @ r9 = ro2 = bX[2]
    LDR r6,[r4] @ r6 = ri0 = aX[0]

    @ XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] )
    @ aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31
    SMULL r14,r12,r8, r11 @ (r14,r12) = ro0*T[1]
    RSB r8,r8,#0 @ r8 = -ro0
    @ Stall ?
    SMLAL r14,r12,r9, r10 @ (r14,r12) += ro2*T[0]
    LDR r7,[r4,#8] @ r7 = ri2 = aX[2]
    @ Stall ?
    SMULL r14,r3, r9, r11 @ (r14,r3) = ro2*T[1]
    MOV r12,r12,LSL #1
    LDR r11,[r5,#4] @ r11= T[1]
    SMLAL r14,r3, r8, r10 @ (r14,r3) -= ro0*T[0]
    LDR r10,[r5],r2, LSL #2 @ r10= T[0] T += step
    STR r12,[r4,#8]
    MOV r3, r3, LSL #1
    STR r3, [r4],#-16

    @ XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] )
    @ bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31
    SMULL r14,r12,r6, r10 @ (r14,r12) = ri0*T[0]
    RSB r6,r6,#0 @ r6 = -ri0
    @ stall ?
    SMLAL r14,r12,r7, r11 @ (r14,r12) += ri2*T[1]
    @ stall ?
    @ stall ?
    SMULL r14,r3, r7, r10 @ (r14,r3) = ri2*T[0]
    MOV r12,r12,LSL #1
    @ stall ?
    SMLAL r14,r3, r6, r11 @ (r14,r3) -= ri0*T[1]
    CMP r4,r1
    STR r12,[r1,#8-16]
    MOV r3, r3, LSL #1
    STR r3, [r1,#-16]

    BGE presymmetry_loop3

    SUB r1,r1,r0 @ r1 = in -= n>>2 (i.e. restore in)

    LDR r3,[r13]
    STR r2,[r13,#-4]!

    @ mdct_butterflies
    @ r0 = n = (points * 2)
    @ r1 = in = x
    @ r2 = i
    @ r3 = shift
    STMFD r13!,{r0-r1}
    ADRL r4, .Lsincos_lookup
    LDR r5, [r4]
    ADD r5, r4
    RSBS r4,r3,#6 @ r4 = stages = 7-shift then --stages
    BLE no_generics
    MOV r14,#4 @ r14= 4 (i=0)
    MOV r6, r14,LSL r3 @ r6 = (4<<i)<<shift
mdct_butterflies_loop1:
    MOV r0, r0, LSR #1 @ r0 = points>>i = POINTS
    MOV r2, r14,LSR #2 @ r2 = (1<<i)-j (j=0)
    STMFD r13!,{r4,r14}
mdct_butterflies_loop2:

    @ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift))
    @ mdct_butterfly_generic(r1, r0, r6)
    @ r0 = points
    @ r1 = x
    @ preserve r2 (external loop counter)
    @ preserve r3
    @ preserve r4 (external loop counter)
    @ r5 = T = sincos_lookup0
    @ r6 = step
    @ preserve r14

    STR r2,[r13,#-4]! @ stack r2
    ADD r1,r1,r0,LSL #1 @ r1 = x2+4 = x + (POINTS>>1)
    ADD r7,r1,r0,LSL #1 @ r7 = x1+4 = x + POINTS
    ADD r12,r5,#1024*4 @ r12= sincos_lookup0+1024

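    @ r12 marks the end of the first 1024 table entries: loop1 below steps the
    @ trig pointer T forwards until it reaches this limit, then r12 is pulled
    @ back and loop2 walks T back down, with the sign handling adjusted
    @ (XPROD31 in loop1, XNPROD31 in loop2).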
mdct_bufferfly_generic_loop1:
    LDMDB r7!,{r2,r3,r8,r11} @ r2 = x1[0]
    @ r3 = x1[1]
    @ r8 = x1[2]
    @ r11= x1[3] x1 -= 4
    LDMDB r1!,{r4,r9,r10,r14} @ r4 = x2[0]
    @ r9 = x2[1]
    @ r10= x2[2]
    @ r14= x2[3] x2 -= 4

    SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1]
    ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0])
    SUB r11,r11,r8 @ r11= s1 = x1[3] - x1[2]
    ADD r8, r11,r8, LSL #1 @ r8 = x1[3] + x1[2] (-> x1[2])
    SUB r9, r9, r4 @ r9 = s2 = x2[1] - x2[0]
    ADD r4, r9, r4, LSL #1 @ r4 = x2[1] + x2[0] (-> x1[1])
    SUB r14,r14,r10 @ r14= s3 = x2[3] - x2[2]
    ADD r10,r14,r10,LSL #1 @ r10= x2[3] + x2[2] (-> x1[3])
    STMIA r7,{r3,r4,r8,r10}

    @ r0 = points
    @ r1 = x2
    @ r2 = s0
    @ r3 free
    @ r4 free
    @ r5 = T
    @ r6 = step
    @ r7 = x1
    @ r8 free
    @ r9 = s2
    @ r10 free
    @ r11= s1
    @ r12= limit
    @ r14= s3

    LDR r8, [r5,#4] @ r8 = T[1]
    LDR r10,[r5],r6,LSL #2 @ r10= T[0] T += step

    @ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2])
    @ x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31
    @ stall Xscale
    SMULL r4, r3, r2, r8 @ (r4, r3) = s0*T[1]
    SMLAL r4, r3, r11,r10 @ (r4, r3) += s1*T[0]
    RSB r11,r11,#0
    SMULL r11,r4, r8, r11 @ (r11,r4) = -s1*T[1]
    SMLAL r11,r4, r2, r10 @ (r11,r4) += s0*T[0]
    MOV r2, r3, LSL #1 @ r2 = r3<<1 = Value for x2[0]

    @ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3])
    @ x2[1] = (s2*T[0] + s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31
    SMULL r11,r3, r9, r10 @ (r11,r3) = s2*T[0]
    MOV r4, r4, LSL #1 @ r4 = r4<<1 = Value for x2[2]
    SMLAL r11,r3, r14,r8 @ (r11,r3) += s3*T[1]
    RSB r9, r9, #0
    SMULL r10,r11,r14,r10 @ (r10,r11) = s3*T[0]
    MOV r3, r3, LSL #1 @ r3 = r3<<1 = Value for x2[1]
    SMLAL r10,r11,r9,r8 @ (r10,r11) -= s2*T[1]
    CMP r5, r12
    MOV r11,r11,LSL #1 @ r11= r11<<1 = Value for x2[3]

    STMIA r1,{r2,r3,r4,r11}

    BLT mdct_bufferfly_generic_loop1

    SUB r12,r12,#1024*4
mdct_bufferfly_generic_loop2:
    LDMDB r7!,{r2,r3,r9,r10} @ r2 = x1[0]
    @ r3 = x1[1]
    @ r9 = x1[2]
    @ r10= x1[3] x1 -= 4
    LDMDB r1!,{r4,r8,r11,r14} @ r4 = x2[0]
    @ r8 = x2[1]
    @ r11= x2[2]
    @ r14= x2[3] x2 -= 4

    SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1]
    ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0])
    SUB r9, r9,r10 @ r9 = s1 = x1[2] - x1[3]
    ADD r10,r9,r10, LSL #1 @ r10= x1[2] + x1[3] (-> x1[2])
    SUB r4, r4, r8 @ r4 = s2 = x2[0] - x2[1]
    ADD r8, r4, r8, LSL #1 @ r8 = x2[0] + x2[1] (-> x1[1])
    SUB r14,r14,r11 @ r14= s3 = x2[3] - x2[2]
    ADD r11,r14,r11,LSL #1 @ r11= x2[3] + x2[2] (-> x1[3])
    STMIA r7,{r3,r8,r10,r11}

    @ r0 = points
    @ r1 = x2
    @ r2 = s0
    @ r3 free
    @ r4 = s2
    @ r5 = T
    @ r6 = step
    @ r7 = x1
    @ r8 free
    @ r9 = s1
    @ r10 free
    @ r11 free
    @ r12= limit
    @ r14= s3

    LDR r8, [r5,#4] @ r8 = T[1]
    LDR r10,[r5],-r6,LSL #2 @ r10= T[0] T -= step

    @ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2])
    @ x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31
    @ stall Xscale
    SMULL r3, r11,r2, r8 @ (r3, r11) = s0*T[1]
    SMLAL r3, r11,r9, r10 @ (r3, r11) += s1*T[0]
    RSB r9, r9, #0
    SMULL r3, r2, r10,r2 @ (r3, r2) = s0*T[0]
    SMLAL r3, r2, r9, r8 @ (r3, r2) += -s1*T[1]
    MOV r9, r11,LSL #1 @ r9 = r11<<1 = Value for x2[2]

    @ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3])
    @ x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + s3*T[1])>>31
    SMULL r3, r11,r4, r10 @ (r3,r11) = s2*T[0]
    MOV r2, r2, LSL #1 @ r2 = r2<<1 = Value for x2[0]
    SMLAL r3, r11,r14,r8 @ (r3,r11) += s3*T[1]
    RSB r4, r4, #0
    SMULL r10,r3,r14,r10 @ (r10,r3) = s3*T[0]
    MOV r11,r11,LSL #1 @ r11= r11<<1 = Value for x2[3]
    SMLAL r10,r3, r4, r8 @ (r10,r3) -= s2*T[1]
    CMP r5, r12
    MOV r3, r3, LSL #1 @ r3 = r3<<1 = Value for x2[1]

    STMIA r1,{r2,r3,r9,r11}

    BGT mdct_bufferfly_generic_loop2

    LDR r2,[r13],#4 @ unstack r2
    ADD r1, r1, r0, LSL #2 @ r1 = x+POINTS*j
    @ stall Xscale
    SUBS r2, r2, #1 @ r2-- (j++)
    BGT mdct_butterflies_loop2

    LDMFD r13!,{r4,r14}

    LDR r1,[r13,#4]

    SUBS r4, r4, #1 @ stages--
    MOV r14,r14,LSL #1 @ r14= 4<<i (i++)
    MOV r6, r6, LSL #1 @ r6 = step <<= 1 (i++)
    BGE mdct_butterflies_loop1
    LDMFD r13,{r0-r1}
no_generics:
    @ mdct_butterflies part2 (loop around mdct_bufferfly_32)
    @ r0 = points
    @ r1 = in
    @ r2 = step
    @ r3 = shift

mdct_bufferflies_loop3:
    @ mdct_bufferfly_32

    @ block1
    ADD r4, r1, #16*4 @ r4 = &in[16]
    LDMIA r4,{r5,r6,r9,r10} @ r5 = x[16]
    @ r6 = x[17]
    @ r9 = x[18]
    @ r10= x[19]
    LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0]
    @ r8 = x[1]
    @ r11= x[2]
    @ r12= x[3]
    SUB r5, r5, r6 @ r5 = s0 = x[16] - x[17]
    ADD r6, r5, r6, LSL #1 @ r6 = x[16] + x[17] -> x[16]
    SUB r9, r9, r10 @ r9 = s1 = x[18] - x[19]
    ADD r10,r9, r10,LSL #1 @ r10= x[18] + x[19] -> x[18]
    SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0]
    ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[17]
    SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2]
    ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[19]
    STMIA r4!,{r6,r7,r10,r11}

    LDR r6,cPI1_8
    LDR r7,cPI3_8

    @ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] )
    @ x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8
    @ stall Xscale
    SMULL r14,r11,r5, r6 @ (r14,r11) = s0*cPI1_8
    SMLAL r14,r11,r9, r7 @ (r14,r11) += s1*cPI3_8
    RSB r9, r9, #0
    SMULL r14,r5, r7, r5 @ (r14,r5) = s0*cPI3_8
    SMLAL r14,r5, r9, r6 @ (r14,r5) -= s1*cPI1_8
    MOV r11,r11,LSL #1
    MOV r5, r5, LSL #1

    @ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] )
    @ x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8
    SMULL r14,r9, r8, r6 @ (r14,r9) = s2*cPI1_8
    SMLAL r14,r9, r12,r7 @ (r14,r9) += s3*cPI3_8
    RSB r8,r8,#0
    SMULL r14,r12,r6, r12 @ (r14,r12) = s3*cPI1_8
    SMLAL r14,r12,r8, r7 @ (r14,r12) -= s2*cPI3_8
    MOV r9, r9, LSL #1
    MOV r12,r12,LSL #1
    STMIA r1!,{r5,r9,r11,r12}

    @ block2
    LDMIA r4,{r5,r6,r9,r10} @ r5 = x[20]
    @ r6 = x[21]
    @ r9 = x[22]
    @ r10= x[23]
    LDMIA r1,{r7,r8,r11,r12} @ r7 = x[4]
    @ r8 = x[5]
    @ r11= x[6]
    @ r12= x[7]
    SUB r5, r5, r6 @ r5 = s0 = x[20] - x[21]
    ADD r6, r5, r6, LSL #1 @ r6 = x[20] + x[21] -> x[20]
    SUB r9, r9, r10 @ r9 = s1 = x[22] - x[23]
    ADD r10,r9, r10,LSL #1 @ r10= x[22] + x[23] -> x[22]
    SUB r8, r8, r7 @ r8 = s2 = x[ 5] - x[ 4]
    ADD r7, r8, r7, LSL #1 @ r7 = x[ 5] + x[ 4] -> x[21]
    SUB r12,r12,r11 @ r12= s3 = x[ 7] - x[ 6]
    ADD r11,r12,r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[23]
    LDR r14,cPI2_8
    STMIA r4!,{r6,r7,r10,r11}

    SUB r5, r5, r9 @ r5 = s0 - s1
    ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1
    SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8
    SUB r12,r12,r8 @ r12= s3 - s2
    ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2

    SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8
    MOV r5, r5, LSL #1
    SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8
    MOV r8, r8, LSL #1
    SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8
    MOV r9, r9, LSL #1
    MOV r12,r12,LSL #1
    STMIA r1!,{r5,r8,r9,r12}

    @ block3
    LDMIA r4,{r5,r6,r9,r10} @ r5 = x[24]
    @ r6 = x[25]
    @ r9 = x[26]
    @ r10= x[27]
    LDMIA r1,{r7,r8,r11,r12} @ r7 = x[8]
    @ r8 = x[9]
    @ r11= x[10]
    @ r12= x[11]
    SUB r5, r5, r6 @ r5 = s0 = x[24] - x[25]
    ADD r6, r5, r6, LSL #1 @ r6 = x[24] + x[25] -> x[24]
    SUB r9, r9, r10 @ r9 = s1 = x[26] - x[27]
    ADD r10,r9, r10,LSL #1 @ r10= x[26] + x[27] -> x[26]
    SUB r8, r8, r7 @ r8 = s2 = x[ 9] - x[ 8]
    ADD r7, r8, r7, LSL #1 @ r7 = x[ 9] + x[ 8] -> x[25]
    SUB r12,r12,r11 @ r12= s3 = x[11] - x[10]
    ADD r11,r12,r11, LSL #1 @ r11= x[11] + x[10] -> x[27]
    STMIA r4!,{r6,r7,r10,r11}

    LDR r6,cPI3_8
    LDR r7,cPI1_8

    @ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] )
    @ x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8
    @ stall Xscale
    SMULL r14,r11,r5, r6 @ (r14,r11) = s0*cPI3_8
    SMLAL r14,r11,r9, r7 @ (r14,r11) += s1*cPI1_8
    RSB r9, r9, #0
    SMULL r14,r5, r7, r5 @ (r14,r5) = s0*cPI1_8
    SMLAL r14,r5, r9, r6 @ (r14,r5) -= s1*cPI3_8
    MOV r11,r11,LSL #1
    MOV r5, r5, LSL #1

    @ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] )
    @ x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8
    SMULL r14,r9, r8, r6 @ (r14,r9) = s2*cPI3_8
    SMLAL r14,r9, r12,r7 @ (r14,r9) += s3*cPI1_8
    RSB r8,r8,#0
    SMULL r14,r12,r6, r12 @ (r14,r12) = s3*cPI3_8
    SMLAL r14,r12,r8, r7 @ (r14,r12) -= s2*cPI1_8
    MOV r9, r9, LSL #1
    MOV r12,r12,LSL #1
    STMIA r1!,{r5,r9,r11,r12}

    @ block4
    LDMIA r4,{r5,r6,r10,r11} @ r5 = x[28]
    @ r6 = x[29]
    @ r10= x[30]
    @ r11= x[31]
    LDMIA r1,{r8,r9,r12,r14} @ r8 = x[12]
    @ r9 = x[13]
    @ r12= x[14]
    @ r14= x[15]
    SUB r5, r5, r6 @ r5 = s0 = x[28] - x[29]
    ADD r6, r5, r6, LSL #1 @ r6 = x[28] + x[29] -> x[28]
    SUB r7, r14,r12 @ r7 = s3 = x[15] - x[14]
    ADD r12,r7, r12, LSL #1 @ r12= x[15] + x[14] -> x[31]
    SUB r10,r10,r11 @ r10= s1 = x[30] - x[31]
    ADD r11,r10,r11,LSL #1 @ r11= x[30] + x[31] -> x[30]
    SUB r14, r8, r9 @ r14= s2 = x[12] - x[13]
    ADD r9, r14, r9, LSL #1 @ r9 = x[12] + x[13] -> x[29]
    STMIA r4!,{r6,r9,r11,r12}
    STMIA r1!,{r5,r7,r10,r14}

    @ mdct_butterfly16 (1st version)
    @ block 1
    SUB r1,r1,#16*4
    ADD r4,r1,#8*4
    LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8]
    @ r6 = x[ 9]
    @ r9 = x[10]
    @ r10= x[11]
    LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0]
    @ r8 = x[1]
    @ r11= x[2]
    @ r12= x[3]
    SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9]
    ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8]
    SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11]
    ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10]
    SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0]
    ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9]
    SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2]
    ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11]
    LDR r14,cPI2_8
    STMIA r4!,{r6,r7,r10,r11}

    SUB r5, r5, r9 @ r5 = s0 - s1
    ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1
    SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8
    SUB r12,r12,r8 @ r12= s3 - s2
    ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2

    SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8
    MOV r5, r5, LSL #1
    SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8
    MOV r8, r8, LSL #1
    SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8
    MOV r9, r9, LSL #1
    MOV r12,r12,LSL #1
    STMIA r1!,{r5,r8,r9,r12}

    @ block4
    LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12]
    @ r6 = x[13]
    @ r9 = x[14]
    @ r10= x[15]
    LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4]
    @ r8 = x[ 5]
    @ r11= x[ 6]
    @ r12= x[ 7]
    SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5]
    ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13]
    SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6]
    ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15]
    SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13]
    ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12]
    SUB r12,r9, r10 @ r12= s3 = x[14] - x[15]
    ADD r10,r12,r10,LSL #1 @ r10= x[14] + x[15] -> x[14]
    STMIA r4!,{r6,r8,r10,r11}
    STMIA r1!,{r5,r7,r12,r14}

    @ mdct_butterfly_8
    LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14}
    @ r6 = x[0]
    @ r7 = x[1]
    @ r8 = x[2]
    @ r9 = x[3]
    @ r10= x[4]
    @ r11= x[5]
    @ r12= x[6]
    @ r14= x[7]
    ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
    SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
    ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
    SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
    ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
    SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
    ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
    SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]

    ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
    SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
    SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
    ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
    SUB r10,r10,r6 @ r10= x[4] = s4 - s0
    SUB r11,r12,r8 @ r11= x[5] = s6 - s2
    ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
    ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
    STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14}

    @ mdct_butterfly_8
    LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14}
    @ r6 = x[0]
    @ r7 = x[1]
    @ r8 = x[2]
    @ r9 = x[3]
    @ r10= x[4]
    @ r11= x[5]
    @ r12= x[6]
    @ r14= x[7]
    ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
    SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
    ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
    SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
    ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
    SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
    ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
    SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]

    ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
    SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
    SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
    ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
    SUB r10,r10,r6 @ r10= x[4] = s4 - s0
    SUB r11,r12,r8 @ r11= x[5] = s6 - s2
    ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
    ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
    STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14}

    @ block 2
    ADD r1,r1,#16*4-8*4
    ADD r4,r1,#8*4
    LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8]
    @ r6 = x[ 9]
    @ r9 = x[10]
    @ r10= x[11]
    LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0]
    @ r8 = x[1]
    @ r11= x[2]
    @ r12= x[3]
    SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9]
    ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8]
    SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11]
    ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10]
    SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0]
    ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9]
    SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2]
    ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11]
    LDR r14,cPI2_8
    STMIA r4!,{r6,r7,r10,r11}

    SUB r5, r5, r9 @ r5 = s0 - s1
    ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1
    SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8
    SUB r12,r12,r8 @ r12= s3 - s2
    ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2

    SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8
    MOV r5, r5, LSL #1
    SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8
    MOV r8, r8, LSL #1
    SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8
    MOV r9, r9, LSL #1
    MOV r12,r12,LSL #1
    STMIA r1!,{r5,r8,r9,r12}

    @ block4
    LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12]
    @ r6 = x[13]
    @ r9 = x[14]
    @ r10= x[15]
    LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4]
    @ r8 = x[ 5]
    @ r11= x[ 6]
    @ r12= x[ 7]
    SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13]
    ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12]
    SUB r9, r9, r10 @ r9 = s3 = x[14] - x[15]
    ADD r10,r9, r10,LSL #1 @ r10= x[14] + x[15] -> x[14]
    SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5]
    ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13]
    SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6]
    ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15]
    STMIA r4!,{r6,r8,r10,r11}
    STMIA r1!,{r5,r7,r9,r14}

    @ mdct_butterfly_8
    LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14}
    @ r6 = x[0]
    @ r7 = x[1]
    @ r8 = x[2]
    @ r9 = x[3]
    @ r10= x[4]
    @ r11= x[5]
    @ r12= x[6]
    @ r14= x[7]
    ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
    SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
    ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
    SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
    ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
    SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
    ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
    SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]

    ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
    SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
    SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
    ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
    SUB r10,r10,r6 @ r10= x[4] = s4 - s0
    SUB r11,r12,r8 @ r11= x[5] = s6 - s2
    ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
    ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
    STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14}

    @ mdct_butterfly_8
    LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14}
    @ r6 = x[0]
    @ r7 = x[1]
    @ r8 = x[2]
    @ r9 = x[3]
    @ r10= x[4]
    @ r11= x[5]
    @ r12= x[6]
    @ r14= x[7]
    ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
    SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
    ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
    SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
    ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
    SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
    ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
    SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]

    ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
    SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
    SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
    ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
    SUB r10,r10,r6 @ r10= x[4] = s4 - s0
    SUB r11,r12,r8 @ r11= x[5] = s6 - s2
    ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
    ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
    STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14}

    ADD r1,r1,#8*4
    SUBS r0,r0,#64
    BGT mdct_bufferflies_loop3

    LDMFD r13,{r0-r3}

mdct_bitreverseARM:
    @ r0 = points = n
    @ r1 = in
    @ r2 = step
    @ r3 = shift

    MOV r4, #0 @ r4 = bit = 0
    ADD r5, r1, r0, LSL #1 @ r5 = w = x + (n>>1)
    ADR r6, bitrev
    SUB r5, r5, #8
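    @ Loose C sketch of the swap loop below (names are illustrative only):
    @     for (bit = 0; w > x; bit++, w -= 2) {
    @         b = (bitrev[bit >> 6] | (bitrev[bit & 0x3f] << 6)) >> shift;
    @         xx = x + b;
    @         if (w > xx) {          /* swap each pair only once */
    @             swap(w[0], xx[0]);
    @             swap(w[1], xx[1]);
    @         }
    @     }
    @ bitrev[] holds the 6-bit reversal of 0..63, so ORing two lookups gives
    @ a 12-bit bit-reversed index before the >>shift.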
brev_lp:
    LDRB r7, [r6, r4, LSR #6]
    AND r8, r4, #0x3f
    LDRB r8, [r6, r8]
    ADD r4, r4, #1 @ bit++
    @ stall XScale
    ORR r7, r7, r8, LSL #6 @ r7 = bitrev[bit]
    MOV r7, r7, LSR r3
    ADD r9, r1, r7, LSL #2 @ r9 = xx = x + (b>>shift)
    CMP r5, r9 @ if (w > xx)
    LDR r10,[r5],#-8 @ r10 = w[0] w -= 2
    LDRGT r11,[r5,#12] @ r11 = w[1]
    LDRGT r12,[r9] @ r12 = xx[0]
    LDRGT r14,[r9,#4] @ r14 = xx[1]
    STRGT r10,[r9] @ xx[0]= w[0]
    STRGT r11,[r9,#4] @ xx[1]= w[1]
    STRGT r12,[r5,#8] @ w[0] = xx[0]
    STRGT r14,[r5,#12] @ w[1] = xx[1]
    CMP r5,r1
    BGT brev_lp

    @ mdct_step7
    @ r0 = points
    @ r1 = in
    @ r2 = step
    @ r3 = shift

    CMP r2, #4 @ r5 = T = (step>=4) ?
    ADR r7, .Lsincos_lookup @ sincos_lookup0 +
    ADDLT r7, #4 @ sincos_lookup1
    LDR r5, [r7]
    ADD r5, r7
    ADD r7, r1, r0, LSL #1 @ r7 = w1 = x + (n>>1)
    ADDGE r5, r5, r2, LSL #1 @ (step>>1)
    ADD r8, r5, #1024*4 @ r8 = Ttop
step7_loop1:
    LDR r6, [r1] @ r6 = w0[0]
    LDR r9, [r1,#4] @ r9 = w0[1]
    LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2
    LDR r11,[r7,#4] @ r11= w1[1]
    LDR r14,[r5,#4] @ r14= T[1]
    LDR r12,[r5],r2,LSL #2 @ r12= T[0] T += step

    ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0]
    SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0]
    SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1]
    ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1]

    @ Can save 1 cycle by using SMULL SMLAL - at the cost of being
    @ 1 off.
    SMULL r0, r3, r6, r14 @ (r0,r3) = s0*T[1]
    SMULL r0, r4, r11,r12 @ (r0,r4) += s1*T[0] = s2
    ADD r3, r3, r4
    SMULL r0, r14,r11,r14 @ (r0,r14) = s1*T[1]
    SMULL r0, r12,r6, r12 @ (r0,r12) += s0*T[0] = s3
    SUB r14,r14,r12

    @ r9 = s0b<<1
    @ r10= s1b<<1
    ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2
    SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2

    SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b
    ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b
    STR r9, [r1],#4
    STR r10,[r1],#4 @ w0 += 2
    STR r3, [r7]
    STR r12,[r7,#4]

    CMP r5,r8
    BLT step7_loop1

step7_loop2:
    LDR r6, [r1] @ r6 = w0[0]
    LDR r9, [r1,#4] @ r9 = w0[1]
    LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2
    LDR r11,[r7,#4] @ r11= w1[1]
    LDR r14,[r5,-r2,LSL #2]! @ r14= T[0] T -= step
    LDR r12,[r5,#4] @ r12= T[1]

    ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0]
    SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0]
    SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1]
    ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1]

    @ Can save 1 cycle by using SMULL SMLAL - at the cost of being
    @ 1 off.
    SMULL r0, r3, r6, r14 @ (r0,r3) = s0*T[0]
    SMULL r0, r4, r11,r12 @ (r0,r4) += s1*T[1] = s2
    ADD r3, r3, r4
    SMULL r0, r14,r11,r14 @ (r0,r14) = s1*T[0]
    SMULL r0, r12,r6, r12 @ (r0,r12) += s0*T[1] = s3
    SUB r14,r14,r12

    @ r9 = s0b<<1
    @ r10= s1b<<1
    ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2
    SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2

    SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b
    ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b
    STR r9, [r1],#4
    STR r10,[r1],#4 @ w0 += 2
    STR r3, [r7]
    STR r12,[r7,#4]

    CMP r1,r7
    BLT step7_loop2

    LDMFD r13!,{r0-r3}

    @ r0 = points
    @ r1 = in
    @ r2 = step
    @ r3 = shift
    MOV r2, r2, ASR #2 @ r2 = step >>= 2
    CMP r2, #0
    CMPNE r2, #1
    BEQ mdct_end

    @ step > 1 (default case)
    CMP r2, #4 @ r5 = T = (step>=4) ?
    ADR r7, .Lsincos_lookup @ sincos_lookup0 +
    ADDLT r7, #4 @ sincos_lookup1
    LDR r5, [r7]
    ADD r5, r7
    ADD r7, r1, r0, LSL #1 @ r7 = iX = x + (n>>1)
    ADDGE r5, r5, r2, LSL #1 @ (step>>1)
mdct_step8_default:
    LDR r6, [r1],#4 @ r6 = s0 = x[0]
    LDR r8, [r1],#4 @ r8 = -s1 = x[1]
    LDR r12,[r5,#4] @ r12= T[1]
    LDR r14,[r5],r2,LSL #2 @ r14= T[0] T += step
    RSB r8, r8, #0 @ r8 = s1

    @ XPROD31(s0, s1, T[0], T[1], x, x+1)
    @ x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1]
    SMULL r9, r10, r8, r12 @ (r9,r10) = s1 * T[1]
    CMP r1, r7
    SMLAL r9, r10, r6, r14 @ (r9,r10) += s0 * T[0]
    RSB r6, r6, #0 @ r6 = -s0
    SMULL r9, r11, r8, r14 @ (r9,r11) = s1 * T[0]
    MOV r10,r10,LSL #1
    SMLAL r9, r11, r6, r12 @ (r9,r11) -= s0 * T[1]
    STR r10,[r1,#-8]
    MOV r11,r11,LSL #1
    STR r11,[r1,#-4]
    BLT mdct_step8_default

mdct_end:
    MOV r0, r2
    LDMFD r13!,{r4-r11,PC}

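    @ Fixed-point twiddle constants, roughly cos(n*pi/8) in Q31:
    @ cPI1_8 ~ cos(pi/8), cPI2_8 ~ cos(pi/4) (= sqrt(2)/2), cPI3_8 ~ cos(3pi/8).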
cPI1_8:
    .word 0x7641af3d
cPI2_8:
    .word 0x5a82799a
cPI3_8:
    .word 0x30fbc54d
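    @ bitrev[] below is the 6-bit bit-reversal table: bitrev[i] is i with its
    @ low six bits reversed (0->0, 1->32, 2->16, 3->48, ...).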
bitrev:
    .byte 0
    .byte 32
    .byte 16
    .byte 48
    .byte 8
    .byte 40
    .byte 24
    .byte 56
    .byte 4
    .byte 36
    .byte 20
    .byte 52
    .byte 12
    .byte 44
    .byte 28
    .byte 60
    .byte 2
    .byte 34
    .byte 18
    .byte 50
    .byte 10
    .byte 42
    .byte 26
    .byte 58
    .byte 6
    .byte 38
    .byte 22
    .byte 54
    .byte 14
    .byte 46
    .byte 30
    .byte 62
    .byte 1
    .byte 33
    .byte 17
    .byte 49
    .byte 9
    .byte 41
    .byte 25
    .byte 57
    .byte 5
    .byte 37
    .byte 21
    .byte 53
    .byte 13
    .byte 45
    .byte 29
    .byte 61
    .byte 3
    .byte 35
    .byte 19
    .byte 51
    .byte 11
    .byte 43
    .byte 27
    .byte 59
    .byte 7
    .byte 39
    .byte 23
    .byte 55
    .byte 15
    .byte 47
    .byte 31
    .byte 63

.Lsincos_lookup:
    .word sincos_lookup0-.Lsincos_lookup
    .word sincos_lookup1-(.Lsincos_lookup+4)

    @ END