@ Tremolo library
@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd

        .text

        @ full accuracy version

        .global mdct_backwardARM
        .global mdct_shift_right
        .global mdct_unroll_prelap
        .global mdct_unroll_part2
        .global mdct_unroll_part3
        .global mdct_unroll_postlap

        .extern sincos_lookup0
        .extern sincos_lookup1

mdct_unroll_prelap:
        @ r0 = out
        @ r1 = post
        @ r2 = r
        @ r3 = step
        STMFD   r13!,{r4-r7,r14}
        MVN     r4, #0x8000
        MOV     r3, r3, LSL #1
        SUB     r1, r2, r1           @ r1 = r - post
        SUBS    r1, r1, #16          @ r1 = r - post - 16
        BLT     unroll_over
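        @ The loop below scales each sample by >>9 and saturates it to a
        @ signed 16-bit range. The MOV/TEQ/EORNE sequence is a branchless
        @ clip: if the top 17 bits of the shifted value are not all equal,
        @ the value is out of range and is replaced with 0x7fff or 0x8000
        @ (derived from r4 = ~0x8000) according to its sign. Roughly, as a
        @ C sketch of the intent (not the reference source):
        @   val = (*--r) >> 9;
        @   if (val != (short)val) val = (val < 0) ? -32768 : 32767;
        @   *out = val;  out += step;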
unroll_loop:
        LDMDB   r2!,{r5,r6,r7,r12}

        MOV     r5, r5, ASR #9       @ r5 = (*--r)>>9
        MOV     r6, r6, ASR #9       @ r6 = (*--r)>>9
        MOV     r7, r7, ASR #9       @ r7 = (*--r)>>9
        MOV     r12,r12,ASR #9       @ r12= (*--r)>>9

        MOV     r14,r12,ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r12,r4, r14,ASR #31
        STRH    r12,[r0], r3

        MOV     r14,r7, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r7, r4, r14,ASR #31
        STRH    r7, [r0], r3

        MOV     r14,r6, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r6, r4, r14,ASR #31
        STRH    r6, [r0], r3

        MOV     r14,r5, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r5, r4, r14,ASR #31
        STRH    r5, [r0], r3

        SUBS    r1, r1, #16
        BGE     unroll_loop

unroll_over:
        ADDS    r1, r1, #16
        BLE     unroll_end
unroll_loop2:
        LDR     r5,[r2,#-4]!
        @ stall
        @ stall (Xscale)
        MOV     r5, r5, ASR #9       @ r5 = (*--r)>>9
        MOV     r14,r5, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r5, r4, r14,ASR #31
        STRH    r5, [r0], r3
        SUBS    r1, r1, #4
        BGT     unroll_loop2
unroll_end:
        LDMFD   r13!,{r4-r7,PC}

mdct_unroll_postlap:
        @ r0 = out
        @ r1 = post
        @ r2 = l
        @ r3 = step
        STMFD   r13!,{r4-r7,r14}
        MVN     r4, #0x8000
        MOV     r3, r3, LSL #1
        SUB     r1, r1, r2           @ r1 = post - l
        MOV     r1, r1, ASR #1       @ r1 = (post - l)>>1
        SUBS    r1, r1, #16          @ r1 = ((post - l)>>1) - 16
        BLT     unroll_over3
unroll_loop3:
        LDR     r12,[r2],#8
        LDR     r7, [r2],#8
        LDR     r6, [r2],#8
        LDR     r5, [r2],#8

        RSB     r12,r12,#0
        RSB     r5, r5, #0
        RSB     r6, r6, #0
        RSB     r7, r7, #0

        MOV     r12, r12,ASR #9      @ r12= (-*l)>>9
        MOV     r5, r5, ASR #9       @ r5 = (-*l)>>9
        MOV     r6, r6, ASR #9       @ r6 = (-*l)>>9
        MOV     r7, r7, ASR #9       @ r7 = (-*l)>>9

        MOV     r14,r12,ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r12,r4, r14,ASR #31
        STRH    r12,[r0], r3

        MOV     r14,r7, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r7, r4, r14,ASR #31
        STRH    r7, [r0], r3

        MOV     r14,r6, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r6, r4, r14,ASR #31
        STRH    r6, [r0], r3

        MOV     r14,r5, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r5, r4, r14,ASR #31
        STRH    r5, [r0], r3

        SUBS    r1, r1, #16
        BGE     unroll_loop3

unroll_over3:
        ADDS    r1, r1, #16
        BLE     unroll_over4
unroll_loop4:
        LDR     r5,[r2], #8
        @ stall
        @ stall (Xscale)
        RSB     r5, r5, #0
        MOV     r5, r5, ASR #9       @ r5 = (-*l)>>9
        MOV     r14,r5, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r5, r4, r14,ASR #31
        STRH    r5, [r0], r3
        SUBS    r1, r1, #4
        BGT     unroll_loop4
unroll_over4:
        LDMFD   r13!,{r4-r7,PC}

mdct_unroll_part2:
        @ r0 = out
        @ r1 = post
        @ r2 = l
        @ r3 = r
        @ <> = step
        @ <> = wL
        @ <> = wR
        MOV     r12,r13
        STMFD   r13!,{r4,r6-r11,r14}
        LDMFD   r12,{r8,r9,r10}      @ r8 = step
                                     @ r9 = wL
                                     @ r10= wR
        MVN     r4, #0x8000
        MOV     r8, r8, LSL #1
        SUBS    r1, r3, r1           @ r1 = (r - post)
        BLE     unroll_over5
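        @ Overlap-add of the windowed lapping region: each output sample
        @ combines one sample of l (scaled by the rising window wL) with one
        @ of r (scaled by the falling window wR), then scales and clips to
        @ 16 bits as above. Roughly, as a C sketch of the intent (not the
        @ reference source), with MULT31(a,b) = ((ogg_int64_t)a*b)>>31 and
        @ saturate16 standing in for the clip sequence shown earlier:
        @   l -= 2;
        @   val  = (MULT31(*l, *wL++) + MULT31(*--r, *--wR)) >> 9;
        @   *out = saturate16(val);  out += step;
        @ mdct_unroll_part3 below is the analogous loop with the two
        @ products subtracted and with l and r walking forwards.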
unroll_loop5:
        LDR     r12,[r2, #-8]!       @ r12= *l       (but l -= 2 first)
        LDR     r11,[r9],#4          @ r11= *wL++
        LDR     r7, [r3, #-4]!       @ r7 = *--r
        LDR     r6, [r10,#-4]!       @ r6 = *--wR

        @ Can save a cycle here, at the cost of 1bit errors in rounding
        SMULL   r14,r11,r12,r11      @ (r14,r11) = *l * *wL++
        SMULL   r14,r6, r7, r6       @ (r14,r6)  = *--r * *--wR
        ADD     r6, r6, r11
        MOV     r6, r6, ASR #8
        MOV     r14,r6, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r6, r4, r14,ASR #31
        STRH    r6, [r0], r8

        SUBS    r1, r1, #4
        BGT     unroll_loop5

unroll_over5:
        LDMFD   r13!,{r4,r6-r11,PC}

mdct_unroll_part3:
        @ r0 = out
        @ r1 = post
        @ r2 = l
        @ r3 = r
        @ <> = step
        @ <> = wL
        @ <> = wR
        MOV     r12,r13
        STMFD   r13!,{r4,r6-r11,r14}
        LDMFD   r12,{r8,r9,r10}      @ r8 = step
                                     @ r9 = wL
                                     @ r10= wR
        MVN     r4, #0x8000
        MOV     r8, r8, LSL #1
        SUBS    r1, r1, r3           @ r1 = (post - r)
        BLE     unroll_over6
unroll_loop6:
        LDR     r12,[r2],#8          @ r12= *l       (l += 2 afterwards)
        LDR     r11,[r9],#4          @ r11= *wL++
        LDR     r7, [r3],#4          @ r7 = *r++
        LDR     r6, [r10,#-4]!       @ r6 = *--wR

        @ Can save a cycle here, at the cost of 1bit errors in rounding
        SMULL   r14,r11,r12,r11      @ (r14,r11) = *l * *wL++
        SMULL   r14,r6, r7, r6       @ (r14,r6)  = *r++ * *--wR
        SUB     r6, r6, r11
        MOV     r6, r6, ASR #8
        MOV     r14,r6, ASR #15
        TEQ     r14,r14,ASR #31      @ if r14==0 || r14==-1 then in range
        EORNE   r6, r4, r14,ASR #31
        STRH    r6, [r0], r8

        SUBS    r1, r1, #4
        BGT     unroll_loop6

unroll_over6:
        LDMFD   r13!,{r4,r6-r11,PC}

mdct_shift_right:
        @ r0 = n
        @ r1 = in
        @ r2 = right
        STMFD   r13!,{r4-r11,r14}

        MOV     r0, r0, LSR #2       @ n >>= 2
        ADD     r1, r1, #4

        SUBS    r0, r0, #8
        BLT     sr_less_than_8
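        @ Copies every other input word into 'right': in effect
        @ right[i] = in[2*i + 1] for i = 0 .. (n>>2)-1 (a sketch of the
        @ intent). The main loop moves eight words per iteration; the tail
        @ loop handles the remainder one word at a time.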
sr_loop:
        LDR     r3, [r1], #8
        LDR     r4, [r1], #8
        LDR     r5, [r1], #8
        LDR     r6, [r1], #8
        LDR     r7, [r1], #8
        LDR     r8, [r1], #8
        LDR     r12,[r1], #8
        LDR     r14,[r1], #8
        SUBS    r0, r0, #8
        STMIA   r2!,{r3,r4,r5,r6,r7,r8,r12,r14}
        BGE     sr_loop
sr_less_than_8:
        ADDS    r0, r0, #8
        BEQ     sr_end
sr_loop2:
        LDR     r3, [r1], #8
        SUBS    r0, r0, #1
        STR     r3, [r2], #4
        BGT     sr_loop2
sr_end:
        LDMFD   r13!,{r4-r11,PC}

mdct_backwardARM:
        @ r0 = n
        @ r1 = in
        STMFD   r13!,{r4-r11,r14}

        MOV     r2,#1<<4             @ r2 = 1<<shift
        MOV     r3,#13-4             @ r3 = 13-shift
find_shift_loop:
        TST     r0,r2                @ if (n & (1<<shift)) == 0
        MOV     r2,r2,LSL #1
        SUBEQ   r3,r3,#1             @ shift--
        BEQ     find_shift_loop
        MOV     r2,#2
        MOV     r2,r2,LSL r3         @ r2 = step = 2<<shift

        @ presymmetry
        @ r0 = n (a multiple of 4)
        @ r1 = in
        @ r2 = step
        @ r3 = shift

        ADD     r4, r1, r0, LSL #1   @ r4 = aX = in+(n>>1)
        ADD     r14,r1, r0           @ r14= in+(n>>2)
        SUB     r4, r4, #3*4         @ r4 = aX = in+n2-3
        LDR     r5, =sincos_lookup0  @ r5 = T=sincos_lookup0

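        @ The XPROD31/XNPROD31 annotations below follow the usual Tremor /
        @ Tremolo fixed-point helpers (a sketch of the intended semantics,
        @ with MULT31(a,b) = ((ogg_int64_t)a * b) >> 31):
        @   XPROD31 (a, b, t, v, x, y): *x = MULT31(a,t) + MULT31(b,v)
        @                               *y = MULT31(b,t) - MULT31(a,v)
        @   XNPROD31(a, b, t, v, x, y): *x = MULT31(a,t) - MULT31(b,v)
        @                               *y = MULT31(b,t) + MULT31(a,v)
        @ Each product here is built from SMULL/SMLAL (top 32 bits of the
        @ 64-bit product) followed by a <<1, giving the same >>31 scaling.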
presymmetry_loop1:
        LDR     r7, [r4,#8]          @ r7 = s2 = aX[2]
        LDR     r11,[r5,#4]          @ r11= T[1]
        LDR     r6, [r4]             @ r6 = s0 = aX[0]
        LDR     r10,[r5],r2,LSL #2   @ r10= T[0]   T += step

        @ XPROD31(s0, s2, T[0], T[1], &aX[0], &aX[2])
        SMULL   r8, r9, r7, r11      @ (r8, r9)  = s2*T[1]
        @ stall
        @ stall ?
        SMLAL   r8, r9, r6, r10      @ (r8, r9) += s0*T[0]
        RSB     r6, r6, #0
        @ stall ?
        SMULL   r8, r12,r7, r10      @ (r8, r12) = s2*T[0]
        MOV     r9, r9, LSL #1
        @ stall ?
        SMLAL   r8, r12,r6, r11      @ (r8, r12) -= s0*T[1]
        STR     r9, [r4],#-16        @ aX[0] = r9
        CMP     r4,r14
        MOV     r12,r12,LSL #1
        STR     r12,[r4,#8+16]       @ aX[2] = r12

        BGE     presymmetry_loop1    @ while (aX >= in+n4)

presymmetry_loop2:
        LDR     r6,[r4]              @ r6 = s0 = aX[0]
        LDR     r10,[r5,#4]          @ r10= T[1]
        LDR     r7,[r4,#8]           @ r7 = s2 = aX[2]
        LDR     r11,[r5],-r2,LSL #2  @ r11= T[0]   T -= step

        @ XPROD31(s0, s2, T[1], T[0], &aX[0], &aX[2])
        SMULL   r8, r9, r6, r10      @ (r8, r9)  = s0*T[1]
        @ stall
        @ stall ?
        SMLAL   r8, r9, r7, r11      @ (r8, r9) += s2*T[0]
        RSB     r6, r6, #0
        @ stall ?
        SMULL   r8, r12,r7, r10      @ (r8, r12) = s2*T[1]
        MOV     r9, r9, LSL #1
        @ stall ?
        SMLAL   r8, r12,r6, r11      @ (r8, r12) -= s0*T[0]
        STR     r9, [r4],#-16        @ aX[0] = r9
        CMP     r4,r1
        MOV     r12,r12,LSL #1
        STR     r12,[r4,#8+16]       @ aX[2] = r12

        BGE     presymmetry_loop2    @ while (aX >= in)

        @ r0 = n
        @ r1 = in
        @ r2 = step
        @ r3 = shift
        STMFD   r13!,{r3}
        LDR     r5, =sincos_lookup0  @ r5 = T=sincos_lookup0
        ADD     r4, r1, r0, LSL #1   @ r4 = aX = in+(n>>1)
        SUB     r4, r4, #4*4         @ r4 = aX = in+(n>>1)-4
        LDR     r11,[r5,#4]          @ r11= T[1]
        LDR     r10,[r5],r2, LSL #2  @ r10= T[0]   T += step
presymmetry_loop3:
        LDR     r8,[r1],#16          @ r8 = ro0 = bX[0]
        LDR     r9,[r1,#8-16]        @ r9 = ro2 = bX[2]
        LDR     r6,[r4]              @ r6 = ri0 = aX[0]

        @ XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] )
        @ aX[0] = (ro2*T[1] - ro0*T[0])>>31    aX[2] = (ro0*T[1] + ro2*T[0])>>31
        SMULL   r14,r12,r8, r11      @ (r14,r12) = ro0*T[1]
        RSB     r8,r8,#0             @ r8 = -ro0
        @ Stall ?
        SMLAL   r14,r12,r9, r10      @ (r14,r12) += ro2*T[0]
        LDR     r7,[r4,#8]           @ r7 = ri2 = aX[2]
        @ Stall ?
        SMULL   r14,r3, r9, r11      @ (r14,r3)  = ro2*T[1]
        MOV     r12,r12,LSL #1
        LDR     r11,[r5,#4]          @ r11= T[1]
        SMLAL   r14,r3, r8, r10      @ (r14,r3) -= ro0*T[0]
        LDR     r10,[r5],r2, LSL #2  @ r10= T[0]   T += step
        STR     r12,[r4,#8]
        MOV     r3, r3, LSL #1
        STR     r3, [r4],#-16

        @ XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] )
        @ bX[0] = (ri2*T[0] - ri0*T[1])>>31    bX[2] = (ri0*T[0] + ri2*T[1])>>31
        SMULL   r14,r12,r6, r10      @ (r14,r12) = ri0*T[0]
        RSB     r6,r6,#0             @ r6 = -ri0
        @ stall ?
        SMLAL   r14,r12,r7, r11      @ (r14,r12) += ri2*T[1]
        @ stall ?
        @ stall ?
        SMULL   r14,r3, r7, r10      @ (r14,r3)  = ri2*T[0]
        MOV     r12,r12,LSL #1
        @ stall ?
        SMLAL   r14,r3, r6, r11      @ (r14,r3) -= ri0*T[1]
        CMP     r4,r1
        STR     r12,[r1,#8-16]
        MOV     r3, r3, LSL #1
        STR     r3, [r1,#-16]

        BGE     presymmetry_loop3

        SUB     r1,r1,r0             @ r1 = in -= n>>2 (i.e. restore in)

        LDR     r3,[r13]
        STR     r2,[r13,#-4]!

        @ mdct_butterflies
        @ r0 = n  = (points * 2)
        @ r1 = in = x
        @ r2 = i
        @ r3 = shift
        STMFD   r13!,{r0-r1}
        RSBS    r4,r3,#6             @ r4 = stages = 7-shift then --stages
        LDR     r5,=sincos_lookup0
        BLE     no_generics
        MOV     r14,#4               @ r14= 4 (i=0)
        MOV     r6, r14,LSL r3       @ r6 = (4<<i)<<shift
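        @ Stage loop of mdct_butterflies: for each stage i it applies
        @ mdct_butterfly_generic to (1<<i) sub-blocks of POINTS = points>>i
        @ samples with a trig step of 4<<(i+shift); once the blocks are down
        @ to 32 samples it falls through to the unrolled 32-sample
        @ butterflies after no_generics (a summary of the structure,
        @ following the existing comments).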
mdct_butterflies_loop1:
        MOV     r0, r0, LSR #1       @ r0 = points>>i = POINTS
        MOV     r2, r14,LSR #2       @ r2 = (1<<i)-j (j=0)
        STMFD   r13!,{r4,r14}
mdct_butterflies_loop2:

        @ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift))
        @ mdct_butterfly_generic(r1, r0, r6)
        @ r0 = points
        @ r1 = x
        @ preserve r2 (external loop counter)
        @ preserve r3
        @ preserve r4 (external loop counter)
        @ r5 = T = sincos_lookup0
        @ r6 = step
        @ preserve r14

        STR     r2,[r13,#-4]!        @ stack r2
        ADD     r1,r1,r0,LSL #1      @ r1 = x2+4 = x + (POINTS>>1)
        ADD     r7,r1,r0,LSL #1      @ r7 = x1+4 = x + POINTS
        ADD     r12,r5,#1024*4       @ r12= sincos_lookup0+1024

mdct_bufferfly_generic_loop1:
        LDMDB   r7!,{r2,r3,r8,r11}   @ r2 = x1[0]
                                     @ r3 = x1[1]
                                     @ r8 = x1[2]
                                     @ r11= x1[3]    x1 -= 4
        LDMDB   r1!,{r4,r9,r10,r14}  @ r4 = x2[0]
                                     @ r9 = x2[1]
                                     @ r10= x2[2]
                                     @ r14= x2[3]    x2 -= 4

        SUB     r2, r2, r3           @ r2 = s0 = x1[0] - x1[1]
        ADD     r3, r2, r3, LSL #1   @ r3 = x1[0] + x1[1] (-> x1[0])
        SUB     r11,r11,r8           @ r11= s1 = x1[3] - x1[2]
        ADD     r8, r11,r8, LSL #1   @ r8 = x1[3] + x1[2] (-> x1[2])
        SUB     r9, r9, r4           @ r9 = s2 = x2[1] - x2[0]
        ADD     r4, r9, r4, LSL #1   @ r4 = x2[1] + x2[0] (-> x1[1])
        SUB     r14,r14,r10          @ r14= s3 = x2[3] - x2[2]
        ADD     r10,r14,r10,LSL #1   @ r10= x2[3] + x2[2] (-> x1[3])
        STMIA   r7,{r3,r4,r8,r10}

        @ r0 = points
        @ r1 = x2
        @ r2 = s0
        @ r3 free
        @ r4 free
        @ r5 = T
        @ r6 = step
        @ r7 = x1
        @ r8 free
        @ r9 = s2
        @ r10 free
        @ r11= s1
        @ r12= limit
        @ r14= s3

        LDR     r8, [r5,#4]          @ r8 = T[1]
        LDR     r10,[r5],r6,LSL #2   @ r10= T[0]   T += step

        @ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2])
        @ x2[0] = (s1*T[0] + s0*T[1])>>31     x2[2] = (s0*T[0] - s1*T[1])>>31
        @ stall Xscale
        SMULL   r4, r3, r2, r8       @ (r4, r3)  = s0*T[1]
        SMLAL   r4, r3, r11,r10      @ (r4, r3) += s1*T[0]
        RSB     r11,r11,#0
        SMULL   r11,r4, r8, r11      @ (r11,r4)  = -s1*T[1]
        SMLAL   r11,r4, r2, r10      @ (r11,r4) += s0*T[0]
        MOV     r2, r3, LSL #1       @ r2 = r3<<1 = Value for x2[0]

        @ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3])
        @ x2[1] = (s2*T[0] + s3*T[1])>>31     x2[3] = (s3*T[0] - s2*T[1])>>31
        SMULL   r11,r3, r9, r10      @ (r11,r3)  = s2*T[0]
        MOV     r4, r4, LSL #1       @ r4 = r4<<1 = Value for x2[2]
        SMLAL   r11,r3, r14,r8       @ (r11,r3) += s3*T[1]
        RSB     r9, r9, #0
        SMULL   r10,r11,r14,r10      @ (r10,r11) = s3*T[0]
        MOV     r3, r3, LSL #1       @ r3 = r3<<1 = Value for x2[1]
        SMLAL   r10,r11,r9,r8        @ (r10,r11) -= s2*T[1]
        CMP     r5, r12
        MOV     r11,r11,LSL #1       @ r11= r11<<1 = Value for x2[3]

        STMIA   r1,{r2,r3,r4,r11}

        BLT     mdct_bufferfly_generic_loop1

        SUB     r12,r12,#1024*4
mdct_bufferfly_generic_loop2:
        LDMDB   r7!,{r2,r3,r9,r10}   @ r2 = x1[0]
                                     @ r3 = x1[1]
                                     @ r9 = x1[2]
                                     @ r10= x1[3]    x1 -= 4
        LDMDB   r1!,{r4,r8,r11,r14}  @ r4 = x2[0]
                                     @ r8 = x2[1]
                                     @ r11= x2[2]
                                     @ r14= x2[3]    x2 -= 4

        SUB     r2, r2, r3           @ r2 = s0 = x1[0] - x1[1]
        ADD     r3, r2, r3, LSL #1   @ r3 = x1[0] + x1[1] (-> x1[0])
        SUB     r9, r9,r10           @ r9 = s1 = x1[2] - x1[3]
        ADD     r10,r9,r10, LSL #1   @ r10= x1[2] + x1[3] (-> x1[2])
        SUB     r4, r4, r8           @ r4 = s2 = x2[0] - x2[1]
        ADD     r8, r4, r8, LSL #1   @ r8 = x2[0] + x2[1] (-> x1[1])
        SUB     r14,r14,r11          @ r14= s3 = x2[3] - x2[2]
        ADD     r11,r14,r11,LSL #1   @ r11= x2[3] + x2[2] (-> x1[3])
        STMIA   r7,{r3,r8,r10,r11}

        @ r0 = points
        @ r1 = x2
        @ r2 = s0
        @ r3 free
        @ r4 = s2
        @ r5 = T
        @ r6 = step
        @ r7 = x1
        @ r8 free
        @ r9 = s1
        @ r10 free
        @ r11 free
        @ r12= limit
        @ r14= s3

        LDR     r8, [r5,#4]          @ r8 = T[1]
        LDR     r10,[r5],-r6,LSL #2  @ r10= T[0]   T -= step

        @ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2])
        @ x2[0] = (s0*T[0] - s1*T[1])>>31     x2[2] = (s1*T[0] + s0*T[1])>>31
        @ stall Xscale
        SMULL   r3, r11,r2, r8       @ (r3, r11)  = s0*T[1]
        SMLAL   r3, r11,r9, r10      @ (r3, r11) += s1*T[0]
        RSB     r9, r9, #0
        SMULL   r3, r2, r10,r2       @ (r3, r2)   = s0*T[0]
        SMLAL   r3, r2, r9, r8       @ (r3, r2)  += -s1*T[1]
        MOV     r9, r11,LSL #1       @ r9 = r11<<1 = Value for x2[2]

        @ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3])
        @ x2[1] = (s3*T[0] - s2*T[1])>>31     x2[3] = (s2*T[0] + s3*T[1])>>31
        SMULL   r3, r11,r4, r10      @ (r3,r11)  = s2*T[0]
        MOV     r2, r2, LSL #1       @ r2 = r2<<1 = Value for x2[0]
        SMLAL   r3, r11,r14,r8       @ (r3,r11) += s3*T[1]
        RSB     r4, r4, #0
        SMULL   r10,r3,r14,r10       @ (r10,r3)  = s3*T[0]
        MOV     r11,r11,LSL #1       @ r11= r11<<1 = Value for x2[3]
        SMLAL   r10,r3, r4, r8       @ (r10,r3) -= s2*T[1]
        CMP     r5, r12
        MOV     r3, r3, LSL #1       @ r3 = r3<<1 = Value for x2[1]

        STMIA   r1,{r2,r3,r9,r11}

        BGT     mdct_bufferfly_generic_loop2

        LDR     r2,[r13],#4          @ unstack r2
        ADD     r1, r1, r0, LSL #2   @ r1 = x+POINTS*j
        @ stall Xscale
        SUBS    r2, r2, #1           @ r2-- (j++)
        BGT     mdct_butterflies_loop2

        LDMFD   r13!,{r4,r14}

        LDR     r1,[r13,#4]

        SUBS    r4, r4, #1           @ stages--
        MOV     r14,r14,LSL #1       @ r14= 4<<i (i++)
        MOV     r6, r6, LSL #1       @ r6 = step <<= 1 (i++)
        BGE     mdct_butterflies_loop1
        LDMFD   r13,{r0-r1}
no_generics:
        @ mdct_butterflies part2 (loop around mdct_bufferfly_32)
        @ r0 = points
        @ r1 = in
        @ r2 = step
        @ r3 = shift

mdct_bufferflies_loop3:
        @ mdct_bufferfly_32
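        @ One 32-sample butterfly: blocks 1-4 below form the first stage
        @ using the fixed pi/8 twiddles (cPI1_8/cPI2_8/cPI3_8); the result is
        @ then finished by two 16-sample butterflies, each of which ends in
        @ two plain 8-sample butterflies (a summary of the code that
        @ follows).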

        @ block1
        ADD     r4, r1, #16*4        @ r4 = &in[16]
        LDMIA   r4,{r5,r6,r9,r10}    @ r5 = x[16]
                                     @ r6 = x[17]
                                     @ r9 = x[18]
                                     @ r10= x[19]
        LDMIA   r1,{r7,r8,r11,r12}   @ r7 = x[0]
                                     @ r8 = x[1]
                                     @ r11= x[2]
                                     @ r12= x[3]
        SUB     r5, r5, r6           @ r5 = s0 = x[16] - x[17]
        ADD     r6, r5, r6, LSL #1   @ r6 = x[16] + x[17]  -> x[16]
        SUB     r9, r9, r10          @ r9 = s1 = x[18] - x[19]
        ADD     r10,r9, r10,LSL #1   @ r10= x[18] + x[19]  -> x[18]
        SUB     r8, r8, r7           @ r8 = s2 = x[ 1] - x[ 0]
        ADD     r7, r8, r7, LSL #1   @ r7 = x[ 1] + x[ 0]  -> x[17]
        SUB     r12,r12,r11          @ r12= s3 = x[ 3] - x[ 2]
        ADD     r11,r12,r11, LSL #1  @ r11= x[ 3] + x[ 2]  -> x[19]
        STMIA   r4!,{r6,r7,r10,r11}

        LDR     r6,cPI1_8
        LDR     r7,cPI3_8

        @ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] )
        @ x[0] = s0*cPI3_8 - s1*cPI1_8     x[2] = s1*cPI3_8 + s0*cPI1_8
        @ stall Xscale
        SMULL   r14,r11,r5, r6       @ (r14,r11)  = s0*cPI1_8
        SMLAL   r14,r11,r9, r7       @ (r14,r11) += s1*cPI3_8
        RSB     r9, r9, #0
        SMULL   r14,r5, r7, r5       @ (r14,r5)   = s0*cPI3_8
        SMLAL   r14,r5, r9, r6       @ (r14,r5)  -= s1*cPI1_8
        MOV     r11,r11,LSL #1
        MOV     r5, r5, LSL #1

        @ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] )
        @ x[1] = s2*cPI1_8 + s3*cPI3_8     x[3] = s3*cPI1_8 - s2*cPI3_8
        SMULL   r14,r9, r8, r6       @ (r14,r9)   = s2*cPI1_8
        SMLAL   r14,r9, r12,r7       @ (r14,r9)  += s3*cPI3_8
        RSB     r8,r8,#0
        SMULL   r14,r12,r6, r12      @ (r14,r12)  = s3*cPI1_8
        SMLAL   r14,r12,r8, r7       @ (r14,r12) -= s2*cPI3_8
        MOV     r9, r9, LSL #1
        MOV     r12,r12,LSL #1
        STMIA   r1!,{r5,r9,r11,r12}

        @ block2
        LDMIA   r4,{r5,r6,r9,r10}    @ r5 = x[20]
                                     @ r6 = x[21]
                                     @ r9 = x[22]
                                     @ r10= x[23]
        LDMIA   r1,{r7,r8,r11,r12}   @ r7 = x[4]
                                     @ r8 = x[5]
                                     @ r11= x[6]
                                     @ r12= x[7]
        SUB     r5, r5, r6           @ r5 = s0 = x[20] - x[21]
        ADD     r6, r5, r6, LSL #1   @ r6 = x[20] + x[21]  -> x[20]
        SUB     r9, r9, r10          @ r9 = s1 = x[22] - x[23]
        ADD     r10,r9, r10,LSL #1   @ r10= x[22] + x[23]  -> x[22]
        SUB     r8, r8, r7           @ r8 = s2 = x[ 5] - x[ 4]
        ADD     r7, r8, r7, LSL #1   @ r7 = x[ 5] + x[ 4]  -> x[21]
        SUB     r12,r12,r11          @ r12= s3 = x[ 7] - x[ 6]
        ADD     r11,r12,r11, LSL #1  @ r11= x[ 7] + x[ 6]  -> x[23]
        LDR     r14,cPI2_8
        STMIA   r4!,{r6,r7,r10,r11}

        SUB     r5, r5, r9           @ r5 = s0 - s1
        ADD     r9, r5, r9, LSL #1   @ r9 = s0 + s1
        SMULL   r6, r5, r14,r5       @ (r6,r5)  = (s0-s1)*cPI2_8
        SUB     r12,r12,r8           @ r12= s3 - s2
        ADD     r8, r12,r8, LSL #1   @ r8 = s3 + s2

        SMULL   r6, r8, r14,r8       @ (r6,r8)  = (s3+s2)*cPI2_8
        MOV     r5, r5, LSL #1
        SMULL   r6, r9, r14,r9       @ (r6,r9)  = (s0+s1)*cPI2_8
        MOV     r8, r8, LSL #1
        SMULL   r6, r12,r14,r12      @ (r6,r12) = (s3-s2)*cPI2_8
        MOV     r9, r9, LSL #1
        MOV     r12,r12,LSL #1
        STMIA   r1!,{r5,r8,r9,r12}

        @ block3
        LDMIA   r4,{r5,r6,r9,r10}    @ r5 = x[24]
                                     @ r6 = x[25]
                                     @ r9 = x[26]
                                     @ r10= x[27]
        LDMIA   r1,{r7,r8,r11,r12}   @ r7 = x[8]
                                     @ r8 = x[9]
                                     @ r11= x[10]
                                     @ r12= x[11]
        SUB     r5, r5, r6           @ r5 = s0 = x[24] - x[25]
        ADD     r6, r5, r6, LSL #1   @ r6 = x[24] + x[25]  -> x[24]
        SUB     r9, r9, r10          @ r9 = s1 = x[26] - x[27]
        ADD     r10,r9, r10,LSL #1   @ r10= x[26] + x[27]  -> x[26]
        SUB     r8, r8, r7           @ r8 = s2 = x[ 9] - x[ 8]
        ADD     r7, r8, r7, LSL #1   @ r7 = x[ 9] + x[ 8]  -> x[25]
        SUB     r12,r12,r11          @ r12= s3 = x[11] - x[10]
        ADD     r11,r12,r11, LSL #1  @ r11= x[11] + x[10]  -> x[27]
        STMIA   r4!,{r6,r7,r10,r11}

        LDR     r6,cPI3_8
        LDR     r7,cPI1_8

        @ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] )
        @ x[8] = s0*cPI1_8 - s1*cPI3_8     x[10] = s1*cPI1_8 + s0*cPI3_8
        @ stall Xscale
        SMULL   r14,r11,r5, r6       @ (r14,r11)  = s0*cPI3_8
        SMLAL   r14,r11,r9, r7       @ (r14,r11) += s1*cPI1_8
        RSB     r9, r9, #0
        SMULL   r14,r5, r7, r5       @ (r14,r5)   = s0*cPI1_8
        SMLAL   r14,r5, r9, r6       @ (r14,r5)  -= s1*cPI3_8
        MOV     r11,r11,LSL #1
        MOV     r5, r5, LSL #1

        @ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] )
        @ x[9] = s2*cPI3_8 + s3*cPI1_8     x[11] = s3*cPI3_8 - s2*cPI1_8
        SMULL   r14,r9, r8, r6       @ (r14,r9)   = s2*cPI3_8
        SMLAL   r14,r9, r12,r7       @ (r14,r9)  += s3*cPI1_8
        RSB     r8,r8,#0
        SMULL   r14,r12,r6, r12      @ (r14,r12)  = s3*cPI3_8
        SMLAL   r14,r12,r8, r7       @ (r14,r12) -= s2*cPI1_8
        MOV     r9, r9, LSL #1
        MOV     r12,r12,LSL #1
        STMIA   r1!,{r5,r9,r11,r12}

        @ block4
        LDMIA   r4,{r5,r6,r10,r11}   @ r5 = x[28]
                                     @ r6 = x[29]
                                     @ r10= x[30]
                                     @ r11= x[31]
        LDMIA   r1,{r8,r9,r12,r14}   @ r8 = x[12]
                                     @ r9 = x[13]
                                     @ r12= x[14]
                                     @ r14= x[15]
        SUB     r5, r5, r6           @ r5 = s0 = x[28] - x[29]
        ADD     r6, r5, r6, LSL #1   @ r6 = x[28] + x[29]  -> x[28]
        SUB     r7, r14,r12          @ r7 = s3 = x[15] - x[14]
        ADD     r12,r7, r12, LSL #1  @ r12= x[15] + x[14]  -> x[31]
        SUB     r10,r10,r11          @ r10= s1 = x[30] - x[31]
        ADD     r11,r10,r11,LSL #1   @ r11= x[30] + x[31]  -> x[30]
        SUB     r14, r8, r9          @ r14= s2 = x[12] - x[13]
        ADD     r9, r14, r9, LSL #1  @ r9 = x[12] + x[13]  -> x[29]
        STMIA   r4!,{r6,r9,r11,r12}
        STMIA   r1!,{r5,r7,r10,r14}

        @ mdct_butterfly16 (1st version)
        @ block 1
        SUB     r1,r1,#16*4
        ADD     r4,r1,#8*4
        LDMIA   r4,{r5,r6,r9,r10}    @ r5 = x[ 8]
                                     @ r6 = x[ 9]
                                     @ r9 = x[10]
                                     @ r10= x[11]
        LDMIA   r1,{r7,r8,r11,r12}   @ r7 = x[0]
                                     @ r8 = x[1]
                                     @ r11= x[2]
                                     @ r12= x[3]
        SUB     r5, r5, r6           @ r5 = s0 = x[ 8] - x[ 9]
        ADD     r6, r5, r6, LSL #1   @ r6 = x[ 8] + x[ 9]  -> x[ 8]
        SUB     r9, r9, r10          @ r9 = s1 = x[10] - x[11]
        ADD     r10,r9, r10,LSL #1   @ r10= x[10] + x[11]  -> x[10]
        SUB     r8, r8, r7           @ r8 = s2 = x[ 1] - x[ 0]
        ADD     r7, r8, r7, LSL #1   @ r7 = x[ 1] + x[ 0]  -> x[ 9]
        SUB     r12,r12,r11          @ r12= s3 = x[ 3] - x[ 2]
        ADD     r11,r12,r11, LSL #1  @ r11= x[ 3] + x[ 2]  -> x[11]
        LDR     r14,cPI2_8
        STMIA   r4!,{r6,r7,r10,r11}

        SUB     r5, r5, r9           @ r5 = s0 - s1
        ADD     r9, r5, r9, LSL #1   @ r9 = s0 + s1
        SMULL   r6, r5, r14,r5       @ (r6,r5)  = (s0-s1)*cPI2_8
        SUB     r12,r12,r8           @ r12= s3 - s2
        ADD     r8, r12,r8, LSL #1   @ r8 = s3 + s2

        SMULL   r6, r8, r14,r8       @ (r6,r8)  = (s3+s2)*cPI2_8
        MOV     r5, r5, LSL #1
        SMULL   r6, r9, r14,r9       @ (r6,r9)  = (s0+s1)*cPI2_8
        MOV     r8, r8, LSL #1
        SMULL   r6, r12,r14,r12      @ (r6,r12) = (s3-s2)*cPI2_8
        MOV     r9, r9, LSL #1
        MOV     r12,r12,LSL #1
        STMIA   r1!,{r5,r8,r9,r12}

        @ block4
        LDMIA   r4,{r5,r6,r9,r10}    @ r5 = x[12]
                                     @ r6 = x[13]
                                     @ r9 = x[14]
                                     @ r10= x[15]
        LDMIA   r1,{r7,r8,r11,r12}   @ r7 = x[ 4]
                                     @ r8 = x[ 5]
                                     @ r11= x[ 6]
                                     @ r12= x[ 7]
        SUB     r14,r7, r8           @ r14= s0 = x[ 4] - x[ 5]
        ADD     r8, r14,r8, LSL #1   @ r8 = x[ 4] + x[ 5]  -> x[13]
        SUB     r7, r12,r11          @ r7 = s1 = x[ 7] - x[ 6]
        ADD     r11,r7, r11, LSL #1  @ r11= x[ 7] + x[ 6]  -> x[15]
        SUB     r5, r5, r6           @ r5 = s2 = x[12] - x[13]
        ADD     r6, r5, r6, LSL #1   @ r6 = x[12] + x[13]  -> x[12]
        SUB     r12,r9, r10          @ r12= s3 = x[14] - x[15]
        ADD     r10,r12,r10,LSL #1   @ r10= x[14] + x[15]  -> x[14]
        STMIA   r4!,{r6,r8,r10,r11}
        STMIA   r1!,{r5,r7,r12,r14}

        @ mdct_butterfly_8
        LDMDB   r1,{r6,r7,r8,r9,r10,r11,r12,r14}
                                     @ r6 = x[0]
                                     @ r7 = x[1]
                                     @ r8 = x[2]
                                     @ r9 = x[3]
                                     @ r10= x[4]
                                     @ r11= x[5]
                                     @ r12= x[6]
                                     @ r14= x[7]
        ADD     r6, r6, r7           @ r6 = s0 = x[0] + x[1]
        SUB     r7, r6, r7, LSL #1   @ r7 = s1 = x[0] - x[1]
        ADD     r8, r8, r9           @ r8 = s2 = x[2] + x[3]
        SUB     r9, r8, r9, LSL #1   @ r9 = s3 = x[2] - x[3]
        ADD     r10,r10,r11          @ r10= s4 = x[4] + x[5]
        SUB     r11,r10,r11,LSL #1   @ r11= s5 = x[4] - x[5]
        ADD     r12,r12,r14          @ r12= s6 = x[6] + x[7]
        SUB     r14,r12,r14,LSL #1   @ r14= s7 = x[6] - x[7]

        ADD     r2, r11,r9           @ r2 = x[0] = s5 + s3
        SUB     r4, r2, r9, LSL #1   @ r4 = x[2] = s5 - s3
        SUB     r3, r14,r7           @ r3 = x[1] = s7 - s1
        ADD     r5, r3, r7, LSL #1   @ r5 = x[3] = s7 + s1
        SUB     r10,r10,r6           @ r10= x[4] = s4 - s0
        SUB     r11,r12,r8           @ r11= x[5] = s6 - s2
        ADD     r12,r10,r6, LSL #1   @ r12= x[6] = s4 + s0
        ADD     r14,r11,r8, LSL #1   @ r14= x[7] = s6 + s2
        STMDB   r1,{r2,r3,r4,r5,r10,r11,r12,r14}

        @ mdct_butterfly_8
        LDMIA   r1,{r6,r7,r8,r9,r10,r11,r12,r14}
                                     @ r6 = x[0]
                                     @ r7 = x[1]
                                     @ r8 = x[2]
                                     @ r9 = x[3]
                                     @ r10= x[4]
                                     @ r11= x[5]
                                     @ r12= x[6]
                                     @ r14= x[7]
        ADD     r6, r6, r7           @ r6 = s0 = x[0] + x[1]
        SUB     r7, r6, r7, LSL #1   @ r7 = s1 = x[0] - x[1]
        ADD     r8, r8, r9           @ r8 = s2 = x[2] + x[3]
        SUB     r9, r8, r9, LSL #1   @ r9 = s3 = x[2] - x[3]
        ADD     r10,r10,r11          @ r10= s4 = x[4] + x[5]
        SUB     r11,r10,r11,LSL #1   @ r11= s5 = x[4] - x[5]
        ADD     r12,r12,r14          @ r12= s6 = x[6] + x[7]
        SUB     r14,r12,r14,LSL #1   @ r14= s7 = x[6] - x[7]

        ADD     r2, r11,r9           @ r2 = x[0] = s5 + s3
        SUB     r4, r2, r9, LSL #1   @ r4 = x[2] = s5 - s3
        SUB     r3, r14,r7           @ r3 = x[1] = s7 - s1
        ADD     r5, r3, r7, LSL #1   @ r5 = x[3] = s7 + s1
        SUB     r10,r10,r6           @ r10= x[4] = s4 - s0
        SUB     r11,r12,r8           @ r11= x[5] = s6 - s2
        ADD     r12,r10,r6, LSL #1   @ r12= x[6] = s4 + s0
        ADD     r14,r11,r8, LSL #1   @ r14= x[7] = s6 + s2
        STMIA   r1,{r2,r3,r4,r5,r10,r11,r12,r14}

        @ block 2
        ADD     r1,r1,#16*4-8*4
        ADD     r4,r1,#8*4
        LDMIA   r4,{r5,r6,r9,r10}    @ r5 = x[ 8]
                                     @ r6 = x[ 9]
                                     @ r9 = x[10]
                                     @ r10= x[11]
        LDMIA   r1,{r7,r8,r11,r12}   @ r7 = x[0]
                                     @ r8 = x[1]
                                     @ r11= x[2]
                                     @ r12= x[3]
        SUB     r5, r5, r6           @ r5 = s0 = x[ 8] - x[ 9]
        ADD     r6, r5, r6, LSL #1   @ r6 = x[ 8] + x[ 9]  -> x[ 8]
        SUB     r9, r9, r10          @ r9 = s1 = x[10] - x[11]
        ADD     r10,r9, r10,LSL #1   @ r10= x[10] + x[11]  -> x[10]
        SUB     r8, r8, r7           @ r8 = s2 = x[ 1] - x[ 0]
        ADD     r7, r8, r7, LSL #1   @ r7 = x[ 1] + x[ 0]  -> x[ 9]
        SUB     r12,r12,r11          @ r12= s3 = x[ 3] - x[ 2]
        ADD     r11,r12,r11, LSL #1  @ r11= x[ 3] + x[ 2]  -> x[11]
        LDR     r14,cPI2_8
        STMIA   r4!,{r6,r7,r10,r11}

        SUB     r5, r5, r9           @ r5 = s0 - s1
        ADD     r9, r5, r9, LSL #1   @ r9 = s0 + s1
        SMULL   r6, r5, r14,r5       @ (r6,r5)  = (s0-s1)*cPI2_8
        SUB     r12,r12,r8           @ r12= s3 - s2
        ADD     r8, r12,r8, LSL #1   @ r8 = s3 + s2

        SMULL   r6, r8, r14,r8       @ (r6,r8)  = (s3+s2)*cPI2_8
        MOV     r5, r5, LSL #1
        SMULL   r6, r9, r14,r9       @ (r6,r9)  = (s0+s1)*cPI2_8
        MOV     r8, r8, LSL #1
        SMULL   r6, r12,r14,r12      @ (r6,r12) = (s3-s2)*cPI2_8
        MOV     r9, r9, LSL #1
        MOV     r12,r12,LSL #1
        STMIA   r1!,{r5,r8,r9,r12}

        @ block4
        LDMIA   r4,{r5,r6,r9,r10}    @ r5 = x[12]
                                     @ r6 = x[13]
                                     @ r9 = x[14]
                                     @ r10= x[15]
        LDMIA   r1,{r7,r8,r11,r12}   @ r7 = x[ 4]
                                     @ r8 = x[ 5]
                                     @ r11= x[ 6]
                                     @ r12= x[ 7]
        SUB     r5, r5, r6           @ r5 = s2 = x[12] - x[13]
        ADD     r6, r5, r6, LSL #1   @ r6 = x[12] + x[13]  -> x[12]
        SUB     r9, r9, r10          @ r9 = s3 = x[14] - x[15]
        ADD     r10,r9, r10,LSL #1   @ r10= x[14] + x[15]  -> x[14]
        SUB     r14,r7, r8           @ r14= s0 = x[ 4] - x[ 5]
        ADD     r8, r14,r8, LSL #1   @ r8 = x[ 4] + x[ 5]  -> x[13]
        SUB     r7, r12,r11          @ r7 = s1 = x[ 7] - x[ 6]
        ADD     r11,r7, r11, LSL #1  @ r11= x[ 7] + x[ 6]  -> x[15]
        STMIA   r4!,{r6,r8,r10,r11}
        STMIA   r1!,{r5,r7,r9,r14}

        @ mdct_butterfly_8
        LDMDB   r1,{r6,r7,r8,r9,r10,r11,r12,r14}
                                     @ r6 = x[0]
                                     @ r7 = x[1]
                                     @ r8 = x[2]
                                     @ r9 = x[3]
                                     @ r10= x[4]
                                     @ r11= x[5]
                                     @ r12= x[6]
                                     @ r14= x[7]
        ADD     r6, r6, r7           @ r6 = s0 = x[0] + x[1]
        SUB     r7, r6, r7, LSL #1   @ r7 = s1 = x[0] - x[1]
        ADD     r8, r8, r9           @ r8 = s2 = x[2] + x[3]
        SUB     r9, r8, r9, LSL #1   @ r9 = s3 = x[2] - x[3]
        ADD     r10,r10,r11          @ r10= s4 = x[4] + x[5]
        SUB     r11,r10,r11,LSL #1   @ r11= s5 = x[4] - x[5]
        ADD     r12,r12,r14          @ r12= s6 = x[6] + x[7]
        SUB     r14,r12,r14,LSL #1   @ r14= s7 = x[6] - x[7]

        ADD     r2, r11,r9           @ r2 = x[0] = s5 + s3
        SUB     r4, r2, r9, LSL #1   @ r4 = x[2] = s5 - s3
        SUB     r3, r14,r7           @ r3 = x[1] = s7 - s1
        ADD     r5, r3, r7, LSL #1   @ r5 = x[3] = s7 + s1
        SUB     r10,r10,r6           @ r10= x[4] = s4 - s0
        SUB     r11,r12,r8           @ r11= x[5] = s6 - s2
        ADD     r12,r10,r6, LSL #1   @ r12= x[6] = s4 + s0
        ADD     r14,r11,r8, LSL #1   @ r14= x[7] = s6 + s2
        STMDB   r1,{r2,r3,r4,r5,r10,r11,r12,r14}

        @ mdct_butterfly_8
        LDMIA   r1,{r6,r7,r8,r9,r10,r11,r12,r14}
                                     @ r6 = x[0]
                                     @ r7 = x[1]
                                     @ r8 = x[2]
                                     @ r9 = x[3]
                                     @ r10= x[4]
                                     @ r11= x[5]
                                     @ r12= x[6]
                                     @ r14= x[7]
        ADD     r6, r6, r7           @ r6 = s0 = x[0] + x[1]
        SUB     r7, r6, r7, LSL #1   @ r7 = s1 = x[0] - x[1]
        ADD     r8, r8, r9           @ r8 = s2 = x[2] + x[3]
        SUB     r9, r8, r9, LSL #1   @ r9 = s3 = x[2] - x[3]
        ADD     r10,r10,r11          @ r10= s4 = x[4] + x[5]
        SUB     r11,r10,r11,LSL #1   @ r11= s5 = x[4] - x[5]
        ADD     r12,r12,r14          @ r12= s6 = x[6] + x[7]
        SUB     r14,r12,r14,LSL #1   @ r14= s7 = x[6] - x[7]

        ADD     r2, r11,r9           @ r2 = x[0] = s5 + s3
        SUB     r4, r2, r9, LSL #1   @ r4 = x[2] = s5 - s3
        SUB     r3, r14,r7           @ r3 = x[1] = s7 - s1
        ADD     r5, r3, r7, LSL #1   @ r5 = x[3] = s7 + s1
        SUB     r10,r10,r6           @ r10= x[4] = s4 - s0
        SUB     r11,r12,r8           @ r11= x[5] = s6 - s2
        ADD     r12,r10,r6, LSL #1   @ r12= x[6] = s4 + s0
        ADD     r14,r11,r8, LSL #1   @ r14= x[7] = s6 + s2
        STMIA   r1,{r2,r3,r4,r5,r10,r11,r12,r14}

        ADD     r1,r1,#8*4
        SUBS    r0,r0,#64
        BGT     mdct_bufferflies_loop3

        LDMFD   r13,{r0-r3}

mdct_bitreverseARM:
        @ r0 = points = n
        @ r1 = in
        @ r2 = step
        @ r3 = shift

        MOV     r4, #0               @ r4 = bit = 0
        ADD     r5, r1, r0, LSL #1   @ r5 = w = x + (n>>1)
        ADR     r6, bitrev
        SUB     r5, r5, #8
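        @ The bitrev table at the end of this file holds 6-bit bit-reversals;
        @ two lookups (high and low six bits of the counter) are combined
        @ into a 12-bit reversed index, then scaled down by 'shift' for
        @ smaller transforms. Roughly, as a sketch of the intent:
        @   b  = (bitrev[bit >> 6] | (bitrev[bit & 0x3f] << 6)) >> shift;
        @   xx = x + b;
        @   if (w > xx) swap the two-word pairs at w and xx;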
brev_lp:
        LDRB    r7, [r6, r4, LSR #6]
        AND     r8, r4, #0x3f
        LDRB    r8, [r6, r8]
        ADD     r4, r4, #1           @ bit++
        @ stall XScale
        ORR     r7, r7, r8, LSL #6   @ r7 = bitrev[bit]
        MOV     r7, r7, LSR r3
        ADD     r9, r1, r7, LSL #2   @ r9 = xx = x + (b>>shift)
        CMP     r5, r9               @ if (w > xx)
        LDR     r10,[r5],#-8         @   r10 = w[0]        w -= 2
        LDRGT   r11,[r5,#12]         @   r11 = w[1]
        LDRGT   r12,[r9]             @   r12 = xx[0]
        LDRGT   r14,[r9,#4]          @   r14 = xx[1]
        STRGT   r10,[r9]             @   xx[0]= w[0]
        STRGT   r11,[r9,#4]          @   xx[1]= w[1]
        STRGT   r12,[r5,#8]          @   w[0] = xx[0]
        STRGT   r14,[r5,#12]         @   w[1] = xx[1]
        CMP     r5,r1
        BGT     brev_lp

        @ mdct_step7
        @ r0 = points
        @ r1 = in
        @ r2 = step
        @ r3 = shift

        CMP     r2, #4               @ r5 = T = (step>=4) ?
        LDRGE   r5, =sincos_lookup0  @          sincos_lookup0 +
        LDRLT   r5, =sincos_lookup1  @          sincos_lookup1
        ADD     r7, r1, r0, LSL #1   @ r7 = w1 = x + (n>>1)
        ADDGE   r5, r5, r2, LSL #1   @          (step>>1)
        ADD     r8, r5, #1024*4      @ r8 = Ttop
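        @ Step 7 of the inverse MDCT: w0 walks up from the start of the
        @ buffer while w1 walks down from the middle, and each mirrored pair
        @ is recombined with one sin/cos pair from the table. T advances by
        @ 'step' up to the table midpoint (Ttop) in the first loop, then
        @ retreats in the second (a summary of the two loops below).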
step7_loop1:
        LDR     r6, [r1]             @ r6 = w0[0]
        LDR     r9, [r1,#4]          @ r9 = w0[1]
        LDR     r10,[r7,#-8]!        @ r10= w1[0]   w1 -= 2
        LDR     r11,[r7,#4]          @ r11= w1[1]
        LDR     r14,[r5,#4]          @ r14= T[1]
        LDR     r12,[r5],r2,LSL #2   @ r12= T[0]    T += step

        ADD     r6, r6, r10          @ r6 = s0 = w0[0] + w1[0]
        SUB     r10,r6, r10,LSL #1   @ r10= s1b= w0[0] - w1[0]
        SUB     r11,r11,r9           @ r11= s1 = w1[1] - w0[1]
        ADD     r9, r11,r9, LSL #1   @ r9 = s0b= w1[1] + w0[1]

        @ Can save 1 cycle by using SMULL SMLAL - at the cost of being
        @ 1 off.
        SMULL   r0, r3, r6, r14      @ (r0,r3)   = s0*T[1]
        SMULL   r0, r4, r11,r12      @ (r0,r4)  += s1*T[0] = s2
        ADD     r3, r3, r4
        SMULL   r0, r14,r11,r14      @ (r0,r14)  = s1*T[1]
        SMULL   r0, r12,r6, r12      @ (r0,r12) += s0*T[0] = s3
        SUB     r14,r14,r12

        @ r9 = s0b<<1
        @ r10= s1b<<1
        ADD     r9, r3, r9, ASR #1   @ r9 = s0b + s2
        SUB     r3, r9, r3, LSL #1   @ r3 = s0b - s2

        SUB     r12,r14,r10,ASR #1   @ r12= s3 - s1b
        ADD     r10,r14,r10,ASR #1   @ r10= s3 + s1b
        STR     r9, [r1],#4
        STR     r10,[r1],#4          @ w0 += 2
        STR     r3, [r7]
        STR     r12,[r7,#4]

        CMP     r5,r8
        BLT     step7_loop1

step7_loop2:
        LDR     r6, [r1]             @ r6 = w0[0]
        LDR     r9, [r1,#4]          @ r9 = w0[1]
        LDR     r10,[r7,#-8]!        @ r10= w1[0]   w1 -= 2
        LDR     r11,[r7,#4]          @ r11= w1[1]
        LDR     r14,[r5,-r2,LSL #2]! @ r14= T[0]    T -= step
        LDR     r12,[r5,#4]          @ r12= T[1]

        ADD     r6, r6, r10          @ r6 = s0 = w0[0] + w1[0]
        SUB     r10,r6, r10,LSL #1   @ r10= s1b= w0[0] - w1[0]
        SUB     r11,r11,r9           @ r11= s1 = w1[1] - w0[1]
        ADD     r9, r11,r9, LSL #1   @ r9 = s0b= w1[1] + w0[1]

        @ Can save 1 cycle by using SMULL SMLAL - at the cost of being
        @ 1 off.
        SMULL   r0, r3, r6, r14      @ (r0,r3)   = s0*T[0]
        SMULL   r0, r4, r11,r12      @ (r0,r4)  += s1*T[1] = s2
        ADD     r3, r3, r4
        SMULL   r0, r14,r11,r14      @ (r0,r14)  = s1*T[0]
        SMULL   r0, r12,r6, r12      @ (r0,r12) += s0*T[1] = s3
        SUB     r14,r14,r12

        @ r9 = s0b<<1
        @ r10= s1b<<1
        ADD     r9, r3, r9, ASR #1   @ r9 = s0b + s2
        SUB     r3, r9, r3, LSL #1   @ r3 = s0b - s2

        SUB     r12,r14,r10,ASR #1   @ r12= s3 - s1b
        ADD     r10,r14,r10,ASR #1   @ r10= s3 + s1b
        STR     r9, [r1],#4
        STR     r10,[r1],#4          @ w0 += 2
        STR     r3, [r7]
        STR     r12,[r7,#4]

        CMP     r1,r7
        BLT     step7_loop2

        LDMFD   r13!,{r0-r3}

        @ r0 = points
        @ r1 = in
        @ r2 = step
        @ r3 = shift
        MOV     r2, r2, ASR #2       @ r2 = step >>= 2
        CMP     r2, #0
        CMPNE   r2, #1
        BEQ     mdct_end

        @ step > 1 (default case)
        CMP     r2, #4               @ r5 = T = (step>=4) ?
        LDRGE   r5, =sincos_lookup0  @          sincos_lookup0 +
        LDRLT   r5, =sincos_lookup1  @          sincos_lookup1
        ADD     r7, r1, r0, LSL #1   @ r7 = iX = x + (n>>1)
        ADDGE   r5, r5, r2, LSL #1   @          (step>>1)
mdct_step8_default:
        LDR     r6, [r1],#4          @ r6 = s0 = x[0]
        LDR     r8, [r1],#4          @ r8 = -s1 = x[1]
        LDR     r12,[r5,#4]          @ r12= T[1]
        LDR     r14,[r5],r2,LSL #2   @ r14= T[0]    T += step
        RSB     r8, r8, #0           @ r8 = s1

        @ XPROD31(s0, s1, T[0], T[1], x, x+1)
        @ x[0] = s0 * T[0] + s1 * T[1]     x[1] = s1 * T[0] - s0 * T[1]
        SMULL   r9, r10, r8, r12     @ (r9,r10)  = s1 * T[1]
        CMP     r1, r7
        SMLAL   r9, r10, r6, r14     @ (r9,r10) += s0 * T[0]
        RSB     r6, r6, #0           @ r6 = -s0
        SMULL   r9, r11, r8, r14     @ (r9,r11)  = s1 * T[0]
        MOV     r10,r10,LSL #1
        SMLAL   r9, r11, r6, r12     @ (r9,r11) -= s0 * T[1]
        STR     r10,[r1,#-8]
        MOV     r11,r11,LSL #1
        STR     r11,[r1,#-4]
        BLT     mdct_step8_default

mdct_end:
        MOV     r0, r2
        LDMFD   r13!,{r4-r11,PC}

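        @ Fixed-point twiddle constants: cPIn_8 = cos(n*pi/8) in Q31
        @ (e.g. 0x5a82799a ~= 0.70710678 * 2^31).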
cPI1_8:
        .word   0x7641af3d
cPI2_8:
        .word   0x5a82799a
cPI3_8:
        .word   0x30fbc54d

bitrev:
        .byte   0,  32, 16, 48,  8, 40, 24, 56
        .byte   4,  36, 20, 52, 12, 44, 28, 60
        .byte   2,  34, 18, 50, 10, 42, 26, 58
        .byte   6,  38, 22, 54, 14, 46, 30, 62
        .byte   1,  33, 17, 49,  9, 41, 25, 57
        .byte   5,  37, 21, 53, 13, 45, 29, 61
        .byte   3,  35, 19, 51, 11, 43, 27, 59
        .byte   7,  39, 23, 55, 15, 47, 31, 63

        @ END