; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2004 Josh Coalson
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; - Neither the name of the Xiph.org Foundation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.text
	.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
; r3: residual[]
; r4: data_len
; r5: qlp_coeff[]
; r6: order
; r7: lp_quantization
; r8: data[]

; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
; this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
; bps<=15 for mid-side coding, since that uses an extra bit)

; these should be fast; the inner loop is unrolled (it takes no more than
; 3*(order/4) instructions, all of which are arithmetic), and all of the
; coefficients and all relevant history stay in registers, so the outer loop
; has only one load from memory (the residual)

; I haven't yet run this through simg4, so there may be some avoidable stalls,
; and there may be a somewhat more clever way to do the outer loop

; the branch mechanism may prevent dynamic loading; I still need to examine
; this issue, and there may be a more elegant method
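
; for reference, a scalar C sketch of the computation this routine performs
; (an illustration only, not part of the build; modeled on
; FLAC__lpc_restore_signal() in src/libFLAC/lpc.c, and like it assumes at
; least `order' warm-up samples precede data[0] and that >> of a negative
; value is an arithmetic shift, as vsraw is below):
;
;   #include <stdint.h>
;
;   void lpc_restore_signal_ref(const int32_t residual[], unsigned data_len,
;                               const int32_t qlp_coeff[], unsigned order,
;                               int lp_quantization, int32_t data[])
;   {
;       for(unsigned i = 0; i < data_len; i++) {
;           int32_t sum = 0;  /* 32-bit accumulator is safe only because bps<=16 */
;           const int32_t *history = data + i;
;           for(unsigned j = 0; j < order; j++)
;               sum += qlp_coeff[j] * *(--history);
;           data[i] = residual[i] + (sum >> lp_quantization);
;       }
;   }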

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xfffffc00)
	ori r31,r31,lo16(0xfffffc00)
	mtspr 256,r31 ; declare VRs in vrsave

	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L1400

	; load coefficients into v0-v7 and initial history into v8-v15
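	; (neither qlp_coeff[] nor the history preceding data[0] is necessarily
	; 16-byte aligned, so each vector register below is assembled from two
	; aligned lvx loads merged through vperm; the running compare of r5
	; against r10 = qlp_coeff+4*order decides how many groups of four
	; coefficients are needed and selects the matching inner-loop entry
	; point L1300-L1307, whose address is held in r31 and moved into the
	; count register at L1199)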
	li r31,0xf
	and r31,r8,r31 ; r31: data%4
	li r11,16
	subf r31,r31,r11 ; r31: 4-(data%4)
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v18,-1
	vsro v18,v18,v0 ; v18: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
	vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v16,0,r11 ; v16: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v17
	lvx v8,0,r11
	addi r11,r11,-16
	lvx v9,0,r11
	vperm v8,v9,v8,v16
	cmplw cr0,r5,r10
	bc 12,0,L1101
	vand v0,v0,v18
	addis r31,0,hi16(L1307)
	ori r31,r31,lo16(L1307)
	b L1199

L1101:
	addi r5,r5,16
	lvx v2,0,r5
	vperm v1,v1,v2,v17
	addi r11,r11,-16
	lvx v10,0,r11
	vperm v9,v10,v9,v16
	cmplw cr0,r5,r10
	bc 12,0,L1102
	vand v1,v1,v18
	addis r31,0,hi16(L1306)
	ori r31,r31,lo16(L1306)
	b L1199

L1102:
	addi r5,r5,16
	lvx v3,0,r5
	vperm v2,v2,v3,v17
	addi r11,r11,-16
	lvx v11,0,r11
	vperm v10,v11,v10,v16
	cmplw cr0,r5,r10
	bc 12,0,L1103
	vand v2,v2,v18
	addis r31,0,hi16(L1305)
	ori r31,r31,lo16(L1305)
	b L1199

L1103:
	addi r5,r5,16
	lvx v4,0,r5
	vperm v3,v3,v4,v17
	addi r11,r11,-16
	lvx v12,0,r11
	vperm v11,v12,v11,v16
	cmplw cr0,r5,r10
	bc 12,0,L1104
	vand v3,v3,v18
	addis r31,0,hi16(L1304)
	ori r31,r31,lo16(L1304)
	b L1199

L1104:
	addi r5,r5,16
	lvx v5,0,r5
	vperm v4,v4,v5,v17
	addi r11,r11,-16
	lvx v13,0,r11
	vperm v12,v13,v12,v16
	cmplw cr0,r5,r10
	bc 12,0,L1105
	vand v4,v4,v18
	addis r31,0,hi16(L1303)
	ori r31,r31,lo16(L1303)
	b L1199

L1105:
	addi r5,r5,16
	lvx v6,0,r5
	vperm v5,v5,v6,v17
	addi r11,r11,-16
	lvx v14,0,r11
	vperm v13,v14,v13,v16
	cmplw cr0,r5,r10
	bc 12,0,L1106
	vand v5,v5,v18
	addis r31,0,hi16(L1302)
	ori r31,r31,lo16(L1302)
	b L1199

L1106:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v6,v6,v7,v17
	addi r11,r11,-16
	lvx v15,0,r11
	vperm v14,v15,v14,v16
	cmplw cr0,r5,r10
	bc 12,0,L1107
	vand v6,v6,v18
	addis r31,0,hi16(L1301)
	ori r31,r31,lo16(L1301)
	b L1199

L1107:
	addi r5,r5,16
	lvx v19,0,r5
	vperm v7,v7,v19,v17
	addi r11,r11,-16
	lvx v19,0,r11
	vperm v15,v19,v15,v16
	vand v7,v7,v18
	addis r31,0,hi16(L1300)
	ori r31,r31,lo16(L1300)

L1199:
	mtctr r31

	; set up invariant vectors
	vspltish v16,0 ; v16: zero vector

	li r10,-12
	lvsr v17,r10,r8 ; v17: result shift vector
	lvsl v18,r10,r3 ; v18: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v19,r10,r9 ; v19: lp_quantization vector

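	; outer loop: each pass forms the products qlp_coeff[j]*data[i-j-1] with
	; vmulosh (both operands are 16-bit values held in the low halfword of
	; each 32-bit element), accumulates them with vaddsws, folds the partial
	; sums with vsumsws, arithmetic-shifts by lp_quantization, adds the
	; residual sample, stores the result to data[], and rotates it into the
	; history registers with vsldoi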
L1200:
	vmulosh v20,v0,v8 ; v20: sum vector
	bcctr 20,0

L1300:
	vmulosh v21,v7,v15
	vsldoi v15,v15,v14,4 ; increment history
	vaddsws v20,v20,v21

L1301:
	vmulosh v21,v6,v14
	vsldoi v14,v14,v13,4
	vaddsws v20,v20,v21

L1302:
	vmulosh v21,v5,v13
	vsldoi v13,v13,v12,4
	vaddsws v20,v20,v21

L1303:
	vmulosh v21,v4,v12
	vsldoi v12,v12,v11,4
	vaddsws v20,v20,v21

L1304:
	vmulosh v21,v3,v11
	vsldoi v11,v11,v10,4
	vaddsws v20,v20,v21

L1305:
	vmulosh v21,v2,v10
	vsldoi v10,v10,v9,4
	vaddsws v20,v20,v21

L1306:
	vmulosh v21,v1,v9
	vsldoi v9,v9,v8,4
	vaddsws v20,v20,v21

L1307:
	vsumsws v20,v20,v16 ; v20[3]: sum
	vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization

	lvewx v21,0,r3 ; v21[n]: *residual
	vperm v21,v21,v21,v18 ; v21[3]: *residual
	vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4 ; increment shift vector

	vperm v21,v20,v20,v17 ; v21[n]: shift for storage
	vsldoi v17,v17,v17,12 ; increment shift vector
	stvewx v21,0,r8

	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4 ; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L1200

L1400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1)
	blr

_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
; r3: residual[]
; r4: data_len
; r5: qlp_coeff[]
; r6: order
; r7: lp_quantization
; r8: data[]

; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
; this version assumes order<=8; it uses fewer vector registers, which should
; save time in context switches, and has less code, which may improve
; instruction caching
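
; (this routine touches only v0-v9, hence the vrsave mask of 0xffc00000
; below, versus 0xfffffc00 / v0-v21 in the full-order routine above)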

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xffc00000)
	ori r31,r31,lo16(0xffc00000)
	mtspr 256,r31 ; declare VRs in vrsave

	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L2400

	; load coefficients into v0-v1 and initial history into v2-v3
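	; (same unaligned-load scheme as in the full-order routine above; with
	; order<=8 at most two coefficient vectors and two history vectors are
	; needed, so a single compare against r10 chooses between the one-group
	; entry point L2301 and the two-group entry point L2300)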
	li r31,0xf
	and r31,r8,r31 ; r31: data%4
	li r11,16
	subf r31,r31,r11 ; r31: 4-(data%4)
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v6,-1
	vsro v6,v6,v0 ; v6: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
	vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11 ; v4: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4
	cmplw cr0,r5,r10
	bc 12,0,L2101
	vand v0,v0,v6
	addis r31,0,hi16(L2301)
	ori r31,r31,lo16(L2301)
	b L2199

L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6
	addis r31,0,hi16(L2300)
	ori r31,r31,lo16(L2300)

L2199:
	mtctr r31

	; set up invariant vectors
	vspltish v4,0 ; v4: zero vector

	li r10,-12
	lvsr v5,r10,r8 ; v5: result shift vector
	lvsl v6,r10,r3 ; v6: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9 ; v7: lp_quantization vector

L2200:
	vmulosh v8,v0,v2 ; v8: sum vector
	bcctr 20,0

L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4 ; v8[3]: sum
	vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization

	lvewx v9,0,r3 ; v9[n]: *residual
	vperm v9,v9,v9,v6 ; v9[3]: *residual
	vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4 ; increment shift vector

	vperm v9,v8,v8,v5 ; v9[n]: shift for storage
	vsldoi v5,v5,v5,12 ; increment shift vector
	stvewx v9,0,r8

	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4 ; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1)
	blr