; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2004 Josh Coalson
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; - Neither the name of the Xiph.org Foundation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.text
.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
; r3: residual[]
; r4: data_len
; r5: qlp_coeff[]
; r6: order
; r7: lp_quantization
; r8: data[]
; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
; this is a PowerPC/AltiVec assembly version which requires bps<=16 (or actual
; bps<=15 for mid-side coding, since that uses an extra bit)
; it should be fast; the inner loop is unrolled (it takes no more than about
; 3*(order/4) instructions, all of which are arithmetic), and all of the
; coefficients and all relevant history stay in registers, so the outer loop
; has only one load from memory (the residual)
; I haven't yet run this through simg4, so there may be some avoidable stalls,
; and there may be a somewhat more clever way to do the outer loop
; the branch mechanism (absolute label addresses loaded via hi16/lo16) is not
; position-independent and may prevent dynamic loading; I still need to examine
; this issue, and there may be a more elegant method
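;
; for reference, a scalar C sketch of the loop this routine implements: an
; approximation written from the register descriptions above and the
; referenced FLAC__lpc_restore_signal(), not copied from lpc.c (the helper
; name restore_signal_16 is illustrative only):
;
;   void restore_signal_16(const int32_t residual[], unsigned data_len,
;                          const int32_t qlp_coeff[], unsigned order,
;                          int lp_quantization, int32_t data[])
;   {
;       /* as in the assembly, data[] is assumed to be preceded by
;          order warm-up samples (history is read from data[-1] back) */
;       for (unsigned i = 0; i < data_len; i++) {
;           int32_t sum = 0; /* 32-bit sums, matching the saturated
;                               32-bit vector adds below */
;           for (unsigned j = 0; j < order; j++)
;               sum += qlp_coeff[j] * data[i - j - 1];
;           data[i] = residual[i] + (sum >> lp_quantization);
;       }
;   }
;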
stmw r31,-4(r1)
addi r9,r1,-28
li r31,0xf
andc r9,r9,r31 ; for quadword-aligned stack data
slwi r6,r6,2 ; adjust for word size
slwi r4,r4,2
add r4,r4,r8 ; r4 = data+data_len
mfspr r0,256 ; cache old vrsave
addis r31,0,hi16(0xfffffc00)
ori r31,r31,lo16(0xfffffc00)
mtspr 256,r31 ; declare VRs in vrsave
cmplw cr0,r8,r4 ; i<data_len
bc 4,0,L1400
; load coefficients into v0-v7 and initial history into v8-v15
li r31,0xf
and r31,r8,r31 ; r31: data%16 == byte offset of data within its quadword
li r11,16
subf r31,r31,r11 ; r31: 16-(data%16) bytes
slwi r31,r31,3 ; convert to bits for vsro
li r10,-4
stw r31,-4(r9)
lvewx v0,r10,r9
vspltisb v18,-1
vsro v18,v18,v0 ; v18: mask vector
li r31,0x8
lvsl v0,0,r31
vsldoi v0,v0,v0,12
li r31,0xc
lvsl v1,0,r31
vspltisb v2,0
vspltisb v3,-1
vmrglw v2,v2,v3
vsel v0,v1,v0,v2 ; v0: reversal permutation vector
add r10,r5,r6
lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector
mr r11,r8
lvsl v16,0,r11 ; v16: history alignment permutation vector
lvx v0,0,r5
addi r5,r5,16
lvx v1,0,r5
vperm v0,v0,v1,v17
lvx v8,0,r11
addi r11,r11,-16
lvx v9,0,r11
vperm v8,v9,v8,v16
cmplw cr0,r5,r10
bc 12,0,L1101
vand v0,v0,v18
addis r31,0,hi16(L1307)
ori r31,r31,lo16(L1307)
b L1199
L1101:
addi r5,r5,16
lvx v2,0,r5
vperm v1,v1,v2,v17
addi r11,r11,-16
lvx v10,0,r11
vperm v9,v10,v9,v16
cmplw cr0,r5,r10
bc 12,0,L1102
vand v1,v1,v18
addis r31,0,hi16(L1306)
ori r31,r31,lo16(L1306)
b L1199
L1102:
addi r5,r5,16
lvx v3,0,r5
vperm v2,v2,v3,v17
addi r11,r11,-16
lvx v11,0,r11
vperm v10,v11,v10,v16
cmplw cr0,r5,r10
bc 12,0,L1103
vand v2,v2,v18
addis r31,0,hi16(L1305)
ori r31,r31,lo16(L1305)
b L1199
L1103:
addi r5,r5,16
lvx v4,0,r5
vperm v3,v3,v4,v17
addi r11,r11,-16
lvx v12,0,r11
vperm v11,v12,v11,v16
cmplw cr0,r5,r10
bc 12,0,L1104
vand v3,v3,v18
addis r31,0,hi16(L1304)
ori r31,r31,lo16(L1304)
b L1199
L1104:
addi r5,r5,16
lvx v5,0,r5
vperm v4,v4,v5,v17
addi r11,r11,-16
lvx v13,0,r11
vperm v12,v13,v12,v16
cmplw cr0,r5,r10
bc 12,0,L1105
vand v4,v4,v18
addis r31,0,hi16(L1303)
ori r31,r31,lo16(L1303)
b L1199
L1105:
addi r5,r5,16
lvx v6,0,r5
vperm v5,v5,v6,v17
addi r11,r11,-16
lvx v14,0,r11
vperm v13,v14,v13,v16
cmplw cr0,r5,r10
bc 12,0,L1106
vand v5,v5,v18
addis r31,0,hi16(L1302)
ori r31,r31,lo16(L1302)
b L1199
L1106:
addi r5,r5,16
lvx v7,0,r5
vperm v6,v6,v7,v17
addi r11,r11,-16
lvx v15,0,r11
vperm v14,v15,v14,v16
cmplw cr0,r5,r10
bc 12,0,L1107
vand v6,v6,v18
addis r31,0,hi16(L1301)
ori r31,r31,lo16(L1301)
b L1199
L1107:
addi r5,r5,16
lvx v19,0,r5
vperm v7,v7,v19,v17
addi r11,r11,-16
lvx v19,0,r11
vperm v15,v19,v15,v16
vand v7,v7,v18
addis r31,0,hi16(L1300)
ori r31,r31,lo16(L1300)
L1199:
mtctr r31
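; ctr now holds the entry point chosen above from order, so the bcctr in the
; loop below enters the unrolled multiply-accumulate chain at the right depth
; (roughly one chunk per four coefficients)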
; set up invariant vectors
vspltish v16,0 ; v16: zero vector
li r10,-12
lvsr v17,r10,r8 ; v17: result shift vector
lvsl v18,r10,r3 ; v18: residual shift back vector
li r10,-4
stw r7,-4(r9)
lvewx v19,r10,r9 ; v19: lp_quantization vector
L1200:
vmulosh v20,v0,v8 ; v20: sum vector
bcctr 20,0
L1300:
vmulosh v21,v7,v15
vsldoi v15,v15,v14,4 ; increment history
vaddsws v20,v20,v21
L1301:
vmulosh v21,v6,v14
vsldoi v14,v14,v13,4
vaddsws v20,v20,v21
L1302:
vmulosh v21,v5,v13
vsldoi v13,v13,v12,4
vaddsws v20,v20,v21
L1303:
vmulosh v21,v4,v12
vsldoi v12,v12,v11,4
vaddsws v20,v20,v21
L1304:
vmulosh v21,v3,v11
vsldoi v11,v11,v10,4
vaddsws v20,v20,v21
L1305:
vmulosh v21,v2,v10
vsldoi v10,v10,v9,4
vaddsws v20,v20,v21
L1306:
vmulosh v21,v1,v9
vsldoi v9,v9,v8,4
vaddsws v20,v20,v21
L1307:
vsumsws v20,v20,v16 ; v20[3]: sum
vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization
lvewx v21,0,r3 ; v21[n]: *residual
vperm v21,v21,v21,v18 ; v21[3]: *residual
vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
vsldoi v18,v18,v18,4 ; increment shift vector
vperm v21,v20,v20,v17 ; v21[n]: shift for storage
vsldoi v17,v17,v17,12 ; increment shift vector
stvewx v21,0,r8
vsldoi v20,v20,v20,12
vsldoi v8,v8,v20,4 ; insert the new value into the history
addi r3,r3,4
addi r8,r8,4
cmplw cr0,r8,r4 ; i<data_len
bc 12,0,L1200
L1400:
mtspr 256,r0 ; restore old vrsave
lmw r31,-4(r1)
blr
_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
; r3: residual[]
; r4: data_len
; r5: qlp_coeff[]
; r6: order
; r7: lp_quantization
; r8: data[]
; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
; this version assumes order<=8; it uses fewer vector registers, which should
; save time in context switches, and has less code, which may improve
; instruction caching
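;
; in terms of the scalar sketch shown above the first routine, this
; specialization is the same loop with order capped at 8: the reversed
; coefficients fit in v0-v1 and the history in v2-v3, so the unrolled
; chain below has at most two multiply-accumulate chunks
;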
stmw r31,-4(r1)
addi r9,r1,-28
li r31,0xf
andc r9,r9,r31 ; for quadword-aligned stack data
slwi r6,r6,2 ; adjust for word size
slwi r4,r4,2
add r4,r4,r8 ; r4 = data+data_len
mfspr r0,256 ; cache old vrsave
addis r31,0,hi16(0xffc00000)
ori r31,r31,lo16(0xffc00000)
mtspr 256,r31 ; declare VRs in vrsave
cmplw cr0,r8,r4 ; i<data_len
bc 4,0,L2400
; load coefficients into v0-v1 and initial history into v2-v3
li r31,0xf
and r31,r8,r31 ; r31: data%16 == byte offset of data within its quadword
li r11,16
subf r31,r31,r11 ; r31: 16-(data%16) bytes
slwi r31,r31,3 ; convert to bits for vsro
li r10,-4
stw r31,-4(r9)
lvewx v0,r10,r9
vspltisb v6,-1
vsro v6,v6,v0 ; v6: mask vector
li r31,0x8
lvsl v0,0,r31
vsldoi v0,v0,v0,12
li r31,0xc
lvsl v1,0,r31
vspltisb v2,0
vspltisb v3,-1
vmrglw v2,v2,v3
vsel v0,v1,v0,v2 ; v0: reversal permutation vector
add r10,r5,r6
lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector
mr r11,r8
lvsl v4,0,r11 ; v4: history alignment permutation vector
lvx v0,0,r5
addi r5,r5,16
lvx v1,0,r5
vperm v0,v0,v1,v5
lvx v2,0,r11
addi r11,r11,-16
lvx v3,0,r11
vperm v2,v3,v2,v4
cmplw cr0,r5,r10
bc 12,0,L2101
vand v0,v0,v6
addis r31,0,hi16(L2301)
ori r31,r31,lo16(L2301)
b L2199
L2101:
addi r5,r5,16
lvx v7,0,r5
vperm v1,v1,v7,v5
addi r11,r11,-16
lvx v7,0,r11
vperm v3,v7,v3,v4
vand v1,v1,v6
addis r31,0,hi16(L2300)
ori r31,r31,lo16(L2300)
L2199:
mtctr r31
; set up invariant vectors
vspltish v4,0 ; v4: zero vector
li r10,-12
lvsr v5,r10,r8 ; v5: result shift vector
lvsl v6,r10,r3 ; v6: residual shift back vector
li r10,-4
stw r7,-4(r9)
lvewx v7,r10,r9 ; v7: lp_quantization vector
L2200:
vmulosh v8,v0,v2 ; v8: sum vector
bcctr 20,0
L2300:
vmulosh v9,v1,v3
vsldoi v3,v3,v2,4
vaddsws v8,v8,v9
L2301:
vsumsws v8,v8,v4 ; v8[3]: sum
vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization
lvewx v9,0,r3 ; v9[n]: *residual
vperm v9,v9,v9,v6 ; v9[3]: *residual
vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
vsldoi v6,v6,v6,4 ; increment shift vector
vperm v9,v8,v8,v5 ; v9[n]: shift for storage
vsldoi v5,v5,v5,12 ; increment shift vector
stvewx v9,0,r8
vsldoi v8,v8,v8,12
vsldoi v2,v2,v8,4 ; insert the new value into the history
addi r3,r3,4
addi r8,r8,4
cmplw cr0,r8,r4 ; i<data_len
bc 12,0,L2200
L2400:
mtspr 256,r0 ; restore old vrsave
lmw r31,-4(r1)
blr