add brady's first round of altivec implementations
diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am
index 920ddf2..1eb9942 100644
--- a/src/libFLAC/Makefile.am
+++ b/src/libFLAC/Makefile.am
@@ -42,6 +42,10 @@
 libFLAC_la_LIBADD = ia32/libFLAC-asm.la
 endif
 endif
+if FLaC__CPU_PPC
+ARCH_SUBDIRS = ppc
+libFLAC_la_LIBADD = ppc/libFLAC-asm.la
+endif
 endif
 
 SUBDIRS = $(ARCH_SUBDIRS) include .
diff --git a/src/libFLAC/ppc/Makefile.am b/src/libFLAC/ppc/Makefile.am
new file mode 100644
index 0000000..cd39964
--- /dev/null
+++ b/src/libFLAC/ppc/Makefile.am
@@ -0,0 +1,38 @@
+#  libFLAC - Free Lossless Audio Codec library
+#  Copyright (C) 2004  Josh Coalson
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#
+#  - Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  - Redistributions in binary form must reproduce the above copyright
+#  notice, this list of conditions and the following disclaimer in the
+#  documentation and/or other materials provided with the distribution.
+#
+#  - Neither the name of the Xiph.org Foundation nor the names of its
+#  contributors may be used to endorse or promote products derived from
+#  this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SUFFIXES = .s .lo
+
+.s.lo:
+	$(LIBTOOL) --mode=compile as -force_cpusubtype_ALL -o $@ $<
+
+noinst_LTLIBRARIES = libFLAC-asm.la
+libFLAC_asm_la_SOURCES = \
+	lpc_asm.s
diff --git a/src/libFLAC/ppc/lpc_asm.s b/src/libFLAC/ppc/lpc_asm.s
new file mode 100644
index 0000000..fb8af30
--- /dev/null
+++ b/src/libFLAC/ppc/lpc_asm.s
@@ -0,0 +1,428 @@
+;  libFLAC - Free Lossless Audio Codec library
+;  Copyright (C) 2004  Josh Coalson
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;
+;  - Redistributions of source code must retain the above copyright
+;  notice, this list of conditions and the following disclaimer.
+;
+;  - Redistributions in binary form must reproduce the above copyright
+;  notice, this list of conditions and the following disclaimer in the
+;  documentation and/or other materials provided with the distribution.
+;
+;  - Neither the name of the Xiph.org Foundation nor the names of its
+;  contributors may be used to endorse or promote products derived from
+;  this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+.text
+	.align 2
+.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
+.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
+
+_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
+;	r3: residual[]
+;	r4: data_len
+;	r5: qlp_coeff[]
+;	r6: order
+;	r7: lp_quantization
+;	r8: data[]
+
+; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
+; this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
+; bps<=15 for mid-side coding, since that uses an extra bit)
+
+; this should be fast; the inner loop is unrolled (it takes no more than
+; 3*(order%4) instructions, all of which are arithmetic), and all of the
+; coefficients and all relevant history stay in registers, so the outer loop
+; has only one load from memory (the residual)
+
+; I haven't yet run this through simg4, so there may be some avoidable stalls,
+; and there may be a somewhat more clever way to do the outer loop
+
+; the branch mechanism may prevent dynamic loading; I still need to examine
+; this issue, and there may be a more elegant method
+
+
+	stmw r31,-4(r1)
+
+	addi r9,r1,-28
+	li r31,0xf
+	andc r9,r9,r31 ; for quadword-aligned stack data
+
+	slwi r6,r6,2 ; adjust for word size
+	slwi r4,r4,2
+	add r4,r4,r8 ; r4 = data+data_len
+
+	mfspr r0,256 ; cache old vrsave
+	addis r31,0,hi16(0xfffffc00)
+	ori r31,r31,lo16(0xfffffc00)
+	mtspr 256,r31 ; declare VRs in vrsave
+
+	cmplw cr0,r8,r4 ; i<data_len
+	bc 4,0,L1400
+
+	; load coefficients into v0-v7 and initial history into v8-v15
+	li r31,0xf
+	and r31,r8,r31 ; r31: data%4
+	li r11,16
+	subf r31,r31,r11 ; r31: 4-(data%4)
+	slwi r31,r31,3 ; convert to bits for vsro
+	li r10,-4
+	stw r31,-4(r9)
+	lvewx v0,r10,r9
+	vspltisb v18,-1
+	vsro v18,v18,v0 ; v18: mask vector
+
+	li r31,0x8
+	lvsl v0,0,r31
+	vsldoi v0,v0,v0,12
+	li r31,0xc
+	lvsl v1,0,r31
+	vspltisb v2,0
+	vspltisb v3,-1
+	vmrglw v2,v2,v3
+	vsel v0,v1,v0,v2 ; v0: reversal permutation vector
+
+	add r10,r5,r6
+	lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
+	vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector
+
+	mr r11,r8
+	lvsl v16,0,r11 ; v16: history alignment permutation vector
+
+	lvx v0,0,r5
+	addi r5,r5,16
+	lvx v1,0,r5
+	vperm v0,v0,v1,v17
+	lvx v8,0,r11
+	addi r11,r11,-16
+	lvx v9,0,r11
+	vperm v8,v9,v8,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1101
+	vand v0,v0,v18
+	addis r31,0,hi16(L1307)
+	ori r31,r31,lo16(L1307)
+	b L1199
+
+L1101:
+	addi r5,r5,16
+	lvx v2,0,r5
+	vperm v1,v1,v2,v17
+	addi r11,r11,-16
+	lvx v10,0,r11
+	vperm v9,v10,v9,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1102
+	vand v1,v1,v18
+	addis r31,0,hi16(L1306)
+	ori r31,r31,lo16(L1306)
+	b L1199
+
+L1102:
+	addi r5,r5,16
+	lvx v3,0,r5
+	vperm v2,v2,v3,v17
+	addi r11,r11,-16
+	lvx v11,0,r11
+	vperm v10,v11,v10,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1103
+	vand v2,v2,v18
+	addis r31,0,hi16(L1305)
+	ori r31,r31,lo16(L1305)
+	b L1199
+
+L1103:
+	addi r5,r5,16
+	lvx v4,0,r5
+	vperm v3,v3,v4,v17
+	addi r11,r11,-16
+	lvx v12,0,r11
+	vperm v11,v12,v11,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1104
+	vand v3,v3,v18
+	addis r31,0,hi16(L1304)
+	ori r31,r31,lo16(L1304)
+	b L1199
+
+L1104:
+	addi r5,r5,16
+	lvx v5,0,r5
+	vperm v4,v4,v5,v17
+	addi r11,r11,-16
+	lvx v13,0,r11
+	vperm v12,v13,v12,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1105
+	vand v4,v4,v18
+	addis r31,0,hi16(L1303)
+	ori r31,r31,lo16(L1303)
+	b L1199
+
+L1105:
+	addi r5,r5,16
+	lvx v6,0,r5
+	vperm v5,v5,v6,v17
+	addi r11,r11,-16
+	lvx v14,0,r11
+	vperm v13,v14,v13,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1106
+	vand v5,v5,v18
+	addis r31,0,hi16(L1302)
+	ori r31,r31,lo16(L1302)
+	b L1199
+
+L1106:
+	addi r5,r5,16
+	lvx v7,0,r5
+	vperm v6,v6,v7,v17
+	addi r11,r11,-16
+	lvx v15,0,r11
+	vperm v14,v15,v14,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1107
+	vand v6,v6,v18
+	addis r31,0,hi16(L1301)
+	ori r31,r31,lo16(L1301)
+	b L1199
+
+L1107:
+	addi r5,r5,16
+	lvx v19,0,r5
+	vperm v7,v7,v19,v17
+	addi r11,r11,-16
+	lvx v19,0,r11
+	vperm v15,v19,v15,v16
+	vand v7,v7,v18
+	addis r31,0,hi16(L1300)
+	ori r31,r31,lo16(L1300)
+
+L1199:
+	mtctr r31
+
+	; set up invariant vectors
+	vspltish v16,0 ; v16: zero vector
+
+	li r10,-12
+	lvsr v17,r10,r8 ; v17: result shift vector
+	lvsl v18,r10,r3 ; v18: residual shift back vector
+
+	li r10,-4
+	stw r7,-4(r9)
+	lvewx v19,r10,r9 ; v19: lp_quantization vector
+
+L1200:
+	vmulosh v20,v0,v8 ; v20: sum vector
+	bcctr 20,0
+
+L1300:
+	vmulosh v21,v7,v15
+	vsldoi v15,v15,v14,4 ; increment history
+	vaddsws v20,v20,v21
+
+L1301:
+	vmulosh v21,v6,v14
+	vsldoi v14,v14,v13,4
+	vaddsws v20,v20,v21
+
+L1302:
+	vmulosh v21,v5,v13
+	vsldoi v13,v13,v12,4
+	vaddsws v20,v20,v21
+
+L1303:
+	vmulosh v21,v4,v12
+	vsldoi v12,v12,v11,4
+	vaddsws v20,v20,v21
+
+L1304:
+	vmulosh v21,v3,v11
+	vsldoi v11,v11,v10,4
+	vaddsws v20,v20,v21
+
+L1305:
+	vmulosh v21,v2,v10
+	vsldoi v10,v10,v9,4
+	vaddsws v20,v20,v21
+
+L1306:
+	vmulosh v21,v1,v9
+	vsldoi v9,v9,v8,4
+	vaddsws v20,v20,v21
+
+L1307:
+	vsumsws v20,v20,v16 ; v20[3]: sum
+	vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization
+
+	lvewx v21,0,r3 ; v21[n]: *residual
+	vperm v21,v21,v21,v18 ; v21[3]: *residual
+	vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
+	vsldoi v18,v18,v18,4 ; increment shift vector
+
+	vperm v21,v20,v20,v17 ; v21[n]: shift for storage
+	vsldoi v17,v17,v17,12 ; increment shift vector
+	stvewx v21,0,r8
+
+	vsldoi v20,v20,v20,12
+	vsldoi v8,v8,v20,4 ; insert value onto history
+
+	addi r3,r3,4
+	addi r8,r8,4
+	cmplw cr0,r8,r4 ; i<data_len
+	bc 12,0,L1200
+
+L1400:
+	mtspr 256,r0 ; restore old vrsave
+	lmw r31,-4(r1)
+	blr
+
+_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
+;	r3: residual[]
+;	r4: data_len
+;	r5: qlp_coeff[]
+;	r6: order
+;	r7: lp_quantization
+;	r8: data[]
+
+; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
+; this version assumes order<=8; it uses fewer vector registers, which should
+; save time in context switches, and has less code, which may improve
+; instruction caching
+
+	stmw r31,-4(r1)
+
+	addi r9,r1,-28
+	li r31,0xf
+	andc r9,r9,r31 ; for quadword-aligned stack data
+
+	slwi r6,r6,2 ; adjust for word size
+	slwi r4,r4,2
+	add r4,r4,r8 ; r4 = data+data_len
+
+	mfspr r0,256 ; cache old vrsave
+	addis r31,0,hi16(0xffc00000)
+	ori r31,r31,lo16(0xffc00000)
+	mtspr 256,r31 ; declare VRs in vrsave
+
+	cmplw cr0,r8,r4 ; i<data_len
+	bc 4,0,L2400
+
+	; load coefficients into v0-v1 and initial history into v2-v3
+	li r31,0xf
+	and r31,r8,r31 ; r31: data%4
+	li r11,16
+	subf r31,r31,r11 ; r31: 4-(data%4)
+	slwi r31,r31,3 ; convert to bits for vsro
+	li r10,-4
+	stw r31,-4(r9)
+	lvewx v0,r10,r9
+	vspltisb v6,-1
+	vsro v6,v6,v0 ; v6: mask vector
+
+	li r31,0x8
+	lvsl v0,0,r31
+	vsldoi v0,v0,v0,12
+	li r31,0xc
+	lvsl v1,0,r31
+	vspltisb v2,0
+	vspltisb v3,-1
+	vmrglw v2,v2,v3
+	vsel v0,v1,v0,v2 ; v0: reversal permutation vector
+
+	add r10,r5,r6
+	lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
+	vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector
+
+	mr r11,r8
+	lvsl v4,0,r11 ; v4: history alignment permutation vector
+
+	lvx v0,0,r5
+	addi r5,r5,16
+	lvx v1,0,r5
+	vperm v0,v0,v1,v5
+	lvx v2,0,r11
+	addi r11,r11,-16
+	lvx v3,0,r11
+	vperm v2,v3,v2,v4
+	cmplw cr0,r5,r10
+	bc 12,0,L2101
+	vand v0,v0,v6
+	addis r31,0,hi16(L2301)
+	ori r31,r31,lo16(L2301)
+	b L2199
+
+L2101:
+	addi r5,r5,16
+	lvx v7,0,r5
+	vperm v1,v1,v7,v5
+	addi r11,r11,-16
+	lvx v7,0,r11
+	vperm v3,v7,v3,v4
+	vand v1,v1,v6
+	addis r31,0,hi16(L2300)
+	ori r31,r31,lo16(L2300)
+
+L2199:
+	mtctr r31
+
+	; set up invariant vectors
+	vspltish v4,0 ; v4: zero vector
+
+	li r10,-12
+	lvsr v5,r10,r8 ; v5: result shift vector
+	lvsl v6,r10,r3 ; v6: residual shift back vector
+
+	li r10,-4
+	stw r7,-4(r9)
+	lvewx v7,r10,r9 ; v7: lp_quantization vector
+
+L2200:
+	vmulosh v8,v0,v2 ; v8: sum vector
+	bcctr 20,0
+
+L2300:
+	vmulosh v9,v1,v3
+	vsldoi v3,v3,v2,4
+	vaddsws v8,v8,v9
+
+L2301:
+	vsumsws v8,v8,v4 ; v8[3]: sum
+	vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization
+
+	lvewx v9,0,r3 ; v9[n]: *residual
+	vperm v9,v9,v9,v6 ; v9[3]: *residual
+	vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
+	vsldoi v6,v6,v6,4 ; increment shift vector
+
+	vperm v9,v8,v8,v5 ; v9[n]: shift for storage
+	vsldoi v5,v5,v5,12 ; increment shift vector
+	stvewx v9,0,r8
+
+	vsldoi v8,v8,v8,12
+	vsldoi v2,v2,v8,4 ; insert value onto history
+
+	addi r3,r3,4
+	addi r8,r8,4
+	cmplw cr0,r8,r4 ; i<data_len
+	bc 12,0,L2200
+
+L2400:
+	mtspr 256,r0 ; restore old vrsave
+	lmw r31,-4(r1)
+	blr
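
Note (not part of the patch): both routines implement the same recurrence as the scalar FLAC__lpc_restore_signal() in src/libFLAC/lpc.c that the comments above reference: each output sample is the residual plus the LPC prediction shifted right by lp_quantization. Below is a minimal C sketch of that recurrence for illustration only; the parameter layout is taken from the r3-r8 register comments, the name lpc_restore_signal_ref is made up here, and this is not the libFLAC source.

#include <stdint.h>

/* Reference recurrence mirrored by the assembly above.
 * Assumes data[] is preceded by at least 'order' warm-up samples
 * (the history the vector code preloads into registers). */
static void lpc_restore_signal_ref(const int32_t residual[], unsigned data_len,
                                   const int32_t qlp_coeff[], unsigned order,
                                   int lp_quantization, int32_t data[])
{
	unsigned i;
	int j;
	for (i = 0; i < data_len; i++) {
		/* 64-bit accumulator for safety in plain C; the AltiVec code can
		 * use 16x16->32 multiplies with saturating 32-bit adds because it
		 * requires bps<=16, as noted in the comments above. */
		int64_t sum = 0;
		for (j = 0; j < (int)order; j++)
			sum += (int64_t)qlp_coeff[j] * (int64_t)data[(int)i - 1 - j];
		data[i] = residual[i] + (int32_t)(sum >> lp_quantization);
	}
}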