Tremolo (an ARM optimised version of the Tremor library for doing Ogg Vorbis decompression)
diff --git a/Tremolo/dpen.s b/Tremolo/dpen.s
new file mode 100644
index 0000000..278b061
--- /dev/null
+++ b/Tremolo/dpen.s
@@ -0,0 +1,459 @@
+@ Tremolo library
+@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+
+    .text
+
+	.global	decode_packed_entry_number
+	.global decode_packed_entry_number_REALSTART
+	.global decode_map
+	.global vorbis_book_decodevv_add
+	.global _checksum
+
+	.extern	oggpack_adv
+	.extern	oggpack_look
+	.extern	oggpack_eop
+	.extern	crc_lookup
+
+decode_packed_entry_number_REALSTART:	@ failure exits are placed ahead of the entry point below
+dpen_nobits:
+	MOV	r0,r5		@ r0 = b
+	MOV	r1,#1		@ r1 = 1
+	BL	oggpack_adv	@ oggpack_adv(b,1)      /* Force eop */
+duff:				@ shared "return -1" exit (also branched to from meth1/meth3)
+	MVN	r0,#0		@ return -1
+	LDMFD	r13!,{r4-r8,r10,PC}	@ restore the registers saved on entry and return
+
+dpen_readfailed:		@ reached when oggpack_look returned a negative value
+	SUBS	r4,r4,#1	@ r4 = --read
+	BEQ	dpen_nobits	@ read reached 0 => give up
+	MOV	r0,r5		@ r0 = b
+	MOV	r1,r4		@ r1 = read
+	ADR	r14,dpen_read_return	@ hand-built call: r14 (return address) = dpen_read_return
+	B	oggpack_look	@ lok = oggpack_look(b, read), resumes at dpen_read_return
+
+decode_packed_entry_number:
+	@ r0 = codebook       *book
+	@ r1 = oggpack_buffer *b
+	STMFD	r13!,{r4-r8,r10,r14}	@ save work registers and return address
+
+	LDMIA	r0,{r4,r6,r7}		@ r4 = read = book->max_length
+					@ r6 = book->dec_table
+					@ r7 = book->dec_method
+	MOV	r5,r1		@ r5 = b
+
+	MOV	r0,r5		@ r0 = b
+	MOV	r1,r4		@ r1 = read
+	BL	oggpack_look	@ r0 = lok = oggpack_look(b, read)
+dpen_read_return:
+	CMP	r0,#0
+	BLT	dpen_readfailed	@ lok < 0 => retry with one bit fewer
+
+	@ r0 = lok
+	@ r4 = read
+	@ r5 = b
+	@ r6 = dec_table
+	@ r7 = dec_method
+
+	CMP	r7, #3		@ dispatch on dec_method
+	BGT	meth4		@ dec_method >  3
+	BEQ	meth3		@ dec_method == 3
+	CMP	r7, #1
+	BGT	meth2		@ dec_method == 2
+	BEQ	meth1		@ dec_method == 1
+meth0:				@ dec_method <  1: byte table, 0x80 flag ends the walk
+	RSB	r1, r4, #0		@ r1 = i-read = 0-read
+	MOV	r7, #0			@ r7 = chase
+m0_loop:
+	MOVS	r0, r0, LSR #1		@ r0 = lok>>1   C = bottom bit
+	ADC	r2, r6, r7, LSL #1	@ r2 = &t[chase*2+C]
+	LDRB	r7, [r2]		@ r7 = chase = t[chase*2+C]
+	ADDS	r1, r1, #1		@ r1 = i-read++ (i-read<0 => i<read)
+	@ stall Xscale
+	CMPLT	r7, #0x80		@ only if i<read: compare chase with 0x80
+	BLT	m0_loop			@ loop while i<read and chase<0x80
+	AND	r7, r7, #0x7F		@ r7 = chase
+	CMP	r1, #0			@ if (i-read >= 0) === (i >= read)
+	MVNGT	r7, #0			@ if (i >= read) value to return = -1
+	ADD	r1, r1, r4		@ r1 = i-read+read+1 = i +1
+	MOV	r0, r5			@ r0 = b
+	BL	oggpack_adv		@ oggpack_adv(b, i+1);
+	MOV	r0, r7			@ return chase
+	LDMFD	r13!,{r4-r8,r10,PC}	@ restore saved registers and return
+
+meth1:					@ dec_method == 1: byte table, result built from two bytes
+	@ r0 = lok
+	@ r4 = read
+	@ r5 = b
+	@ r6 = dec_table
+	RSB	r1, r4, #0		@ r1 = i = -read
+	MOV	r10,#0			@ r10= next = 0
+m1_loop:
+	MOV	r7, r10			@ r7 = chase=next
+	MOVS	r0, r0, LSR #1		@ r0 = lok>>1     C = bottom bit
+	ADC	r8, r6, r7		@ r8 = t+chase+bit
+	LDRB	r10,[r8], -r6		@ r10= next=t[chase+bit] r8=chase+bit
+	ADDS	r1, r1, #1		@ r1 = i++
+	@ stall Xscale
+	CMPLT	r10,#0x80		@ if (next & 0x80) == 0
+	BLT	m1_loop			@ loop while i<0 and next<0x80
+
+	ADD	r1, r1, r4		@ r1 = i+read
+	MOV	r0, r5			@ r0 = b
+	BL	oggpack_adv		@ oggpack_adv(b, i)
+
+	CMP	r10,#0x80
+	BLT	duff			@ next never had 0x80 set => return -1
+
+	CMP	r8, r7			@ if bit==0 (chase+bit==chase) (sets C)
+	LDRNEB	r14,[r6, r7]		@ r14= t[chase]
+	MOVEQ	r14,#128		@ bit==0: force the 0x80 flag so the LSR #7 below yields 1
+	ADC	r12,r8, r6		@ r12= chase+bit+1+t
+	LDRB	r14,[r12,r14,LSR #7]	@ r14= t[chase+bit+1+(!bit || t[chase]&0x80)]
+	BIC	r10,r10,#0x80		@ r10= next &= ~0x80
+	@ stall Xscale
+	ORR	r0, r14,r10,LSL #8	@ r0 = chase = (next<<8) | r14
+
+	LDMFD	r13!,{r4-r8,r10,PC}	@ return chase (in r0)
+
+
+meth2:					@ dec_method == 2: halfword table, 0x8000 flag ends the walk
+	RSB	r1, r4, #0		@ r1 = i-read = 0-read
+	MOV	r7, #0			@ r7 = chase
+	MOV	r6, r6, LSR #1		@ r6 = t>>1; the [r2, r2] load below doubles it back
+m2_loop:
+	MOVS	r0, r0, LSR #1		@ r0 = lok>>1   C = bottom bit
+	ADC	r2, r6, r7, LSL #1	@ r2 = (t>>1)+chase*2+C
+	LDRH	r7, [r2, r2]		@ r7 = chase = halfword at address 2*r2
+	ADDS	r1, r1, #1		@ r1 = i-read++ (i-read<0 => i<read)
+	@ stall Xscale
+	CMPLT	r7, #0x8000		@ only if i<read: compare chase with 0x8000
+	BLT	m2_loop			@ loop while i<read and chase<0x8000
+	BIC	r7, r7, #0x8000		@ r7 = chase
+	CMP	r1, #0			@ if (i-read >= 0) === (i >= read)
+	MVNGT	r7, #0			@ if (i >= read) value to return = -1
+	ADD	r1, r1, r4		@ r1 = i-read+read+1 = i +1
+	MOV	r0, r5			@ r0 = b
+	BL	oggpack_adv		@ oggpack_adv(b, i+1);
+	MOV	r0, r7			@ return chase
+	LDMFD	r13!,{r4-r8,r10,PC}	@ restore saved registers and return
+
+meth3:					@ dec_method == 3: halfword table, result built from two halfwords
+	@ r0 = lok
+	@ r4 = read
+	@ r5 = b
+	@ r6 = dec_table
+	RSB	r1, r4, #0		@ r1 = i = -read
+	MOV	r10,#0			@ r10= next = 0
+m3_loop:
+	MOV	r7, r10			@ r7 = chase=next
+	MOVS	r0, r0, LSR #1		@ r0 = lok>>1     C = bottom bit
+	ADC	r8, r7, #0		@ r8 = chase+bit
+	MOV	r8, r8, LSL #1		@ r8 = (chase+bit)<<1
+	LDRH	r10,[r6, r8]		@ r10= next=t[chase+bit]
+	ADDS	r1, r1, #1		@ r1 = i++
+	@ stall Xscale
+	CMPLT	r10,#0x8000		@ if (next & 0x8000) == 0
+	BLT	m3_loop			@ loop while i<0 and next<0x8000
+
+	ADD	r1, r1, r4		@ r1 = i+read
+	MOV	r0, r5			@ r0 = b
+	BL	oggpack_adv		@ oggpack_adv(b, i)
+
+	CMP	r10,#0x8000
+	BLT	duff			@ next never had 0x8000 set => return -1
+
+	MOV	r7, r7, LSL #1		@ r7 = chase<<1 (byte offset of t[chase])
+	CMP	r8, r7			@ if bit==0 (chase+bit==chase) sets C
+	LDRNEH	r14,[r6, r7]		@ r14= t[chase]
+	MOVEQ	r14,#0x8000		@ bit==0: force the 0x8000 flag so the LSR #15 below yields 1
+	ADC	r12,r8, r14,LSR #15	@ r12= 1+((chase+bit)<<1)+(!bit || t[chase]&0x8000)
+	ADC	r12,r12,r14,LSR #15	@ r12= (1+chase+bit+(!bit || t[chase]&0x8000))<<1
+	LDRH	r14,[r6, r12]		@ r14= t[chase+bit+1+(!bit || t[chase]&0x8000)]
+	BIC	r10,r10,#0x8000		@ r10= next &= ~0x8000
+	@ stall Xscale
+	ORR	r0, r14,r10,LSL #16	@ r0 = chase = (next<<16) | r14
+
+	LDMFD	r13!,{r4-r8,r10,PC}	@ return chase (in r0)
+
+meth4:					@ dec_method > 3: table of 32-bit words, bit 31 ends the walk
+	RSB	r1, r4, #0		@ r1 = i-read = 0-read
+	MOV	r7, #0			@ r7 = chase
+m4_loop:
+	MOVS	r0, r0, LSR #1		@ r0 = lok>>1   C = bottom bit
+	ADC	r2, r7, r7		@ r2 = chase*2+C
+	LDR	r7, [r6, r2, LSL #2]	@ r7 = chase = t[chase*2+C]
+	ADDS	r1, r1, #1		@ r1 = i-read++ (i-read<0 => i<read)
+	@ stall Xscale
+	MVNLTS	r2, r7			@ only if i<read: N = !(chase bit 31), V stays 0 (r2 is scratch)
+	BLT	m4_loop			@ loop while i<read and bit 31 clear (a signed CMP with #0x80000000 never gives LT)
+	BIC	r7, r7, #0x80000000	@ r7 = chase
+	CMP	r1, #0			@ if (i-read >= 0) === (i >= read)
+	MVNGT	r7, #0			@ if (i >= read) value to return = -1
+	ADD	r1, r1, r4		@ r1 = i-read+read+1 = i +1
+	MOV	r0, r5			@ r0 = b
+	BL	oggpack_adv		@ oggpack_adv(b, i+1);
+	MOV	r0, r7			@ return chase
+	LDMFD	r13!,{r4-r8,r10,PC}	@ restore saved registers and return
+
+decode_map:				@ returns 0 on success, -1 via dm_duff
+	@ r0 = codebook *s
+	@ r1 = oggpack_buffer *b
+	@ r2 = int v
+	@ r3 = int point
+	STMFD	r13!,{r4-r11,r14}	@ save work registers and return address
+
+	MOV	r4, r0		@ r4 = s
+	MOV	r5, r1		@ r5 = b
+	MOV	r6, r2		@ r6 = v
+	MOV	r7, r3		@ r7 = point
+	BL	decode_packed_entry_number	@ r0 = decode_packed_entry_number(s, b)
+	MOV	r8, r0		@ r8 = entry
+
+	MOV	r0, r5		@ r0 = b
+	BL	oggpack_eop	@ r0 = oggpack_eop(b)
+	CMP	r0, #0
+	BNE	dm_duff		@ non-zero => return -1
+
+	@ r4 = s
+	@ r5 = b
+	@ r6 = v
+	@ r7 = point
+	@ r8 = entry
+
+	LDR	r1, [r4,#12]	@ r1 = s->dec_type
+	LDR	r2, [r4,#16]	@ r2 = s->q_bits
+	LDR	r3, [r4,#20]	@ r3 = s->dim
+	LDR	r5, [r4,#24]	@ r5 = s->q_delp
+	LDR	r11,[r4,#28]	@ r11= s->q_minp
+	LDR	r12,[r4,#32]	@ r12= s->q_del = mul
+	LDR	r14,[r4,#36]	@ r14= s->q_min
+	SUBS	r11,r7, r11	@ r11= add    = point - s->q_minp
+
+	MOVGT	r14,r14,ASR r11	@ r14= add = s->q_min >> add  (if add >0)
+	RSBLT	r11,r11,#0	@ r11= -add                   (if add < 0)
+	MOVLT	r14,r14,LSL r11	@ r14= add = s->q_min << -add (if add < 0)
+
+	SUBS	r5, r7, r5	@ r5 = shiftM = point - s->q_delp
+	LDR	r7, [r4,#40]	@ r7 = s->q_seq
+	RSBLT	r5, r5, #0	@ if (shiftM<0)  r5 =-shiftM
+	MOVLT	r12,r12,LSL r5	@                r12=mul<<-shiftM
+	MOVLT	r5, #0		@                r5 =shiftM = 0
+	MOVGT	r14,r14,LSL r5	@ add <<= shiftM
+
+	CMP	r7,#0		@ seqMask = (s->q_seq?-1:0)
+	MVNNE	r7,#0		@ r7 = seqMask = -1 when q_seq != 0 (otherwise it is already 0)
+
+	CMP	r1, #2		@ dispatch on dec_type
+	BEQ	dm2		@ dec_type == 2
+	BGT	dm3		@ dec_type >  2
+	CMP	r1,#0		@ probably never happens
+	BLE	dm_duff		@ dec_type <= 0 => return -1
+dm1:
+	@ r1 = s->dec_type
+	@ r2 = s->q_bits
+	@ r3 = s->dim
+	@ r5 = shiftM
+	@ r6 = v
+	@ r7 = seqMask
+	@ r8 = entry
+	@ r12= mul
+	@ r14= add
+	MOV	r0, #1
+	RSB	r0, r0, r0, LSL r2	@ r0 = mask = (1<<s->q_bits)-1
+	MOV	r11,#0			@ r11= prev = 0
+dm1_loop:
+	AND	r1, r8, r0		@ r1 = v = entry & mask
+	MLA	r1, r12, r1, r14	@ r1 = (add + mul*v)
+	MOV	r8, r8, LSR r2		@ r8 = entry>>s->q_bits
+	SUBS	r3, r3, #1		@ dim--; flags consumed by the BGT below
+	ADD	r1, r11,r1, ASR r5	@ r1 = v = prev+((add+mul*v)>>shiftM)
+	AND	r11,r1, r7		@ r11= prev = seqMask & v
+	STR	r1, [r6], #4		@ *v++ = v
+	BGT	dm1_loop		@ loop while dim > 0
+
+	MOV	r0, #0			@ return 0
+	LDMFD	r13!,{r4-r11,PC}	@ restore saved registers and return
+dm2:					@ dec_type == 2: each q_pack-bit field of entry indexes q_val
+	@ r1 = s->dec_type
+	@ r2 = s->q_bits
+	@ r3 = s->dim
+	@ r4 = s
+	@ r5 = shiftM
+	@ r6 = v
+	@ r7 = seqMask
+	@ r8 = entry
+	@ r12= mul
+	@ r14= add
+	LDR	r1, [r4,#44]		@ r1 = s->q_pack
+	LDR	r4, [r4,#48]		@ r4 = s->q_val
+	MOV	r11,#0			@ r11= prev
+	MOV	r0, #1
+	RSB	r0, r0, r0, LSL r1	@ r0 = mask = (1<<s->q_pack)-1
+	CMP	r2,#8
+	BGT	dm2_hword		@ q_bits > 8 => q_val holds halfwords
+dm2_loop:
+	AND	r2, r8, r0		@ r2 = entry & mask
+	LDRB	r2, [r4, r2]		@ r2 = v = q->val[entry & mask]
+	MOV	r8, r8, LSR r1		@ r8 = entry>>q_pack
+	MLA	r2, r12,r2, r14		@ r2 = (add+mul*v)
+	SUBS	r3, r3, #1		@ dim--; flags consumed by the BGT below
+	ADD	r2, r11,r2, ASR r5	@ r2 = v = prev+(add+mul*v)>>shiftM
+	AND	r11,r2, r7		@ r11= prev = seqMask & v
+	STR	r2, [r6], #4		@ *v++ = v
+	BGT	dm2_loop		@ loop while dim > 0
+	MOV	r0, #0			@ return 0
+	LDMFD	r13!,{r4-r11,PC}	@ restore saved registers and return
+
+dm2_hword:
+	AND	r2, r8, r0		@ r2 = entry & mask
+	MOV	r2, r2, LSL #1		@ r2 = 2*r2
+	LDRH	r2, [r4, r2]		@ r2 = v = q->val[entry & mask]
+	MOV	r8, r8, LSR r1		@ r8 = entry>>q_pack
+	MLA	r2, r12,r2, r14		@ r2 = (add+mul*v)
+	SUBS	r3, r3, #1		@ dim--; flags consumed by the BGT below
+	ADD	r2, r11,r2, ASR r5	@ r2 = v = prev+(add+mul*v)>>shiftM
+	AND	r11,r2, r7		@ r11= prev = seqMask & v
+	STR	r2, [r6], #4		@ *v++ = v
+	BGT	dm2_hword		@ loop while dim > 0
+	MOV	r0, #0			@ return 0
+	LDMFD	r13!,{r4-r11,PC}	@ restore saved registers and return
+
+dm3:					@ dec_type > 2: values are read from q_val + entry*q_pack
+	@ r1 = s->dec_type
+	@ r2 = s->q_bits
+	@ r3 = s->dim
+	@ r4 = s
+	@ r5 = shiftM
+	@ r6 = v
+	@ r7 = seqMask
+	@ r8 = entry
+	@ r12= mul
+	@ r14= add
+	LDR	r1, [r4,#44]		@ r1 = s->q_pack
+	LDR	r4, [r4,#48]		@ r4 = s->q_val (offset 48, the word after q_pack, as dm2 uses)
+	CMP	r2,#8			@ q_bits > 8 ? (flags survive the MOV/MLA below)
+	MOV	r11,#0			@ r11= prev
+	MLA	r4,r1,r8,r4		@ r4 = ptr = s->q_val+entry*s->q_pack
+
+	BGT	dm3_hword		@ q_bits > 8 => ptr addresses halfwords
+dm3_loop:
+	LDRB	r2, [r4], #1		@ r2 = v = *ptr++
+	SUBS	r3, r3, #1		@ dim--; flags consumed by the BGT below
+	MLA	r2, r12,r2, r14		@ r2 = (add+mul*v)
+	ADD	r2, r11,r2, ASR r5	@ r2 = v = prev+(add+mul*v)>>shiftM
+	AND	r11,r2, r7		@ r11= prev = seqMask & v
+	STR	r2, [r6], #4		@ *v++ = v
+	BGT	dm3_loop		@ loop while dim > 0
+	MOV	r0, #0			@ return 0
+	LDMFD	r13!,{r4-r11,PC}	@ restore saved registers and return
+
+dm3_hword:
+	LDRH	r2, [r4], #2		@ r2 = *ptr++
+	SUBS	r3, r3, #1		@ dim--; flags consumed by the BGT below
+	MLA	r2, r12,r2, r14		@ r2 = (add+mul*v)
+	ADD	r2, r11,r2, ASR r5	@ r2 = v = prev+(add+mul*v)>>shiftM
+	AND	r11,r2, r7		@ r11= prev = seqMask & v
+	STR	r2, [r6], #4		@ *v++ = v
+	BGT	dm3_hword		@ loop while dim > 0
+	MOV	r0, #0			@ return 0
+	LDMFD	r13!,{r4-r11,PC}	@ restore saved registers and return
+
+dm_duff:				@ failure exit shared by the decode_map paths
+	MVN	r0,#0			@ return -1
+	LDMFD	r13!,{r4-r11,PC}	@ restore saved registers and return
+
+vorbis_book_decodevv_add:		@ returns 0, or -1 if decode_map reports failure
+	@ r0 = codebook     *book
+	@ r1 = ogg_int32_t **a
+	@ r2 = long          offset
+	@ r3 = int           ch
+	@ <> = b
+	@ <> = n
+	@ <> = point
+	STMFD	r13!,{r4-r11,R14}	@ 9 words pushed, so stack args start at [r13,#9*4]
+	LDR	r7, [r0, #13*4]		@ r7 = used_entries
+	MOV	r9, r0			@ r9 = book
+	MOV	r10,r1			@ r10= &a[chptr]      chptr=0
+	MOV	r6, r3			@ r6 = ch
+	ADD	r8, r10,r3, LSL #2	@ r8 = &a[ch]
+	MOV	r11,r2			@ r11= offset
+	CMP	r7, #0			@ if (used_entries <= 0)
+	BLE	vbdvva_exit		@     exit
+	LDR	r5, [r13,#10*4]		@ r5 = n
+vbdvva_loop1:
+	@ r5 = n
+	@ r6 = ch
+	@ r8 = &a[ch]
+	@ r9 = book
+	@ r10= &a[chptr]
+	@ r11= offset
+	MOV	r0, r9			@ r0 = book
+	LDR	r1, [r13,# 9*4]		@ r1 = b
+	LDR	r2, [r9, #14*4]		@ r2 = v = dec_buf
+	LDR	r3, [r13,#11*4]		@ r3 = point
+	BL	decode_map		@ r0 = decode_map(book, b, v, point)
+	CMP	r0, #0
+	BNE	vbdvva_fail		@ non-zero => return -1
+
+	LDR	r0, [r9, # 5*4]		@ r0 = book->dim
+	LDR	r1, [r9, #14*4]		@ r1 = v = dec_buf
+vbdvva_loop2:
+	LDR	r2, [r10],#4		@ r2 = a[chptr++]
+	LDR	r12,[r1], #4		@ r12= v[j++]
+	CMP	r10,r8			@ if (chptr == ch)
+	SUBEQ	r10,r10,r6, LSL #2	@    chptr = 0
+	LDR	r14,[r2, r11,LSL #2]!	@ r2 = &a[chptr++][i] r14=[r2]  (LDR leaves the CMP flags intact)
+	ADDEQ	r11,r11,#1		@    i++
+	SUBEQ	r5, r5, #1		@    n--
+	SUBS	r0, r0, #1		@ r0--
+	ADD	r12,r12,r14		@ r12= a[chptr++][i]+ v[j]
+	STR	r12,[r2]		@ a[chptr++][i] += v[j]
+	BGT	vbdvva_loop2		@ loop while dim entries remain
+	CMP	r5,#0
+	BGT	vbdvva_loop1		@ loop while n > 0
+vbdvva_exit:
+	MOV	r0, #0			@ return 0
+	LDMFD	r13!,{r4-r11,PC}	@ restore saved registers and return
+vbdvva_fail:
+	MVN	r0, #0			@ return -1
+	LDMFD	r13!,{r4-r11,PC}	@ restore saved registers and return
+
+_checksum:				@ returns crc_reg accumulated over the ogg_reference chain
+	@ r0 = ogg_reference *or
+	@ r1 = bytes
+	STMFD	r13!,{r5-r6,r14}	@ save r5, r6 and return address
+
+	LDR	r5,=crc_lookup		@ r5 = address of crc_lookup (literal-pool load)
+	MOV	r14,#0			@ r14= crc_reg = 0
+	MOVS	r12,r0			@ r12= or, sets Z when or == 0
+	BEQ	_cs_end			@ or == 0 => return 0
+_cs_loop1:
+	LDMIA	r12,{r0,r2,r3,r12}	@ r0 = or->buffer
+					@ r2 = or->begin
+					@ r3 = or->length
+					@ r12= or->next
+	LDR	r0,[r0]			@ r0 = or->buffer->data
+	CMP	r1,r3			@ r3 = post = (bytes < or->length ?
+	MOVLT	r3,r1			@              bytes : or->length)
+	MOVS	r6,r3			@ r6 = j = post
+	BEQ	_cs_no_bytes		@ post == 0 => nothing to fold in from this reference
+	ADD	r0,r0,r2		@ r0 = or->buffer->data + or->begin
+_cs_loop2:
+	LDRB	r2, [r0],#1		@ r2 = data[j]
+	@ stall
+	@ stall Xscale
+	EOR	r2, r2, r14,LSR #24	@ r2 = (crc_reg>>24)^data[j]
+	LDR	r2, [r5, r2, LSL #2]	@ r2 = crc_lkp[(crc_reg>>24)^data[j]]
+	SUBS	r6, r6, #1		@ j--
+	@ stall Xscale
+	EOR	r14,r2, r14,LSL #8	@ r14= crc_reg = (crc_reg<<8)^r2
+	BGT	_cs_loop2		@ loop while j > 0
+_cs_no_bytes:
+	SUBS	r1, r1, r3		@ bytes -= post
+	CMPNE	r12,#0			@ if bytes remain, test or->next against 0
+	BNE	_cs_loop1		@ continue while bytes != 0 and or->next != 0
+_cs_end:
+	MOV	r0,r14			@ return crc_reg
+	LDMFD	r13!,{r5-r6,PC}		@ restore saved registers and return
+
+	@ END