; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013 Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

| 29 | AREA |.text|, CODE, READONLY |
| 30 | |
| 31 | GET celt/arm/armopts.s |
| 32 | |
| 33 | IF OPUS_ARM_MAY_HAVE_EDSP |
| 34 | EXPORT celt_pitch_xcorr_edsp |
| 35 | ENDIF |
| 36 | |
| 37 | IF OPUS_ARM_MAY_HAVE_NEON |
| 38 | EXPORT celt_pitch_xcorr_neon |
| 39 | ENDIF |
| 40 | |
| 41 | IF OPUS_ARM_MAY_HAVE_NEON |
| 42 | |
; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
  ; input:
  ;   r3 = int         len
  ;   r4 = opus_val16 *x
  ;   r5 = opus_val16 *y
  ;   q0 = opus_val32  sum[4]
  ; output:
  ;   q0 = opus_val32  sum[4]
  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
  ; internal usage:
  ;   r12 = int j
  ;   d3  = y_3|y_2|y_1|y_0
  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  ;   q8  = scratch
  ;
  ; Load y[0...3]
  ; This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16      {d5}, [r5]!
  SUBS         r12, r3, #8
  BLE xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
  ; - 2 cycles of ARM instructions,
  ; - 10 cycles of load/store/byte permute instructions, and
  ; - 9 cycles of data processing instructions.
  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  ; latter two categories, meaning the whole loop should run in 10 cycles per
  ; iteration, barring cache misses.
  ;
  ; Load x[0...7]
  VLD1.16      {d6, d7}, [r4]!
  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND         d3, d5, d5
  SUBS         r12, r12, #8
  ; Load y[4...11]
  VLD1.16      {d4, d5}, [r5]!
  VMLAL.S16    q0, d3, d6[0]
  VEXT.16      d16, d3, d4, #1
  VMLAL.S16    q0, d4, d7[0]
  VEXT.16      d17, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d3, d4, #2
  VMLAL.S16    q0, d17, d7[1]
  VEXT.16      d17, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d3, d4, #3
  VMLAL.S16    q0, d17, d7[2]
  VEXT.16      d17, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
  VMLAL.S16    q0, d17, d7[3]
  BGT xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
  ADDS         r12, r12, #4
  BLE xcorr_kernel_neon_process2
  ; Load x[0...3]
  VLD1.16      d6, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #4
  ; Load y[4...7]
  VLD1.16      d5, [r5]!
  VMLAL.S16    q0, d4, d6[0]
  VEXT.16      d16, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
  ADDS         r12, r12, #2
  BLE xcorr_kernel_neon_process1
  ; Load x[0...1]
  VLD2.16      {d6[],d7[]}, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #2
  ; Load y[4...5]
  VLD1.32      {d5[]}, [r5]!
  VMLAL.S16    q0, d4, d6
  VEXT.16      d16, d4, d5, #1
  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
  ; instead of VEXT, since it's a data-processing instruction.
  VSRI.64      d5, d4, #32
  VMLAL.S16    q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
  ; Load next *x
  VLD1.16      {d6[]}, [r4]!
  ADDS         r12, r12, #1
  ; y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16    q0, d5, d6
  MOVLE        pc, lr
; Now process 1 last sample, not reading ahead.
  ; Load last *y
  VLD1.16      {d4[]}, [r5]!
  VSRI.64      d4, d5, #16
  ; Load last *x
  VLD1.16      {d6[]}, [r4]!
  VMLAL.S16    q0, d4, d6
  MOV          pc, lr
  ENDP

; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
;  opus_val32 *xcorr, int len, int max_pitch)
celt_pitch_xcorr_neon PROC
  ; input:
  ;   r0 = opus_val16 *_x
  ;   r1 = opus_val16 *_y
  ;   r2 = opus_val32 *xcorr
  ;   r3 = int         len
  ; output:
  ;   r0 = int         maxcorr
  ; internal usage:
  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
  ;   r6  = int         max_pitch
  ;   r12 = int         j
  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
  STMFD        sp!, {r4-r6, lr}
  ; max_pitch is the 5th argument: on the stack above the 4 saved registers.
  LDR          r6, [sp, #16]
  ; maxcorr starts at 1, matching the C reference implementation.
  VMOV.S32     q15, #1
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  SUBS         r6, r6, #4
  BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
  ; xcorr_kernel_neon parameters:
  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV          r4, r0
  MOV          r5, r1
  VEOR         q0, q0, q0
  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
  ; So we don't save/restore any other registers.
  BL xcorr_kernel_neon
  SUBS         r6, r6, #4
  VST1.32      {q0}, [r2]!
  ; _y += 4
  ADD          r1, r1, #8
  VMAX.S32     q15, q15, q0
  ; if (at least 4 sums remain) goto celt_pitch_xcorr_neon_process4
  BGE celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
  ADDS         r6, r6, #4
  ; Reduce maxcorr to a single value
  VMAX.S32     d30, d30, d31
  VPMAX.S32    d30, d30, d30
  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
  MOV          r4, r0
  MOV          r5, r1
  VMOV.I32     q0, #0
  SUBS         r12, r3, #8
  BLT celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
  ; Load x[0...7]
  VLD1.16      {q1}, [r4]!
  ; Load y[0...7]
  VLD1.16      {q2}, [r5]!
  SUBS         r12, r12, #8
  VMLAL.S16    q0, d4, d2
  VMLAL.S16    q0, d5, d3
  BGE celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
  ADDS         r12, r12, #4
  BLT celt_pitch_xcorr_neon_process_remaining4_done
  ; Load x[0...3]
  VLD1.16      {d2}, [r4]!
  ; Load y[0...3]
  VLD1.16      {d3}, [r5]!
  SUB          r12, r12, #4
  VMLAL.S16    q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
  ; Reduce the sum to a single value.
  VADD.S32     d0, d0, d1
  VPADDL.S32   d0, d0
  ADDS         r12, r12, #4
  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
  VLD1.16      {d2[]}, [r4]!
  VLD1.16      {d3[]}, [r5]!
  SUBS         r12, r12, #1
  VMLAL.S16    q0, d2, d3
  BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
  VST1.32      {d0[0]}, [r2]!
  VMAX.S32     d30, d30, d0
  SUBS         r6, r6, #1
  ; _y++
  ADD          r1, r1, #2
  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
  VMOV.32      r0, d30[0]
  LDMFD        sp!, {r4-r6, pc}
  ENDP

| 253 | ENDIF |
| 254 | |
| 255 | IF OPUS_ARM_MAY_HAVE_EDSP |
| 256 | |
| 257 | ; This will get used on ARMv7 devices without NEON, so it has been optimized |
| 258 | ; to take advantage of dual-issuing where possible. |
xcorr_kernel_edsp PROC
  ; input:
  ;   r3 = int         len
  ;   r4 = opus_val16 *_x (must be 32-bit aligned)
  ;   r5 = opus_val16 *_y (must be 32-bit aligned)
  ;   r6...r9 = opus_val32 sum[4]
  ; output:
  ;   r6...r9 = opus_val32 sum[4]
  ; preserved: r0-r5
  ; internal usage
  ;   r2 = int j
  ;   r12,r14 = opus_val16 x[4]
  ;   r10,r11 = opus_val16 y[4]
  STMFD        sp!, {r2,r4,r5,lr}
  LDR          r10, [r5], #4      ; Load y[0...1]
  SUBS         r2, r3, #4         ; j = len-4
  LDR          r11, [r5], #4      ; Load y[2...3]
  BLE xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      ; Load x[0...1]
  ; Stall
xcorr_kernel_edsp_process4
  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
  ; other. Every other instruction here dual-issues with a multiply, and is
  ; thus "free". There should be no stalls in the body of the loop.
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR          r14, [r4], #4      ; Load x[2...3]
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS         r2, r2, #4         ; j-=4
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR          r10, [r5], #4      ; Load y[4...5]
  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT        r12, [r4], #4      ; Load x[0...1]
  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR          r11, [r5], #4      ; Load y[6...7]
  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
  ADDS         r2, r2, #4
  BLE xcorr_kernel_edsp_done
  LDRH         r12, [r4], #2      ; r12 = *x++
  SUBS         r2, r2, #1         ; j--
  ; Stall
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
  LDRGTH       r14, [r4], #2      ; r14 = *x++
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS         r2, r2, #1         ; j--
  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
  LDRGTH       r12, [r4], #2      ; r12 = *x++
  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
  BLE xcorr_kernel_edsp_done
  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
  CMP          r2, #1             ; j--
  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRGTH       r14, [r4]          ; r14 = *x
  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH         r11, [r5]          ; r11 = y_6 = *y
  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
  LDMFD        sp!, {r2,r4,r5,pc}
  ENDP

celt_pitch_xcorr_edsp PROC
  ; input:
  ;   r0 = opus_val16 *_x (must be 32-bit aligned)
  ;   r1 = opus_val16 *_y (only needs to be 16-bit aligned)
  ;   r2 = opus_val32 *xcorr
  ;   r3 = int         len
  ; output:
  ;   r0 = maxcorr
  ; internal usage
  ;   r4 = opus_val16 *x
  ;   r5 = opus_val16 *y
  ;   r6 = opus_val32  sum0
  ;   r7 = opus_val32  sum1
  ;   r8 = opus_val32  sum2
  ;   r9 = opus_val32  sum3
  ;   r1 = int         max_pitch
  ;   r12 = int        j
  STMFD        sp!, {r4-r11, lr}
  MOV          r5, r1
  ; max_pitch is the 5th argument: on the stack above the 9 saved registers.
  LDR          r1, [sp, #36]
  MOV          r4, r0
  TST          r5, #3
  ; maxcorr = 1
  MOV          r0, #1
  BEQ celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  LDRH         r8, [r5], #2
  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR          r6, [r4], #4
  MOV          r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
  LDR          r9, [r5], #4
  SMLABT       r14, r6, r8, r14   ; sum = MAC16_16(sum, x_0, y_0)
  LDR          r7, [r4], #4
  SMLATB       r14, r6, r9, r14   ; sum = MAC16_16(sum, x_1, y_1)
  LDR          r8, [r5], #4
  SMLABT       r14, r7, r9, r14   ; sum = MAC16_16(sum, x_2, y_2)
  SUBS         r12, r12, #4       ; j-=4
  SMLATB       r14, r7, r8, r14   ; sum = MAC16_16(sum, x_3, y_3)
  LDRGT        r6, [r4], #4
  BGT celt_pitch_xcorr_edsp_process1u_loop4
  MOV          r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
  ADDS         r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
  LDRGEH       r6, [r4], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14   ; sum = MAC16_16(sum, *x, *y)
  SUBGES       r12, r12, #1
  LDRGTH       r8, [r5], #2
  BGT celt_pitch_xcorr_edsp_process1u_loop1
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ADD          r5, r5, #2
  MOVLT        r0, r14
  SUBS         r1, r1, #1
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS         r1, r1, #4
  BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
  ; xcorr_kernel_edsp parameters:
  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV          r6, #0
  MOV          r7, #0
  MOV          r8, #0
  MOV          r9, #0
  BL xcorr_kernel_edsp             ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP          r0, r6
  ; _y+=4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
  MOVLT        r0, r7
  CMP          r0, r8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4
  BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2
  ADDS         r1, r1, #2
  BLT celt_pitch_xcorr_edsp_process1a
  SUBS         r12, r3, #4
  ; {r10, r11} = {sum0, sum1} = {0, 0}
  MOV          r10, #0
  MOV          r11, #0
  LDR          r8, [r5], #4
  BLE celt_pitch_xcorr_edsp_process2_loop_done
  LDR          r6, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
  SMLABB       r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r7, [r4], #4
  SMLABT       r11, r6, r8, r11   ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS         r12, r12, #4       ; j-=4
  SMLATT       r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_1, y_1)
  LDR          r8, [r5], #4
  SMLATB       r11, r6, r9, r11   ; sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT        r6, [r4], #4
  SMLABB       r10, r7, r9, r10   ; sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT       r11, r7, r9, r11   ; sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT       r10, r7, r9, r10   ; sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT        r9, [r5], #4
  SMLATB       r11, r7, r8, r11   ; sum1 = MAC16_16(sum1, x_3, y_4)
  BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
  ADDS         r12, r12, #2
  BLE celt_pitch_xcorr_edsp_process2_1
  LDR          r6, [r4], #4
  ; Stall
  SMLABB       r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r9, [r5], #4
  SMLABT       r11, r6, r8, r11   ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUB          r12, r12, #2
  SMLATT       r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_1, y_1)
  MOV          r8, r9
  SMLATB       r11, r6, r9, r11   ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
  LDRH         r6, [r4], #2
  ADDS         r12, r12, #1
  ; Stall
  SMLABB       r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDRGTH       r7, [r4], #2
  SMLABT       r11, r6, r8, r11   ; sum1 = MAC16_16(sum1, x_0, y_1)
  BLE celt_pitch_xcorr_edsp_process2_done
  LDRH         r9, [r5], #2
  SMLABT       r10, r7, r8, r10   ; sum0 = MAC16_16(sum0, x_0, y_1)
  SMLABB       r11, r7, r9, r11   ; sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum0)
  CMP          r0, r10
  ADD          r5, r5, #2
  MOVLT        r0, r10
  SUB          r1, r1, #2
  ; maxcorr = max(maxcorr, sum1)
  CMP          r0, r11
  ; xcorr[i] = sum
  STR          r10, [r2], #4
  MOVLT        r0, r11
  STR          r11, [r2], #4
celt_pitch_xcorr_edsp_process1a
  ADDS         r1, r1, #1
  BLT celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  BLT celt_pitch_xcorr_edsp_process1a_loop_done
  LDR          r6, [r4], #4
  LDR          r8, [r5], #4
  LDR          r7, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB       r14, r6, r8, r14   ; sum = MAC16_16(sum, x_0, y_0)
  SUBS         r12, r12, #4       ; j-=4
  SMLATT       r14, r6, r8, r14   ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14   ; sum = MAC16_16(sum, x_2, y_2)
  LDRGE        r8, [r5], #4
  SMLATT      r14, r7, r9, r14    ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
  BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
  ADDS         r12, r12, #2
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
  ; Stall
  SMLABBGE     r14, r6, r8, r14   ; sum = MAC16_16(sum, x_0, y_0)
  SUBGE        r12, r12, #2
  SMLATTGE     r14, r6, r8, r14   ; sum = MAC16_16(sum, x_1, y_1)
  ADDS         r12, r12, #1
  LDRGEH       r6, [r4], #2
  LDRGEH       r8, [r5], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14   ; sum = MAC16_16(sum, *x, *y)
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  MOVLT        r0, r14
celt_pitch_xcorr_edsp_done
  LDMFD        sp!, {r4-r11, pc}
  ENDP

| 543 | ENDIF |
| 544 | |
| 545 | END |