/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This is a fast and accurate implementation of the inverse Discrete
 * Cosine Transform (IDCT) for ARMv6+. It also performs dequantization
 * of the input coefficients, just like the other IDCT methods.
 *
 * This implementation is based on the scaled 1-D DCT algorithm proposed
 * by Arai, Agui, and Nakajima. The code follows Figure 4-8 on page 52
 * of the JPEG textbook by Pennebaker and Mitchell, and the coefficients
 * are (almost) directly mapped into registers.
 *
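 * For reference, the rotation constants of that flow graph appear below
 * in 16-bit fixed point: 27146 = (sqrt(2) - 1) * 65536, 15137 =
 * 1.84776 / 8 * 65536, -21407 = -2.61313 / 8 * 65536, and -8867 =
 * -1.08239 / 8 * 65536.
 *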
 * The accuracy is achieved by using the SMULWy and SMLAWy instructions,
 * which multiply a 32-bit value by a 16-bit value and keep the top 32
 * bits of the 48-bit result. This makes 32-bit fixed-point arithmetic
 * possible without overflow, and it is also why jpeg_idct_ifast(),
 * which is written in C, cannot be improved to match.
 *
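 * As a C sketch (not part of this file), SMULWB behaves like
 *
 *   int32_t smulwb(int32_t rn, int32_t rm) {
 *     return (int32_t) (((int64_t) rn * (int16_t) rm) >> 16);
 *   }
 *
 * and SMLAWy adds an accumulator on top. So with 27146 in the top
 * halfword of r10, "smlawt r2, r2, r10, r2" computes
 * r2 * 27146 / 65536 + r2, i.e. r2 * 1.41421 to within rounding.
 *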
 * More tricks are used to gain speed. First of all, we use as many
 * registers as possible. An ARM processor has 16 registers, including
 * sp (r13) and pc (r15), so only 14 registers can be used without
 * restrictions. In general, r0 to r7 hold the coefficients; r10 and r11
 * hold four 16-bit constants; r12 and r14 hold two of the four
 * arguments; and r8 holds intermediate values. In the second pass, r9
 * is the loop counter. In the first pass, r8 to r11 are used to hold
 * quantization values, so the loop counter is held by sp. Yes, the
 * stack pointer. Since it must stay aligned to a 4-byte boundary at all
 * times, we align it to a 32-byte boundary and count in bits 2 to 4.
 * As a result, we actually use 14.1 registers. :-)
 *
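 * To see the trick at work: pass 1 stores each column of results with
 * "str r0, [sp, #-4]!", so sp walks down by 4 per column, and
 * "ands r9, sp, #31" stays nonzero until sp reaches the next 32-byte
 * boundary, which happens after exactly eight columns.
 *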
 * Second, we rearrange the quantization values so they can be accessed
 * sequentially. The table is first transposed, and the new columns are
 * placed in the order 7, 5, 1, 3, 0, 2, 4, 6, so LDMDB can load four
 * values at a time. Rearranging the coefficients would also help, but
 * that requires changing a dozen files, which hardly seems worth it.
 * In addition, we scale the quantization values up by 13 bits, so the
 * coefficients are scaled up by 16 bits after both passes. We can then
 * pack and saturate them two at a time using the PKHTB and USAT16
 * instructions.
 *
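 * As a C sketch of the layout this code expects (the function name and
 * prototype are illustrative; the actual caller lives elsewhere), each
 * column becomes a block of eight pre-scaled words in the order above:
 *
 *   static const int order[8] = {7, 5, 1, 3, 0, 2, 4, 6};
 *
 *   void rearrange_quans(int out[64], const unsigned short in[64]) {
 *     for (int col = 0; col < 8; ++col)      // one block per column
 *       for (int i = 0; i < 8; ++i)          // rows in the order above
 *         out[col * 8 + i] = in[order[i] * 8 + col] << 13;
 *   }
 *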
 * Third, we reorder the instructions to avoid pipeline bubbles. This is
 * done by hand, according to the cycle timings and the interlock
 * behavior described in the ARM1136JF-S technical reference manual. We
 * also take advantage of dual-issue processors by interleaving
 * instructions with dependencies. The code has been benchmarked on four
 * devices, and all results showed clear improvements. Note that PLD
 * instructions actually slowed things down, so they were removed at the
 * last minute. In the future, this might be further improved with a
 * system profiler.
 */

// void armv6_idct(short *coefs, int *quans, unsigned char *rows, int col)
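// Note: judging from the code below, "rows" actually holds eight output
// row pointers (one word is fetched per row in the second pass), and
// "col" is a byte offset added to each row pointer before storing.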
    .arm
    .text
    .align
    .global armv6_idct
    .func armv6_idct

armv6_idct:
    // Push everything except sp (r13) and pc (r15).
    stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}

    // Reserve a 32-byte aligned frame, saving rows, col, and the
    // unaligned frame base above it; r12 = quans + 256, r14 = coefs + 16.
    sub r4, sp, #236
    bic sp, r4, #31
    add r5, sp, #224
    add r12, r1, #256
    stm r5, {r2, r3, r4}
    add r14, r0, #16

pass1_head:
    // Load quantization values. (q[0, 2, 4, 6])
    ldmdb r12!, {r8, r9, r10, r11}

    // Load coefficients. (c[4, 1, 2, 3, 0, 5, 6, 7])
    ldrsh r4, [r14, #-2]!
    ldrsh r1, [r14, #16]
    ldrsh r2, [r14, #32]
    ldrsh r3, [r14, #48]
    ldrsh r0, [r14, #64]
    ldrsh r5, [r14, #80]
    ldrsh r6, [r14, #96]
    ldrsh r7, [r14, #112]

    // r4 = q[0] * c[0];
    mul r4, r8, r4

    // Check if ACs are all zero.
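    // (Each ORREQS executes only while Z is still set, so one nonzero
    // value skips the rest of the chain and falls through the BEQ.)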
    cmp r0, #0
    orreqs r8, r1, r2
    orreqs r8, r3, r5
    orreqs r8, r6, r7
    beq pass1_zero

    // Step 1: Dequantizations.

    // r2 = q[2] * c[2];
    // r0 = q[4] * c[4] + r4;
    // r6 = q[6] * c[6] + r2;
    mul r2, r9, r2
    mla r0, r10, r0, r4
    mla r6, r11, r6, r2

    // Load quantization values. (q[7, 5, 1, 3])
    ldmdb r12!, {r8, r9, r10, r11}

    // r4 = r4 * 2 - r0 = -(r0 - r4 * 2);
    // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
    rsb r4, r0, r4, lsl #1
    rsb r2, r6, r2, lsl #1

    // r7 = q[7] * c[7];
    // r5 = q[5] * c[5];
    // r1 = q[1] * c[1] + r7;
    // r3 = q[3] * c[3] + r5;
    mul r7, r8, r7
    mul r5, r9, r5
    mla r1, r10, r1, r7
    mla r3, r11, r3, r5

    // Load constants.
    ldrd r10, constants

    // Step 2: Rotations and Butterflies.

    // r7 = r1 - r7 * 2;
    // r1 = r1 - r3;
    // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
    // r3 = r1 + r3 * 2;
    // r8 = r5 + r7;
    sub r7, r1, r7, lsl #1
    sub r1, r1, r3
    rsb r5, r3, r5, lsl #1
    add r3, r1, r3, lsl #1
    add r8, r5, r7

    // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
    // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
    // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
    smlawt r2, r2, r10, r2
    smulwb r8, r8, r10
    smlawt r1, r1, r10, r1

    // r0 = r0 + r6;
    // r2 = r2 - r6;
    // r6 = r0 - r6 * 2;
    add r0, r0, r6
    sub r2, r2, r6
    sub r6, r0, r6, lsl #1

    // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
    // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
    smlawt r5, r5, r11, r8
    smlawb r8, r7, r11, r8

    // r4 = r4 + r2;
    // r0 = r0 + r3;
    // r2 = r4 - r2 * 2;
    add r4, r4, r2
    add r0, r0, r3
    sub r2, r4, r2, lsl #1

    // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
    // r3 = r0 - r3 * 2;
    // r1 = r1 - r7;
    // r4 = r4 + r7;
    // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
    // r7 = r4 - r7 * 2;
    rsb r7, r3, r5, lsl #3
    sub r3, r0, r3, lsl #1
    sub r1, r1, r7
    add r4, r4, r7
    rsb r5, r1, r8, lsl #3
    sub r7, r4, r7, lsl #1

    // r2 = r2 + r1;
    // r6 = r6 + r5;
    // r1 = r2 - r1 * 2;
    // r5 = r6 - r5 * 2;
    add r2, r2, r1
    add r6, r6, r5
    sub r1, r2, r1, lsl #1
    sub r5, r6, r5, lsl #1

    // Step 3: Reorder and Save.

    str r0, [sp, #-4]!
    str r4, [sp, #32]
    str r2, [sp, #64]
    str r6, [sp, #96]
    str r5, [sp, #128]
    str r1, [sp, #160]
    str r7, [sp, #192]
    str r3, [sp, #224]
    b pass1_tail

    // Precomputed 16-bit constants: 27146, 15137, -21407, -8867.
    // They sit in the middle of the code because LDRD only accepts
    // offsets from -255 to 255.
    .align 3
constants:
    .word 0x6a0a3b21  // 27146 in the top halfword, 15137 in the bottom
    .word 0xac61dd5d  // -21407 in the top halfword, -8867 in the bottom

pass1_zero:
    // All ACs are zero: every output equals the scaled DC in r4.
    str r4, [sp, #-4]!
    str r4, [sp, #32]
    str r4, [sp, #64]
    str r4, [sp, #96]
    str r4, [sp, #128]
    str r4, [sp, #160]
    str r4, [sp, #192]
    str r4, [sp, #224]
    // Skip the second block of quantization values, which was not read.
    sub r12, r12, #16

pass1_tail:
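    // sp has moved down 4 bytes per column; loop until it reaches the
    // next 32-byte boundary, i.e. after eight columns.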
    ands r9, sp, #31
    bne pass1_head

    // r12 = rows, r14 = col.
    ldr r12, [sp, #256]
    ldr r14, [sp, #260]

    // Load constants.
    ldrd r10, constants

pass2_head:
    // Load coefficients. (c[0, 1, 2, 3, 4, 5, 6, 7])
    ldmia sp!, {r0, r1, r2, r3, r4, r5, r6, r7}

    // r0 = r0 + 0x00808000; (128 level shift plus rounding in 16.16
    // format; the immediate cannot be encoded in a single ADD)
    add r0, r0, #0x00800000
    add r0, r0, #0x00008000

    // Step 1: Analogous to the first pass.

    // r0 = r0 + r4;
    // r6 = r6 + r2;
    add r0, r0, r4
    add r6, r6, r2

    // r4 = r0 - r4 * 2;
    // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
    sub r4, r0, r4, lsl #1
    rsb r2, r6, r2, lsl #1

    // r1 = r1 + r7;
    // r3 = r3 + r5;
    add r1, r1, r7
    add r3, r3, r5

    // Step 2: Rotations and Butterflies.

    // r7 = r1 - r7 * 2;
    // r1 = r1 - r3;
    // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
    // r3 = r1 + r3 * 2;
    // r8 = r5 + r7;
    sub r7, r1, r7, lsl #1
    sub r1, r1, r3
    rsb r5, r3, r5, lsl #1
    add r3, r1, r3, lsl #1
    add r8, r5, r7

    // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
    // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
    // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
    smlawt r2, r2, r10, r2
    smulwb r8, r8, r10
    smlawt r1, r1, r10, r1

    // r0 = r0 + r6;
    // r2 = r2 - r6;
    // r6 = r0 - r6 * 2;
    add r0, r0, r6
    sub r2, r2, r6
    sub r6, r0, r6, lsl #1

    // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
    // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
    smlawt r5, r5, r11, r8
    smlawb r8, r7, r11, r8

    // r4 = r4 + r2;
    // r0 = r0 + r3;
    // r2 = r4 - r2 * 2;
    add r4, r4, r2
    add r0, r0, r3
    sub r2, r4, r2, lsl #1

    // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
    // r3 = r0 - r3 * 2;
    // r1 = r1 - r7;
    // r4 = r4 + r7;
    // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
    // r7 = r4 - r7 * 2;
    rsb r7, r3, r5, lsl #3
    sub r3, r0, r3, lsl #1
    sub r1, r1, r7
    add r4, r4, r7
    rsb r5, r1, r8, lsl #3
    sub r7, r4, r7, lsl #1

    // r2 = r2 + r1;
    // r6 = r6 + r5;
    // r1 = r2 - r1 * 2;
    // r5 = r6 - r5 * 2;
    add r2, r2, r1
    add r6, r6, r5
    sub r1, r2, r1, lsl #1
    sub r5, r6, r5, lsl #1

    // Step 3: Reorder and Save.

    // Load output pointer.
    ldr r8, [r12], #4

    // For little endian: r6, r2, r4, r0, r3, r7, r1, r5.
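    // PKHTB keeps the top halfword of its first operand and takes the
    // bottom halfword from its shifted second operand; USAT16 then
    // saturates both halfwords to the range 0 to 255.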
    pkhtb r6, r6, r4, asr #16
    pkhtb r2, r2, r0, asr #16
    pkhtb r3, r3, r1, asr #16
    pkhtb r7, r7, r5, asr #16
    usat16 r6, #8, r6
    usat16 r2, #8, r2
    usat16 r3, #8, r3
    usat16 r7, #8, r7
    orr r0, r2, r6, lsl #8
    orr r1, r7, r3, lsl #8

#ifdef __ARMEB__
    // Reverse bytes for big endian.
    rev r0, r0
    rev r1, r1
#endif

    // Use STR instead of STRD to support unaligned access.
    str r0, [r8, r14]!
    str r1, [r8, #4]

pass2_tail:
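    // r9 is zero here (set by the final ANDS above); eight additions of
    // 0x10000000 turn it negative, ending the loop after eight rows.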
    adds r9, r9, #0x10000000
    bpl pass2_head

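    // Reload the frame base saved in the prologue (the original sp
    // minus 236) and undo the subtraction to restore sp.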
    ldr sp, [sp, #8]
    add sp, sp, #236

    ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
    bx lr
    .endfunc