/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register. This macro will be called from within several different wrapper
 * variants for different data layouts. Y data starts in q8, but with the even
 * and odd bytes split into d16 and d17 respectively. U and V are in d20
 * and d21. Working constants are pre-loaded into q13-q15, and q3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
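/* For reference, the conversion this kernel appears to implement is the usual
 * BT.601 video-range mapping (the standard is not named anywhere in this
 * file, so treat that label as an inference from the constants):
 *
 *      R = 1.164 * (Y - 16)                      + 1.596 * (V - 128)
 *      G = 1.164 * (Y - 16) - 0.391 * (U - 128)  - 0.813 * (V - 128)
 *      B = 1.164 * (Y - 16) + 2.018 * (U - 128)
 *
 * The 8-bit multipliers below are these factors scaled by 128 and split so
 * that every product fits an unsigned 8x8-bit multiply: 149 ~ 1.164 * 128;
 * 204 * v plus (v >> 1), later halved, gives ~1.598 * 128 for red; 50 and
 * 104 give 0.391 * 128 and 0.813 * 128 for green; (u << 2) + 254 * u, later
 * halved, gives ~2.016 * 128 for blue. The constants in q13-q15 fold in the
 * -16 and -128 offsets, and the final vqrshrn shifts (#6 where a halving
 * step already happened, #7 otherwise) divide back down by 128 with rounding
 * and saturation.
 */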
.macro yuvkern
        vmov.i8     d15, #149

        vmull.u8    q1, d16, d15        // g0 = y0 * 149
        vmull.u8    q5, d17, d15        // g1 = y1 * 149

        vmov.i8     d14, #50
        vmov.i8     d15, #104
        vmull.u8    q8, d20, d14        // g2 = u * 50 + v * 104
        vmlal.u8    q8, d21, d15

        vshr.u8     d14, d21, #1
        vaddw.u8    q0, q1, d14         // r0 = y0 * 149 + (v >> 1)
        vaddw.u8    q4, q5, d14         // r1 = y1 * 149 + (v >> 1)

        vshll.u8    q7, d20, #2
        vadd.u16    q2, q1, q7          // b0 = y0 * 149 + (u << 2)
        vadd.u16    q6, q5, q7          // b1 = y1 * 149 + (u << 2)

        vmov.i8     d14, #204
        vmov.i8     d15, #254
        vmull.u8    q11, d21, d14       // r2 = v * 204
        vmull.u8    q12, d20, d15       // b2 = u * 254

        vhadd.u16   q0, q11             // r0 = (r0 + r2) >> 1
        vhadd.u16   q4, q11             // r1 = (r1 + r2) >> 1
        vqadd.u16   q1, q14             // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vqadd.u16   q5, q14             // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vhadd.u16   q2, q12             // b0 = (b0 + b2) >> 1
        vhadd.u16   q6, q12             // b1 = (b1 + b2) >> 1

        vqsub.u16   q0, q13             // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q4, q13             // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q1, q8              // g0 = satu16(g0 - g2)
        vqsub.u16   q5, q8              // g1 = satu16(g1 - g2)
        vqsub.u16   q2, q15             // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vqsub.u16   q6, q15             // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        vqrshrn.u16 d0, q0, #6
        vqrshrn.u16 d1, q1, #7
        vqrshrn.u16 d2, q4, #6
        vqrshrn.u16 d3, q5, #7
        vqrshrn.u16 d4, q2, #6
        vqrshrn.u16 d5, q6, #6

        vzip.u8     q0, q1
        vzip.u8     d4, d5
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop. Some sections of code are switched out depending on the data packing
 * being handled.
 */
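/* As a rough C-style sketch of the control flow implemented below (names are
 * illustrative only; in the code r2 holds the remaining pixel count):
 *
 *      n -= 16;
 *      while (n >= 0) {                    // whole 16-pixel vectors
 *          load Y and U/V; yuvkern; store 16 RGBA pixels; n -= 16;
 *      }
 *      n += 16;
 *      if (n != 0) {                       // 1..15 pixels left over
 *          load the tail in 8/4/2/1-pixel chunks; yuvkern;
 *          store the tail in 8/4/2/1-pixel chunks;
 *      }
 */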
.macro wrap_line kernel, interleaved=0, swapuv=0

        movw        r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vdup.i16    q13, r5
        movw        r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vdup.i16    q14, r5
        movw        r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vdup.i16    q15, r5

        vmov.i8     q3, #0xff

        subs        r2, #16
        bhs         1f
        b           2f

        .align 4
1:      vld2.u8     {d16,d17}, [r1]!
        pld         [r1, #256]
        .if \interleaved
        vld2.u8     {d20,d21}, [r3]!
        .if \swapuv
        vswp        d20, d21
        .endif
        pld         [r3, #256]
        .else
        vld1.u8     d20, [r3]!
        vld1.u8     d21, [r4]!
        pld         [r3, #128]
        pld         [r4, #128]
        .endif

        \kernel

        subs        r2, #16

        vst4.u8     {d0,d2,d4,d6}, [r0]!
        vst4.u8     {d1,d3,d5,d7}, [r0]!

        bhs         1b

2:      adds        r2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers. It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
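        /* Example (illustrative only): a remainder of 13 pixels is handled as
         * an 8-pixel chunk, a 4-pixel chunk and a 1-pixel chunk; the 2-pixel
         * test is skipped because bit 1 of r2 is clear. Each chunk has a fixed
         * home in the working registers, and the store sequence at the end of
         * the macro writes those same positions back out, so the chunks never
         * need to be recombined into one contiguous run.
         */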
        vmov.i8     q8, #0
        vmov.i8     q10, #0

        tst         r2, #8
        beq         1f
        vld1.u8     d17, [r1]!
        .if \interleaved
        vld1.u8     d21, [r3]!
        .else
        vld1.u32    d20[1], [r3]!
        vld1.u32    d21[1], [r4]!
        .endif

1:      tst         r2, #4
        beq         1f
        vld1.u32    d16[1], [r1]!
        .if \interleaved
        vld1.u32    d20[1], [r3]!
        .else
        vld1.u16    d20[1], [r3]!
        vld1.u16    d21[1], [r4]!
        .endif
1:      tst         r2, #2
        beq         1f
        vld1.u16    d16[1], [r1]!
        .if \interleaved
        vld1.u16    d20[1], [r3]!
        .else
        vld1.u8     d20[1], [r3]!
        vld1.u8     d21[1], [r4]!
        .endif
1:      tst         r2, #1
        beq         1f
        vld1.u8     d16[1], [r1]!
        .if \interleaved
        vld1.u16    d20[0], [r3]!
        .else
        vld1.u8     d20[0], [r3]!
        vld1.u8     d21[0], [r4]!
        .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register. So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
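        /* For example, the vuzp.8 below redistributes the linearly-loaded Y
         * bytes so that even-numbered bytes land in d16 and odd-numbered bytes
         * in d17, the same split layout that vld2.u8 produces on the main path
         * and that yuvkern expects.
         */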
1:      vuzp.8      d16, d17
        .if \interleaved
        vuzp.8      d20, d21
        .if \swapuv
        vswp        d20, d21
        .endif
        .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
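        /* The four vzip steps below take the planar result of yuvkern (red in
         * q0, green in q1, blue in q2, constant alpha in q3) and interleave it
         * into R,G,B,A byte order spread across q0-q3, so that the plain vst1
         * stores that follow leave the same memory layout as the vst4 stores
         * on the main path.
         */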
        vzip.8      q0, q2
        vzip.8      q1, q3
        vzip.8      q0, q1
        vzip.8      q2, q3

1:      tst         r2, #8
        beq         1f
        vst1.u8     {d4,d5,d6,d7}, [r0]!

1:      tst         r2, #4
        beq         1f
        vst1.u8     {d2,d3}, [r0]!
1:      tst         r2, #2
        beq         1f
        vst1.u8     d1, [r0]!
1:      tst         r2, #1
        beq         2f
        vst1.u32    d0[1], [r0]!
2:
.endm


/* void rsdIntrinsicYuv2_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uin,    // r2
 *          void const *vin,    // r3
 *          size_t xstart,      // [sp]
 *          size_t xend);       // [sp+#4]
 */
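/* A summary of the setup below (describing only what the instructions do):
 * the arguments are shuffled so that the U and V pointers move to r3 and r4
 * and r2 is free to hold the loop count. Each pointer is then advanced to
 * xstart, i.e. the RGBA output by xstart * 4 bytes, the Y plane by xstart
 * bytes and the half-width U and V planes by xstart / 2 bytes, and r2 becomes
 * xend - xstart, the number of pixels to convert.
 */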
ENTRY(rsdIntrinsicYuv2_K)
        push        {r4,r5}
        ldr         r5, [sp, #8]
        mov         r4, r3
        mov         r3, r2
        ldr         r2, [sp, #12]

        add         r0, r5, LSL #2
        add         r1, r5
        add         r3, r5, LSR #1
        add         r4, r5, LSR #1
        sub         r2, r5

        vpush       {d8-d15}

        wrap_line yuvkern, 0

        vpop        {d8-d15}
        pop         {r4,r5}
        bx          lr
END(rsdIntrinsicYuv2_K)

/* void rsdIntrinsicYuv_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 */
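/* Here xstart is first rounded down to an even pixel (bic r4, r3, #1) so that
 * the chroma offset lands on the start of an interleaved byte pair; the UV
 * pointer then advances by xstart bytes, since the half-width interleaved
 * plane carries one chroma byte per output pixel. wrap_line is invoked with
 * swapuv=1, so the first byte of each pair is treated as V and the second as
 * U, which corresponds to NV21-style ordering (that name is an inference from
 * the register usage, not something stated in this file).
 */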
ENTRY(rsdIntrinsicYuv_K)
        push        {r4,r5}
        bic         r4, r3, #1
        add         r3, r2, r4
        ldr         r2, [sp, #8]

        add         r0, r4, LSL #2
        add         r1, r4
        sub         r2, r4

        vpush       {d8-d15}

        wrap_line yuvkern, 1, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx          lr
END(rsdIntrinsicYuv_K)

/* void rsdIntrinsicYuvR_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 */
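/* Identical setup to rsdIntrinsicYuv_K above, but wrap_line is invoked
 * without swapuv, so the first byte of each interleaved chroma pair is used
 * as U (NV12-style ordering, again inferred from the register usage rather
 * than stated here).
 */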
ENTRY(rsdIntrinsicYuvR_K)
        push        {r4,r5}
        bic         r4, r3, #1
        add         r3, r2, r4
        ldr         r2, [sp, #8]

        add         r0, r4, LSL #2
        add         r1, r4
        sub         r2, r4

        vpush       {d8-d15}

        wrap_line yuvkern, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx          lr
END(rsdIntrinsicYuvR_K)