Blame - src/opts/memset16_neon.S - platform/external/skqp

blob: b47cc226be841b8f1ccdbf09a5115b0dee089cb4 [file] [log] [blame]

agl@chromium.org	aab4090	2010-06-04 14:47:38 +0000	[diff] [blame^]	1	/***************************************************************************
				2	Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
				3
				4	Licensed under the Apache License, Version 2.0 (the "License"); you
				5	may not use this file except in compliance with the License. You may
				6	obtain a copy of the License at
				7
				8	http://www.apache.org/licenses/LICENSE-2.0
				9
				10	Unless required by applicable law or agreed to in writing, software
				11	distributed under the License is distributed on an "AS IS" BASIS,
				12	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
				13	implied. See the License for the specific language governing
				14	permissions and limitations under the License.
				15	***************************************************************************/
				16
				/***************************************************************************
				  memset16_neon: fill a buffer with a repeated 16-bit value, using NEON
				  stores for the bulk of the work once the buffer is large enough.

				  Target: ARM (A32) state with NEON; comments use the C block style
				  already used by this file.

				  Inputs (as consumed by the code below):
				    r0 (s): destination buffer. Assumed to be 2-byte aligned: the
				            alignment step ignores bit 0 of the address.
				    r1 (c): fill value in the low halfword. The upper halfword is
				            OR-ed into the replicated 32-bit pattern, so it must be
				            zero on entry.
				    r2 (n): number of 16-bit elements to write (NOT a byte count).
				  Outputs:
				    r0: restored to its entry value (start of the buffer).
				  Clobbers:
				    r1, r2, r12, d0-d3 (q0, q1), condition flags.
				  Stack:
				    one word (the saved r0), released before returning.
				 ***************************************************************************/

				.code 32
				.fpu neon
				.align 4
				.globl memset16_neon
				.func

memset16_neon:
				/* Empty request: return immediately, r0 untouched. */
				cmp     r2, #0
				bxeq    lr

				/*
				 * r2 arrives as a count of 16-bit items. From here on it holds the
				 * number of BYTES still to be written (always even).
				 */
				lsl     r2, r2, #1

				/* Saved so that r0 can be handed back unchanged on return. */
				push    {r0}

				/*
				 * 8 bytes or fewer: store one halfword per iteration.
				 * Byte counts are compared as unsigned values (bhi/blo) throughout
				 * this routine, so a large count is never mistaken for a negative
				 * one.
				 */
				cmp     r2, #8
				bhi     memset_gt4
memset_smallcopy_loop:
				strh    r1, [r0], #2
				subs    r2, r2, #2
				bne     memset_smallcopy_loop
memset_smallcopy_done:
				pop     {r0}
				bx      lr

memset_gt4:
				/*
				 * More than 8 bytes. Copy the low halfword of r1 into its upper
				 * halfword so that one 32-bit register carries two fill values.
				 */
				orr     r1, r1, r1, lsl #16

				/*
				 * With 64 bytes or more it pays to advance r0 to a 16-byte
				 * boundary first. Skip this when the request is smaller or when
				 * r0 is already 16-byte aligned.
				 */
				cmp     r2, #64
				blo     memset_route
				ands    r12, r0, #0xf
				beq     memset_route

				/*
				 * r12 = bytes needed to reach the 16-byte boundary. For a 2-byte
				 * aligned r0 this is an even value in 2..14, i.e. some combination
				 * of bit 1 (2 bytes), bit 2 (4 bytes) and bit 3 (8 bytes).
				 * r2 is at least 64 here, so the subtraction cannot underflow.
				 */
				rsb     r12, r12, #16
				sub     r2, r2, r12

				/*
				 * Halfword first: if bit 1 is set, r0 is only halfword-aligned,
				 * and this store brings it to a word boundary so that the word
				 * stores below go to word-aligned addresses.
				 */
				tst     r12, #2
				strneh  r1, [r0], #2

				/*
				 * LSL #29 moves bit 2 of r12 into N and bit 3 into C:
				 *   mi -> 4 more bytes, cs -> 8 more bytes (two words).
				 * The stores do not alter the flags, so all three can be
				 * predicated on this single shift.
				 */
				lsls    r12, r12, #29
				strmi   r1, [r0], #4
				strcs   r1, [r0], #4
				strcs   r1, [r0], #4

memset_route:
				/*
				 * Pick the widest store loop the remaining byte count allows.
				 * d0, d1 (= q0) and q1 are filled with the pattern only as far as
				 * the chosen path needs them.
				 * NOTE(review): if the alignment step above was skipped because
				 * fewer than 64 bytes were requested, r0 may be only
				 * halfword-aligned from here on.
				 */
				vdup.u32 d0, r1
				cmp     r2, #16
				blo     memset_8
				vmov    d1, d0
				cmp     r2, #64
				blo     memset_16
				vmov    q1, q0
				cmp     r2, #128
				blo     memset_32

memset_128:
				/* r12 = number of whole 128-byte chunks (at least 1 here). */
				mov     r12, r2, lsr #7
memset_128_loop:
				vst1.64 {q0, q1}, [r0]!
				vst1.64 {q0, q1}, [r0]!
				vst1.64 {q0, q1}, [r0]!
				vst1.64 {q0, q1}, [r0]!
				subs    r12, r12, #1
				bne     memset_128_loop
				/* Keep only the bytes that did not fit a 128-byte chunk. */
				ands    r2, r2, #0x7f
				beq     memset_end

memset_32:
				/* r12 = number of whole 32-byte chunks; may be zero. */
				movs    r12, r2, lsr #5
				beq     memset_16
memset_32_loop:
				subs    r12, r12, #1
				vst1.64 {q0, q1}, [r0]!
				bne     memset_32_loop
				ands    r2, r2, #0x1f
				beq     memset_end

memset_16:
				/* r12 = number of whole 16-byte chunks; may be zero. */
				movs    r12, r2, lsr #4
				beq     memset_8
memset_16_loop:
				subs    r12, r12, #1
				vst1.32 {q0}, [r0]!
				bne     memset_16_loop
				ands    r2, r2, #0xf
				beq     memset_end

				/*
				 * Fewer than 16 bytes remain. memset_8, memset_4 and memset_2 are
				 * not loops: each stores at most once and falls through to the
				 * next smaller size.
				 */
memset_8:
				cmp     r2, #8
				blo     memset_4
				subs    r2, r2, #8
				vst1.32 {d0}, [r0]!
memset_4:
				cmp     r2, #4
				blo     memset_2
				subs    r2, r2, #4
				str     r1, [r0], #4
memset_2:
				/* r2 is 0 or 2 here: one last halfword if anything is left. */
				cmp     r2, #0
				beq     memset_end
				strh    r1, [r0], #2
memset_end:
				/* Hand the original destination pointer back in r0. */
				pop     {r0}
				bx      lr

				.endfunc
				.end