Blame - src/opts/memset16_neon.S - platform/external/skqp

blob: b1719fa1fa4de15ca31ae98138e0243fcb18f5f2 [file] [log] [blame]

agl@chromium.org	aab4090	2010-06-04 14:47:38 +0000	[diff] [blame]	1	/***************************************************************************
epoger@google.com	fd03db0	2011-07-28 14:24:55 +0000	[diff] [blame]	2	* Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the LICENSE file.
agl@chromium.org	aab4090	2010-06-04 14:47:38 +0000	[diff] [blame]	6	***************************************************************************/
				7
				8	/***************************************************************************
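				9	NEON memset16: fills a buffer with a repeated 16-bit value, using NEON registers where possible.
				10	Inputs:
				11	s: The buffer to write to
				12	c: The value to write to the buffer (only the low 16 bits are stored)
				13	n: The size_t count of 16-bit items to write (not a byte count).
				14	Outputs:
				15	None; r0 still holds s when the routine returns.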
				16	***************************************************************************/
				17
				18	.code 32
				19	.fpu neon
				20	.align 4
				21	.globl memset16_neon
				22	.func
				23
				24	memset16_neon:
				25	cmp r2, #0
				26	bxeq lr
				27
				28	/* Keep in mind that r2 -- the count argument -- is for the
				29	* number of 16-bit items to copy.
				30	*/
				31	lsl r2, r2, #1
				32
				33	push {r0}
				34
				35	/* If we have < 8 bytes, just do a quick loop to handle that */
				36	cmp r2, #8
				37	bgt memset_gt4
				38	memset_smallcopy_loop:
				39	strh r1, [r0], #2
				40	subs r2, r2, #2
				41	bne memset_smallcopy_loop
				42	memset_smallcopy_done:
				43	pop {r0}
				44	bx lr
				45
				46	memset_gt4:
				47	/*
				48	* Duplicate the r1 lowest 16-bits across r1. The idea is to have
				49	* a register with two 16-bit-values we can copy. We do this by
				50	* duplicating lowest 16-bits of r1 to upper 16-bits.
				51	*/
				52	orr r1, r1, r1, lsl #16
				53	/*
				54	* If we're copying > 64 bytes, then we may want to get
				55	* onto a 16-byte boundary to improve speed even more.
				56	*/
				57	cmp r2, #64
				58	blt memset_route
				59	ands r12, r0, #0xf
				60	beq memset_route
				61	/*
				62	* Determine the number of bytes to move forward to get to the 16-byte
				63	* boundary. Note that this will be a multiple of 4, since we
				64	* already are word-aligned.
				65	*/
				66	rsb r12, r12, #16
				67	sub r2, r2, r12
				68	lsls r12, r12, #29
				69	strmi r1, [r0], #4
				70	strcs r1, [r0], #4
				71	strcs r1, [r0], #4
				72	lsls r12, r12, #2
				73	strcsh r1, [r0], #2
				74	memset_route:
				75	/*
				76	* Decide where to route for the maximum copy sizes. Note that we
				77	* build q0 and q1 depending on if we'll need it, so that's
				78	* interwoven here as well.
				79	*/
				80	vdup.u32 d0, r1
				81	cmp r2, #16
				82	blt memset_8
				83	vmov d1, d0
				84	cmp r2, #64
				85	blt memset_16
				86	vmov q1, q0
				87	cmp r2, #128
				88	blt memset_32
				89	memset_128:
				90	mov r12, r2, lsr #7
				91	memset_128_loop:
				92	vst1.64 {q0, q1}, [r0]!
				93	vst1.64 {q0, q1}, [r0]!
				94	vst1.64 {q0, q1}, [r0]!
				95	vst1.64 {q0, q1}, [r0]!
				96	subs r12, r12, #1
				97	bne memset_128_loop
				98	ands r2, r2, #0x7f
				99	beq memset_end
				100	memset_32:
				101	movs r12, r2, lsr #5
				102	beq memset_16
				103	memset_32_loop:
				104	subs r12, r12, #1
				105	vst1.64 {q0, q1}, [r0]!
				106	bne memset_32_loop
				107	ands r2, r2, #0x1f
				108	beq memset_end
				109	memset_16:
				110	movs r12, r2, lsr #4
				111	beq memset_8
				112	memset_16_loop:
				113	subs r12, r12, #1
				114	vst1.32 {q0}, [r0]!
				115	bne memset_16_loop
				116	ands r2, r2, #0xf
				117	beq memset_end
				118	/*
				119	* memset_8 isn't a loop, since we try to do our loops at 16
				120	* bytes and above. We should loop there, then drop down here
				121	* to finish the <16-byte versions. Same for memset_4 and
				122	* memset_1.
				123	*/
				124	memset_8:
				125	cmp r2, #8
				126	blt memset_4
				127	subs r2, r2, #8
				128	vst1.32 {d0}, [r0]!
				129	memset_4:
				130	cmp r2, #4
				131	blt memset_2
				132	subs r2, r2, #4
				133	str r1, [r0], #4
				134	memset_2:
				135	cmp r2, #0
				136	ble memset_end
				137	strh r1, [r0], #2
				138	memset_end:
				139	pop {r0}
				140	bx lr
				141
				142	.endfunc
				143	.end