Blame - cpu_ref/rsCpuIntrinsics_advsimd_Convolve.S - platform/frameworks/rs

blob: 0daa0c5a9d52afc5bd78f70f86108f143c94e814 [file] [log] [blame]

Simon Hosie	4e5c414	2014-03-15 21:45:49 -0700	[diff] [blame]	1	/*
				2	* Copyright (C) 2012,2014 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	/*
				18	x0 = dst
				19	x1 = y0 base pointer
				20	x2 = y1 base pointer
				21	x3 = y2 base pointer
				22	x4 = coeffs
				23	x5 = length / 2
				24	*/
				25
				26	#define ENTRY(f) .text; .align 2; .globl f; .type f,#function; f:
				27	#define END(f) .size f, .-f;
				28
				29	ENTRY(rsdIntrinsicConvolve3x3_K)
				30	sub x6, sp, #64
				31	sub sp, sp, #64
				32	st1 {v8.1d-v11.1d}, [x6], #32
				33	st1 {v12.1d-v15.1d}, [x6]
				34
				35	/* Load the coefficients in the v0, v1 registers */
				36	ld1 {v0.8h, v1.8h}, [x4]
				37
				38	/* Load the frequently used immediate in a register */
				39	mov x4, #8
				40
				41	1:
				42	/* Load and post-increase the address by x4=#8 */
				43	ld1 {v13.16b}, [x1], x4
				44	ld1 {v14.16b}, [x2], x4
				45	ld1 {v15.16b}, [x3], x4
				46
				47	/* Signal memory for data that will be used in the loop after the next */
				48	// prfm PLDL1KEEP,[x1, x4] // TODO: test this
				49	// prfm PLDL1KEEP,[x2, x4] // TODO: test this
				50	// prfm PLDL1KEEP,[x3, x4] // TODO: test this
				51
				52	uxtl v2.8h, v13.8b
				53	uxtl2 v3.8h, v13.16b
				54	uxtl v4.8h, v14.8b
				55	uxtl2 v5.8h, v14.16b
				56	uxtl v6.8h, v15.8b
				57	uxtl2 v7.8h, v15.16b
				58
				59	/*
				60	The two pixel source array is
				61	v2, v2hi, v3lo, v3hi
				62	v4, v4hi, v5lo, v5hi
				63	v6, v6hi, v7lo, v7hi
				64	*/
				65
				66	smull v8.4s, v2.4h, v0.h[0]
				67	smull2 v9.4s, v2.8h, v0.h[0]
				68	smlal2 v8.4s, v2.8h, v0.h[1]
				69	smlal v9.4s, v3.4h, v0.h[1]
				70	smlal v8.4s, v3.4h, v0.h[2]
				71	smlal2 v9.4s, v3.8h, v0.h[2]
				72	smlal v8.4s, v4.4h, v0.h[3]
				73	smlal2 v9.4s, v4.8h, v0.h[3]
				74	smlal2 v8.4s, v4.8h, v0.h[4]
				75	smlal v9.4s, v5.4h, v0.h[4]
				76	smlal v8.4s, v5.4h, v0.h[5]
				77	smlal2 v9.4s, v5.8h, v0.h[5]
				78	smlal v8.4s, v6.4h, v0.h[6]
				79	smlal2 v9.4s, v6.8h, v0.h[6]
				80	smlal2 v8.4s, v6.8h, v0.h[7]
				81	smlal v9.4s, v7.4h, v0.h[7]
				82	smlal v8.4s, v7.4h, v1.h[0]
				83	smlal2 v9.4s, v7.8h, v1.h[0]
				84
				85	shrn v8.4h, v8.4s, #8
				86	shrn2 v8.8h, v9.4s, #8
				87
				88	sqxtun v8.8b, v8.8h
				89	st1 {v8.8b}, [x0], #8
				90
				91	/* Are we done yet? */
				92	subs x5, x5, #1
				93	bne 1b
				94
				95	/* We're done, bye! */
				96	ld1 {v8.1d-v11.1d}, [sp], #32
				97	ld1 {v12.1d-v15.1d}, [sp], #32
				98	ret
				99	END(rsdIntrinsicConvolve3x3_K)
				100
				101
				102	/* Convolve 5x5 */
				103
				104	/*
				105	x0 = dst
				106	x1 = y0 base pointer
				107	x2 = y1 base pointer
				108	x3 = y2 base pointer
				109	x4 = y3 base pointer
				110	x5 = y4 base pointer
				111	x6 = coeffs
				112	x7 = length
				113	*/
				114	ENTRY(rsdIntrinsicConvolve5x5_K)
				115	sub x8, sp, #64
				116	sub sp, sp, #64
				117	st1 {v8.1d-v11.1d}, [x8], #32
				118	st1 {v12.1d-v15.1d}, [x8]
				119
				120	/* Create the coefficients vector */
				121	ld1 {v0.8h-v2.8h}, [x6], #48
				122	ld1 {v3.4h}, [x6], #8
				123
				124	movi v15.4s, #0x7f
				125
				126	/* Load the frequently used immediate in a register */
				127	mov x6, #8
				128
				129	1:
				130	/* Load the y base pointers in Qregs and post-increase the address by x6=#8 */
				131	ld1 {v9.8b-v11.8b}, [x1], x6 // y0 ( y - 2 )
				132	ld1 {v12.8b-v14.8b}, [x2], x6 // y0 ( y - 1 )
				133
				134	/* Signal memory for data that will be used in the loop after the next */
				135	// prfm PLDL1KEEP,[x1, x6] // TODO: test this
				136	// prfm PLDL1KEEP,[x2, x6] // TODO: test this
				137
				138	/* Promoting the 8bit channels to 16bit */
				139	uxtl v9.8h, v9.8b
				140	uxtl v10.8h, v10.8b
				141	uxtl v11.8h, v11.8b
				142	uxtl v12.8h, v12.8b
				143	uxtl v13.8h, v13.8b
				144	uxtl v14.8h, v14.8b
				145
				146	/*
				147	v9, v9hi, v10lo, v10hi, v11lo, v11hi,
				148	v12, v12hi
				149	*/
				150	smull v4.4s, v9.4h, v0.h[0]
				151	smull2 v5.4s, v9.8h, v0.h[0]
				152	smlal2 v4.4s, v9.8h, v0.h[1]
				153	smlal v5.4s, v10.4h, v0.h[1]
				154	smlal v4.4s, v10.4h, v0.h[2]
				155	smlal2 v5.4s, v10.8h, v0.h[2]
				156	smlal2 v4.4s, v10.8h, v0.h[3]
				157	smlal v5.4s, v11.4h, v0.h[3]
				158	smlal v4.4s, v11.4h, v0.h[4]
				159	smlal2 v5.4s, v11.8h, v0.h[4]
				160
				161	smlal v4.4s, v12.4h, v0.h[5]
				162	smlal2 v5.4s, v12.8h, v0.h[5]
				163	smlal2 v4.4s, v12.8h, v0.h[6]
				164	smlal v5.4s, v13.4h, v0.h[6]
				165	smlal v4.4s, v13.4h, v0.h[7]
				166	smlal2 v5.4s, v13.8h, v0.h[7]
				167	smlal2 v4.4s, v13.8h, v1.h[0]
				168	smlal v5.4s, v14.4h, v1.h[0]
				169	smlal v4.4s, v14.4h, v1.h[1]
				170	smlal2 v5.4s, v14.8h, v1.h[1]
				171
				172	/* Next 2 rows */
				173	/* Load the y base pointers in Qregs and post-increase the address by x6=#8 */
				174	ld1 {v9.8b-v11.8b}, [x3], x6 // y0 ( y )
				175	ld1 {v12.8b-v14.8b}, [x4], x6 // y0 ( y + 1 )
				176
				177	/* Signal memory for data that will be used in the loop after the next */
				178	// prfm PLDL1KEEP,[x3, x6] // TODO: test this
				179	// prfm PLDL1KEEP,[x4, x6] // TODO: test this
				180
				181	/* Promoting the 8bit channels to 16bit */
				182	uxtl v9.8h, v9.8b
				183	uxtl v10.8h, v10.8b
				184	uxtl v11.8h, v11.8b
				185	uxtl v12.8h, v12.8b
				186	uxtl v13.8h, v13.8b
				187	uxtl v14.8h, v14.8b
				188
				189	/*
				190	v9, v9hi, v10lo, v10hi, v11lo, v11hi,
				191	v12, v12hi
				192	*/
				193	smlal v4.4s, v9.4h, v1.h[2]
				194	smlal2 v5.4s, v9.8h, v1.h[2]
				195	smlal2 v4.4s, v9.8h, v1.h[3]
				196	smlal v5.4s, v10.4h, v1.h[3]
				197	smlal v4.4s, v10.4h, v1.h[4]
				198	smlal2 v5.4s, v10.8h, v1.h[4]
				199	smlal2 v4.4s, v10.8h, v1.h[5]
				200	smlal v5.4s, v11.4h, v1.h[5]
				201	smlal v4.4s, v11.4h, v1.h[6]
				202	smlal2 v5.4s, v11.8h, v1.h[6]
				203
				204	smlal v4.4s, v12.4h, v1.h[7]
				205	smlal2 v5.4s, v12.8h, v1.h[7]
				206	smlal2 v4.4s, v12.8h, v2.h[0]
				207	smlal v5.4s, v13.4h, v2.h[0]
				208	smlal v4.4s, v13.4h, v2.h[1]
				209	smlal2 v5.4s, v13.8h, v2.h[1]
				210	smlal2 v4.4s, v13.8h, v2.h[2]
				211	smlal v5.4s, v14.4h, v2.h[2]
				212	smlal v4.4s, v14.4h, v2.h[3]
				213	smlal2 v5.4s, v14.8h, v2.h[3]
				214
				215	/* Last row */
				216	/* Load the y base pointers in Qregs and post-increase the address by x6=#8 */
				217	ld1 {v9.8b- v11.8b}, [x5], x6 // y0 ( y + 2 )
				218
				219	/* Signal memory for data that will be used in the loop after the next */
				220	// prfm PLDL1KEEP,[x5, x6] // TODO: test this
				221
				222	/* Promoting the 8bit channels to 16bit */
				223	uxtl v9.8h, v9.8b
				224	uxtl v10.8h, v10.8b
				225	uxtl v11.8h, v11.8b
				226
				227	/*
				228	v9, v9hi, v10lo, v10hi, v11lo, v11hi,
				229	v12, v12hi
				230	*/
				231
				232	smlal v4.4s, v9.4h, v2.h[4]
				233	smlal2 v5.4s, v9.8h, v2.h[4]
				234	smlal2 v4.4s, v9.8h, v2.h[5]
				235	smlal v5.4s, v10.4h, v2.h[5]
				236	smlal v4.4s, v10.4h, v2.h[6]
				237	smlal2 v5.4s, v10.8h, v2.h[6]
				238	smlal2 v4.4s, v10.8h, v2.h[7]
				239	smlal v5.4s, v11.4h, v2.h[7]
				240	smlal v4.4s, v11.4h, v3.h[0]
				241	smlal2 v5.4s, v11.8h, v3.h[0]
				242
				243	add v4.4s, v4.4s, v15.4s
				244	add v5.4s, v5.4s, v15.4s
				245
				246	/* Narrow it to a d-reg 32 -> 16 bit */
				247	rshrn v4.4h, v4.4s, #8
				248	rshrn2 v4.8h, v5.4s, #8
				249
				250
				251	/* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
				252	sqxtun v4.8b, v4.8h
				253
				254	st1 {v4.8b}, [x0], #8 // return the output and increase the address of x0
				255
				256	/* Are we done? */
				257	subs x7, x7, #1
				258	bne 1b
				259
				260	/* Yup, bye */
				261	ld1 {v8.1d-v11.1d}, [sp], #32
				262	ld1 {v12.1d-v15.1d}, [sp], #32
				263	ret
				264
				265	END(rsdIntrinsicConvolve5x5_K)