/*
 * Copyright (C) 2013-2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

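/* X-macro list of the supported blend operations. Each X(slot, name) entry
 * pairs the numeric slot passed to rsdIntrinsicBlend_K (and used to index the
 * jump table at the end of this file) with the name of its kernel macro.
 */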
#define BLEND_LIST(X) \
    X(0, CLEAR) \
    X(1, SRC) \
    X(2, DST) \
    X(3, SRC_OVER) \
    X(4, DST_OVER) \
    X(5, SRC_IN) \
    X(6, DST_IN) \
    X(7, SRC_OUT) \
    X(8, DST_OUT) \
    X(9, SRC_ATOP) \
    X(10, DST_ATOP) \
    X(11, XOR) \
    X(14, MULTIPLY) \
    X(21, DIFFERENCE) \
    X(34, ADD) \
    X(35, SUBTRACT)

/* For every blend operation supported, define a macro with just the arithmetic
 * component. The rest can be handled later on.
 *
 * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
 * contain the data from the source buffer. Both have already been split out
 * into one colour component per register (if necessary). q3 and q11 contain
 * the alpha components.
 *
 * At the same time as defining the assembly macro, define a corresponding
 * preprocessor macro indicating any other requirements.
 *   zipped=0 -- The macro does not require the RGBA components to be
 *               separated.
 *   lddst=0  -- The macro does not require data from the destination buffer.
 *   ldsrc=0  -- The macro does not require data from the source buffer.
 *   nowrap=1 -- The macro requires no wrapper at all, and should simply be
 *               inserted without any surrounding load/store or loop code.
 */

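/* Note on the fixed-point arithmetic used by the kernels that scale by an
 * alpha value: a 16-bit intermediate t (a product of two 8-bit channels, or a
 * saturating sum of two such products) is divided by 255 with rounding using
 * the identity
 *      (t + 128 + ((t + 128) >> 8)) >> 8 == round(t / 255)
 * which appears below as the umull/rshrn/uaddw/rshrn sequences, and as the
 * urshr/uqadd/uqrshrn variant in the ATOP kernels.
 */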
#define params_CLEAR zipped=0, lddst=0, ldsrc=0
.macro blend_kernel_CLEAR
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0
.endm

#define params_SRC zipped=0, lddst=0
.macro blend_kernel_SRC
        mov     v0.16b, v8.16b
        mov     v1.16b, v9.16b
        mov     v2.16b, v10.16b
        mov     v3.16b, v11.16b
.endm

#define params_DST nowrap=1
.macro blend_kernel_DST
        /* nop */
.endm

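/* SRC_OVER: res = src + dst * (1 - src.a) */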
#define params_SRC_OVER zipped=1
.macro blend_kernel_SRC_OVER
        mvn     v7.16b, v11.16b

        umull2  v12.8h, v7.16b, v0.16b
        umull   v0.8h, v7.8b, v0.8b
        umull2  v13.8h, v7.16b, v1.16b
        umull   v1.8h, v7.8b, v1.8b
        umull2  v14.8h, v7.16b, v2.16b
        umull   v2.8h, v7.8b, v2.8b
        umull2  v15.8h, v7.16b, v3.16b
        umull   v3.8h, v7.8b, v3.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8

        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm

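/* DST_OVER: res = dst + src * (1 - dst.a) */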
#define params_DST_OVER zipped=1
.macro blend_kernel_DST_OVER
        mvn     v7.16b, v3.16b

        umull2  v12.8h, v7.16b, v8.16b
        umull   v8.8h, v7.8b, v8.8b
        umull2  v13.8h, v7.16b, v9.16b
        umull   v9.8h, v7.8b, v9.8b
        umull2  v14.8h, v7.16b, v10.16b
        umull   v10.8h, v7.8b, v10.8b
        umull2  v15.8h, v7.16b, v11.16b
        umull   v11.8h, v7.8b, v11.8b

        rshrn   v4.8b, v8.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v9.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v10.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v11.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v8.8h, v8.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v9.8h, v9.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v10.8h, v10.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v11.8h, v11.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v8.8b, v8.8h, #8
        rshrn2  v8.16b, v12.8h, #8
        rshrn   v9.8b, v9.8h, #8
        rshrn2  v9.16b, v13.8h, #8
        rshrn   v10.8b, v10.8h, #8
        rshrn2  v10.16b, v14.8h, #8
        rshrn   v11.8b, v11.8h, #8
        rshrn2  v11.16b, v15.8h, #8

        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm

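/* SRC_IN: res = src * dst.a */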
#define params_SRC_IN zipped=1
.macro blend_kernel_SRC_IN
        umull2  v12.8h, v3.16b, v8.16b
        umull   v0.8h, v3.8b, v8.8b
        umull2  v13.8h, v3.16b, v9.16b
        umull   v1.8h, v3.8b, v9.8b
        umull2  v14.8h, v3.16b, v10.16b
        umull   v2.8h, v3.8b, v10.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm

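/* DST_IN: res = dst * src.a */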
#define params_DST_IN zipped=1
.macro blend_kernel_DST_IN
        umull2  v12.8h, v0.16b, v11.16b
        umull   v0.8h, v0.8b, v11.8b
        umull2  v13.8h, v1.16b, v11.16b
        umull   v1.8h, v1.8b, v11.8b
        umull2  v14.8h, v2.16b, v11.16b
        umull   v2.8h, v2.8b, v11.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm

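/* SRC_OUT: res = src * (1 - dst.a), implemented by inverting dst.a and
 * reusing SRC_IN. */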
#define params_SRC_OUT zipped=1
.macro blend_kernel_SRC_OUT
        mvn     v3.16b, v3.16b
        blend_kernel_SRC_IN
.endm


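/* DST_OUT: res = dst * (1 - src.a), implemented by inverting src.a and
 * reusing DST_IN. */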
#define params_DST_OUT zipped=1
.macro blend_kernel_DST_OUT
        mvn     v11.16b, v11.16b
        blend_kernel_DST_IN
.endm

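/* SRC_ATOP: res.rgb = src.rgb * dst.a + dst.rgb * (1 - src.a);
 * res.a = dst.a (v3 is left untouched). */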
#define params_SRC_ATOP zipped=1
.macro blend_kernel_SRC_ATOP
        mvn     v11.16b, v11.16b

        umull2  v12.8h, v11.16b, v0.16b
        umull   v0.8h, v11.8b, v0.8b
        umull2  v13.8h, v11.16b, v1.16b
        umull   v1.8h, v11.8b, v1.8b
        umull2  v14.8h, v11.16b, v2.16b
        umull   v2.8h, v11.8b, v2.8b

        umull2  v4.8h, v3.16b, v8.16b
        umull   v8.8h, v3.8b, v8.8b
        umull2  v5.8h, v3.16b, v9.16b
        umull   v9.8h, v3.8b, v9.8b
        umull2  v6.8h, v3.16b, v10.16b
        umull   v10.8h, v3.8b, v10.8b

        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v14.8h, v14.8h, v6.8h
        uqadd   v2.8h, v2.8h, v10.8h

        urshr   v8.8h, v0.8h, #8
        urshr   v4.8h, v12.8h, #8
        urshr   v9.8h, v1.8h, #8
        urshr   v5.8h, v13.8h, #8
        urshr   v10.8h, v2.8h, #8
        urshr   v6.8h, v14.8h, #8

        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v2.8h, v2.8h, v10.8h
        uqadd   v14.8h, v14.8h, v6.8h

        uqrshrn v0.8b, v0.8h, #8
        uqrshrn2 v0.16b, v12.8h, #8
        uqrshrn v1.8b, v1.8h, #8
        uqrshrn2 v1.16b, v13.8h, #8
        uqrshrn v2.8b, v2.8h, #8
        uqrshrn2 v2.16b, v14.8h, #8
.endm

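/* DST_ATOP: res.rgb = dst.rgb * src.a + src.rgb * (1 - dst.a); v3 is
 * inverted on entry and inverted back at the end, so it still holds the
 * destination alpha when the kernel finishes. */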
#define params_DST_ATOP zipped=1
.macro blend_kernel_DST_ATOP
        mvn     v3.16b, v3.16b

        umull2  v12.8h, v11.16b, v0.16b
        umull   v0.8h, v11.8b, v0.8b
        umull2  v13.8h, v11.16b, v1.16b
        umull   v1.8h, v11.8b, v1.8b
        umull2  v14.8h, v11.16b, v2.16b
        umull   v2.8h, v11.8b, v2.8b

        umull2  v4.8h, v3.16b, v8.16b
        umull   v8.8h, v3.8b, v8.8b
        umull2  v5.8h, v3.16b, v9.16b
        umull   v9.8h, v3.8b, v9.8b
        umull2  v6.8h, v3.16b, v10.16b
        umull   v10.8h, v3.8b, v10.8b

        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v14.8h, v14.8h, v6.8h
        uqadd   v2.8h, v2.8h, v10.8h

        urshr   v8.8h, v0.8h, #8
        urshr   v4.8h, v12.8h, #8
        urshr   v9.8h, v1.8h, #8
        urshr   v5.8h, v13.8h, #8
        urshr   v10.8h, v2.8h, #8
        urshr   v6.8h, v14.8h, #8

        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v2.8h, v2.8h, v10.8h
        uqadd   v14.8h, v14.8h, v6.8h

        uqrshrn v0.8b, v0.8h, #8
        uqrshrn2 v0.16b, v12.8h, #8
        uqrshrn v1.8b, v1.8h, #8
        uqrshrn2 v1.16b, v13.8h, #8
        uqrshrn v2.8b, v2.8h, #8
        uqrshrn2 v2.16b, v14.8h, #8

        mvn     v3.16b, v3.16b
.endm

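/* MULTIPLY: res = dst * src for every channel, including alpha. */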
#define params_MULTIPLY zipped=0
.macro blend_kernel_MULTIPLY
        umull2  v12.8h, v0.16b, v8.16b
        umull   v0.8h, v0.8b, v8.8b
        umull2  v13.8h, v1.16b, v9.16b
        umull   v1.8h, v1.8b, v9.8b
        umull2  v14.8h, v2.16b, v10.16b
        umull   v2.8h, v2.8b, v10.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm

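/* ADD: res = min(dst + src, 255) (saturating add on every channel). */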
#define params_ADD zipped=0
.macro blend_kernel_ADD
        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm

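/* SUBTRACT: res = max(dst - src, 0) (saturating subtract on every channel). */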
#define params_SUBTRACT zipped=0
.macro blend_kernel_SUBTRACT
        uqsub   v0.16b, v0.16b, v8.16b
        uqsub   v1.16b, v1.16b, v9.16b
        uqsub   v2.16b, v2.16b, v10.16b
        uqsub   v3.16b, v3.16b, v11.16b
.endm

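/* DIFFERENCE: res = abs(dst - src) on every channel. */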
#define params_DIFFERENCE zipped=0
.macro blend_kernel_DIFFERENCE
        uabd    v0.16b, v0.16b, v8.16b
        uabd    v1.16b, v1.16b, v9.16b
        uabd    v2.16b, v2.16b, v10.16b
        uabd    v3.16b, v3.16b, v11.16b
.endm

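/* XOR: res = dst ^ src (a bitwise exclusive-or of the raw bytes, not the
 * Porter-Duff XOR operator). */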
#define params_XOR zipped=0
.macro blend_kernel_XOR
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v9.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v3.16b, v3.16b, v11.16b
.endm


/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop. Various sections of assembly code are dropped or substituted for
 * simpler operations if they're not needed.
 */
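/* Register usage within the wrapper, as set up by rsdIntrinsicBlend_K below:
 *   x0 -- pointer into the destination buffer
 *   x1 -- pointer into the source buffer
 *   x2 -- remaining length in bytes (a whole number of 4-byte pixels)
 * The main loop handles 64 bytes (16 pixels) per iteration; anything left
 * over is dealt with by the tail code further down.
 */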
.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
.if \nowrap
        \kernel
.else
        sub     x3, sp, #32
        sub     sp, sp, #64
        st1     {v8.1d - v11.1d}, [sp]
        st1     {v12.1d - v15.1d}, [x3]
        subs    x2, x2, #64
        b       2f
.align 4
1:
  .if \lddst
    .if \zipped
        ld4     {v0.16b - v3.16b}, [x0]
    .else
        ld1     {v0.16b - v3.16b}, [x0]
    .endif
  .endif
  .if \ldsrc
    .if \zipped
        ld4     {v8.16b - v11.16b}, [x1], #64
    .else
        ld1     {v8.16b - v11.16b}, [x1], #64
    .endif
  .endif
  .if \pld
#if 0 /* TODO: test this on real hardware */
    .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
    .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
#endif
  .endif

        \kernel

        subs    x2, x2, #64
  .if \zipped
        st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
  .else
        st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
  .endif

2:      bge     1b
        adds    x2, x2, #64
        beq     2f

        /* To handle the tail portion of the data (something less than 64
         * bytes) load small power-of-two chunks into working registers. It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the operations
         * don't require data to interact with its neighbours.
         */
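        /* For example, a 37-byte remainder (0b100101) is gathered as a
         * 32-byte load into v2-v3 (v10-v11 for the source), a 4-byte load
         * into lane s[1] of v0 (v8), and a 1-byte load into lane b[1] of
         * v0 (v8); every other lane stays zero from the movi instructions
         * below.
         */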
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0

        movi    v8.16b, #0
        movi    v9.16b, #0
        movi    v10.16b, #0
        movi    v11.16b, #0

        tbz     x2, #5, 1f
  .if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32 ; .endif
  .if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
1:      tbz     x2, #4, 1f
  .if \lddst ; ld1     {v1.16b}, [x0], #16 ; .endif
  .if \ldsrc ; ld1     {v9.16b}, [x1], #16 ; .endif
1:      tbz     x2, #3, 1f
  .if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
  .if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
1:      tbz     x2, #2, 1f
  .if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
  .if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
1:      tbz     x2, #1, 1f
  .if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
  .if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
1:      tbz     x2, #0, 1f
  .if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
  .if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
1:
  .if \lddst ; sub     x0, x0, x2 ; .endif

.if \zipped
        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register. So the data is loaded
         * linearly and unpacked manually at this point.
         */
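        /* The two uzp1/uzp2 passes below deinterleave the RGBA bytes: the
         * first pass splits even and odd bytes apart (giving R/B and G/A
         * pairs), and the second pass splits those again, leaving red in v0,
         * green in v1, blue in v2 and alpha in v3 (and likewise v8-v11 for
         * the source).
         */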
        uzp1    v4.16b, v0.16b, v1.16b
        uzp2    v5.16b, v0.16b, v1.16b
        uzp1    v6.16b, v2.16b, v3.16b
        uzp2    v7.16b, v2.16b, v3.16b
        uzp1    v0.16b, v4.16b, v6.16b
        uzp2    v2.16b, v4.16b, v6.16b
        uzp1    v1.16b, v5.16b, v7.16b
        uzp2    v3.16b, v5.16b, v7.16b

        uzp1    v4.16b, v8.16b, v9.16b
        uzp2    v5.16b, v8.16b, v9.16b
        uzp1    v6.16b, v10.16b, v11.16b
        uzp2    v7.16b, v10.16b, v11.16b
        uzp1    v8.16b, v4.16b, v6.16b
        uzp2    v10.16b, v4.16b, v6.16b
        uzp1    v9.16b, v5.16b, v7.16b
        uzp2    v11.16b, v5.16b, v7.16b

        \kernel

        zip1    v4.16b, v0.16b, v2.16b
        zip2    v6.16b, v0.16b, v2.16b
        zip1    v5.16b, v1.16b, v3.16b
        zip2    v7.16b, v1.16b, v3.16b
        zip1    v0.16b, v4.16b, v5.16b
        zip2    v1.16b, v4.16b, v5.16b
        zip1    v2.16b, v6.16b, v7.16b
        zip2    v3.16b, v6.16b, v7.16b
.else
        \kernel
.endif

        tbz     x2, #5, 1f
        st1     {v2.16b,v3.16b}, [x0], #32
1:      tbz     x2, #4, 1f
        st1     {v1.16b}, [x0], #16
1:      tbz     x2, #3, 1f
        st1     {v0.d}[1], [x0], #8
1:      tbz     x2, #2, 1f
        st1     {v0.s}[1], [x0], #4
1:      tbz     x2, #1, 1f
        st1     {v0.h}[1], [x0], #2
1:      tbz     x2, #0, 2f
        st1     {v0.b}[1], [x0], #1
2:      ld1     {v8.1d - v11.1d}, [sp], #32
        ld1     {v12.1d - v15.1d}, [sp], #32
.endif
        mov     x0, #0
        ret
.endm


/* produce list of blend_line_XX() functions; each function uses the wrap_line
 * macro, passing it the name of the operation macro it wants along with
 * optional parameters to remove unnecessary operations.
 */
#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
    BLEND_LIST(BLEND_X)
#undef BLEND_X
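/* For reference, the SRC_OVER entry above expands to approximately:
 *
 *   ENTRY(blend_line_SRC_OVER)
 *   wrap_line blend_kernel_SRC_OVER, zipped=1
 *   END(blend_line_SRC_OVER)
 *
 * so each supported operation gets its own blend_line_* function.
 */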


/* int rsdIntrinsicBlend_K(
 *          uchar4 *out,        // x0
 *          uchar4 const *in,   // x1
 *          int slot,           // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
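/* Returns 0 on success, or -1 if the requested slot is out of range or has no
 * entry in the table at 2f (the table holds halfword offsets from the start
 * of the table to each blend_line_* function, with 0 marking unimplemented
 * slots).
 */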
ENTRY(rsdIntrinsicBlend_K)
        adr     x5, 2f
        cmp     w2, #(3f - 2f) >> 1
        bhs     1f
        ldrsh   x6, [x5, w2, uxtw #1]
        add     x0, x0, w3, uxtw #2
        add     x1, x1, w3, uxtw #2
        sub     w2, w4, w3
        ubfiz   x2, x2, #2, #32 /* TODO: fix */
        cbz     x6, 1f
        add     x6, x5, x6
        br      x6
1:      mov     x0, #-1
        ret

2:
.set off,0
#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
    BLEND_LIST(BLEND_X)
#undef BLEND_X
3:

END(rsdIntrinsicBlend_K)