/*
 * Copyright (C) 2013-2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

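/* X-macro list of the supported blend operations. Each X(slot, name) entry
 * pairs the numeric slot passed to rsdIntrinsicBlend_K (and used to index the
 * jump table at the end of this file) with the name of its kernel macro.
 */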
#define BLEND_LIST(X) \
    X(0, CLEAR) \
    X(1, SRC) \
    X(2, DST) \
    X(3, SRC_OVER) \
    X(4, DST_OVER) \
    X(5, SRC_IN) \
    X(6, DST_IN) \
    X(7, SRC_OUT) \
    X(8, DST_OUT) \
    X(9, SRC_ATOP) \
    X(10, DST_ATOP) \
    X(11, XOR) \
    X(14, MULTIPLY) \
    X(21, DIFFERENCE) \
    X(34, ADD) \
    X(35, SUBTRACT)

/* For every blend operation supported, define a macro with just the arithmetic
 * component. The rest can be handled later on.
 *
 * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
 * contain the data from the source buffer. Both have already been split out
 * into one colour component per register (if necessary). q3 and q11 contain
 * the alpha components.
 *
 * At the same time as defining the assembly macro, define a corresponding
 * preprocessor macro indicating any other requirements.
 *   zipped=0 -- The macro does not require the RGBA components to be
 *               separated.
 *   lddst=0  -- The macro does not require data from the destination buffer.
 *   ldsrc=0  -- The macro does not require data from the source buffer.
 *   nowrap=1 -- The macro requires no wrapper at all, and should simply be
 *               inserted without any surrounding load/store or loop code.
 */

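/* Note on the fixed-point arithmetic used by the kernels that scale by an
 * alpha value: a 16-bit intermediate t (a product of two 8-bit channels, or a
 * saturating sum of two such products) is divided by 255 with rounding using
 * the identity
 *      (t + 128 + ((t + 128) >> 8)) >> 8 == round(t / 255)
 * which appears below as the umull/rshrn/uaddw/rshrn sequences, and as the
 * urshr/uqadd/uqrshrn variant in the ATOP kernels.
 */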
#define params_CLEAR zipped=0, lddst=0, ldsrc=0
.macro blend_kernel_CLEAR
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0
.endm

#define params_SRC zipped=0, lddst=0
.macro blend_kernel_SRC
        mov     v0.16b, v8.16b
        mov     v1.16b, v9.16b
        mov     v2.16b, v10.16b
        mov     v3.16b, v11.16b
.endm

#define params_DST nowrap=1
.macro blend_kernel_DST
        /* nop */
.endm

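/* SRC_OVER: res = src + dst * (1 - src.a) */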
#define params_SRC_OVER zipped=1
.macro blend_kernel_SRC_OVER
        mvn     v7.16b, v11.16b

        umull2  v12.8h, v7.16b, v0.16b
        umull   v0.8h, v7.8b, v0.8b
        umull2  v13.8h, v7.16b, v1.16b
        umull   v1.8h, v7.8b, v1.8b
        umull2  v14.8h, v7.16b, v2.16b
        umull   v2.8h, v7.8b, v2.8b
        umull2  v15.8h, v7.16b, v3.16b
        umull   v3.8h, v7.8b, v3.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8

        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm

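/* DST_OVER: res = dst + src * (1 - dst.a) */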
#define params_DST_OVER zipped=1
.macro blend_kernel_DST_OVER
        mvn     v7.16b, v3.16b

        umull2  v12.8h, v7.16b, v8.16b
        umull   v8.8h, v7.8b, v8.8b
        umull2  v13.8h, v7.16b, v9.16b
        umull   v9.8h, v7.8b, v9.8b
        umull2  v14.8h, v7.16b, v10.16b
        umull   v10.8h, v7.8b, v10.8b
        umull2  v15.8h, v7.16b, v11.16b
        umull   v11.8h, v7.8b, v11.8b

        rshrn   v4.8b, v8.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v9.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v10.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v11.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v8.8h, v8.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v9.8h, v9.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v10.8h, v10.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v11.8h, v11.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v8.8b, v8.8h, #8
        rshrn2  v8.16b, v12.8h, #8
        rshrn   v9.8b, v9.8h, #8
        rshrn2  v9.16b, v13.8h, #8
        rshrn   v10.8b, v10.8h, #8
        rshrn2  v10.16b, v14.8h, #8
        rshrn   v11.8b, v11.8h, #8
        rshrn2  v11.16b, v15.8h, #8

        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm

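/* SRC_IN: res = src * dst.a */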
#define params_SRC_IN zipped=1
.macro blend_kernel_SRC_IN
        umull2  v12.8h, v3.16b, v8.16b
        umull   v0.8h, v3.8b, v8.8b
        umull2  v13.8h, v3.16b, v9.16b
        umull   v1.8h, v3.8b, v9.8b
        umull2  v14.8h, v3.16b, v10.16b
        umull   v2.8h, v3.8b, v10.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm

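/* DST_IN: res = dst * src.a */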
#define params_DST_IN zipped=1
.macro blend_kernel_DST_IN
        umull2  v12.8h, v0.16b, v11.16b
        umull   v0.8h, v0.8b, v11.8b
        umull2  v13.8h, v1.16b, v11.16b
        umull   v1.8h, v1.8b, v11.8b
        umull2  v14.8h, v2.16b, v11.16b
        umull   v2.8h, v2.8b, v11.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm

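/* SRC_OUT: res = src * (1 - dst.a), implemented by inverting dst.a and
 * reusing SRC_IN. */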
#define params_SRC_OUT zipped=1
.macro blend_kernel_SRC_OUT
        mvn     v3.16b, v3.16b
        blend_kernel_SRC_IN
.endm


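/* DST_OUT: res = dst * (1 - src.a), implemented by inverting src.a and
 * reusing DST_IN. */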
#define params_DST_OUT zipped=1
.macro blend_kernel_DST_OUT
        mvn     v11.16b, v11.16b
        blend_kernel_DST_IN
.endm

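/* SRC_ATOP: res.rgb = src.rgb * dst.a + dst.rgb * (1 - src.a);
 * res.a = dst.a (v3 is left untouched). */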
#define params_SRC_ATOP zipped=1
.macro blend_kernel_SRC_ATOP
        mvn     v11.16b, v11.16b

        umull2  v12.8h, v11.16b, v0.16b
        umull   v0.8h, v11.8b, v0.8b
        umull2  v13.8h, v11.16b, v1.16b
        umull   v1.8h, v11.8b, v1.8b
        umull2  v14.8h, v11.16b, v2.16b
        umull   v2.8h, v11.8b, v2.8b

        umull2  v4.8h, v3.16b, v8.16b
        umull   v8.8h, v3.8b, v8.8b
        umull2  v5.8h, v3.16b, v9.16b
        umull   v9.8h, v3.8b, v9.8b
        umull2  v6.8h, v3.16b, v10.16b
        umull   v10.8h, v3.8b, v10.8b

        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v14.8h, v14.8h, v6.8h
        uqadd   v2.8h, v2.8h, v10.8h

        urshr   v8.8h, v0.8h, #8
        urshr   v4.8h, v12.8h, #8
        urshr   v9.8h, v1.8h, #8
        urshr   v5.8h, v13.8h, #8
        urshr   v10.8h, v2.8h, #8
        urshr   v6.8h, v14.8h, #8

        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v2.8h, v2.8h, v10.8h
        uqadd   v14.8h, v14.8h, v6.8h

        uqrshrn v0.8b, v0.8h, #8
        uqrshrn2 v0.16b, v12.8h, #8
        uqrshrn v1.8b, v1.8h, #8
        uqrshrn2 v1.16b, v13.8h, #8
        uqrshrn v2.8b, v2.8h, #8
        uqrshrn2 v2.16b, v14.8h, #8
.endm

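/* DST_ATOP: res.rgb = dst.rgb * src.a + src.rgb * (1 - dst.a); v3 is
 * inverted on entry and inverted back at the end, so it still holds the
 * destination alpha when the kernel finishes. */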
#define params_DST_ATOP zipped=1
.macro blend_kernel_DST_ATOP
        mvn     v3.16b, v3.16b

        umull2  v12.8h, v11.16b, v0.16b
        umull   v0.8h, v11.8b, v0.8b
        umull2  v13.8h, v11.16b, v1.16b
        umull   v1.8h, v11.8b, v1.8b
        umull2  v14.8h, v11.16b, v2.16b
        umull   v2.8h, v11.8b, v2.8b

        umull2  v4.8h, v3.16b, v8.16b
        umull   v8.8h, v3.8b, v8.8b
        umull2  v5.8h, v3.16b, v9.16b
        umull   v9.8h, v3.8b, v9.8b
        umull2  v6.8h, v3.16b, v10.16b
        umull   v10.8h, v3.8b, v10.8b

        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v14.8h, v14.8h, v6.8h
        uqadd   v2.8h, v2.8h, v10.8h

        urshr   v8.8h, v0.8h, #8
        urshr   v4.8h, v12.8h, #8
        urshr   v9.8h, v1.8h, #8
        urshr   v5.8h, v13.8h, #8
        urshr   v10.8h, v2.8h, #8
        urshr   v6.8h, v14.8h, #8

        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v2.8h, v2.8h, v10.8h
        uqadd   v14.8h, v14.8h, v6.8h

        uqrshrn v0.8b, v0.8h, #8
        uqrshrn2 v0.16b, v12.8h, #8
        uqrshrn v1.8b, v1.8h, #8
        uqrshrn2 v1.16b, v13.8h, #8
        uqrshrn v2.8b, v2.8h, #8
        uqrshrn2 v2.16b, v14.8h, #8

        mvn     v3.16b, v3.16b
.endm

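/* MULTIPLY: res = dst * src for every channel, including alpha. */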
#define params_MULTIPLY zipped=0
.macro blend_kernel_MULTIPLY
        umull2  v12.8h, v0.16b, v8.16b
        umull   v0.8h, v0.8b, v8.8b
        umull2  v13.8h, v1.16b, v9.16b
        umull   v1.8h, v1.8b, v9.8b
        umull2  v14.8h, v2.16b, v10.16b
        umull   v2.8h, v2.8b, v10.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm

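/* ADD: res = min(dst + src, 255) (saturating add on every channel). */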
#define params_ADD zipped=0
.macro blend_kernel_ADD
        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm

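/* SUBTRACT: res = max(dst - src, 0) (saturating subtract on every channel). */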
#define params_SUBTRACT zipped=0
.macro blend_kernel_SUBTRACT
        uqsub   v0.16b, v0.16b, v8.16b
        uqsub   v1.16b, v1.16b, v9.16b
        uqsub   v2.16b, v2.16b, v10.16b
        uqsub   v3.16b, v3.16b, v11.16b
.endm

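/* DIFFERENCE: res = abs(dst - src) on every channel. */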
#define params_DIFFERENCE zipped=0
.macro blend_kernel_DIFFERENCE
        uabd    v0.16b, v0.16b, v8.16b
        uabd    v1.16b, v1.16b, v9.16b
        uabd    v2.16b, v2.16b, v10.16b
        uabd    v3.16b, v3.16b, v11.16b
.endm

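/* XOR: res = dst ^ src (a bitwise exclusive-or of the raw bytes, not the
 * Porter-Duff XOR operator). */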
#define params_XOR zipped=0
.macro blend_kernel_XOR
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v9.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v3.16b, v3.16b, v11.16b
.endm


/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop. Various sections of assembly code are dropped or substituted for
 * simpler operations if they're not needed.
 */
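/* Register usage within the wrapper, as set up by rsdIntrinsicBlend_K below:
 *   x0 -- pointer into the destination buffer
 *   x1 -- pointer into the source buffer
 *   x2 -- remaining length in bytes (a whole number of 4-byte pixels)
 * The main loop handles 64 bytes (16 pixels) per iteration; anything left
 * over is dealt with by the tail code further down.
 */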
.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
.if \nowrap
        \kernel
.else
        sub     x3, sp, #32
        sub     sp, sp, #64
        st1     {v8.1d - v11.1d}, [sp]
        st1     {v12.1d - v15.1d}, [x3]
        subs    x2, x2, #64
        b       2f
.align 4
1:
  .if \lddst
    .if \zipped
        ld4     {v0.16b - v3.16b}, [x0]
    .else
        ld1     {v0.16b - v3.16b}, [x0]
    .endif
  .endif
  .if \ldsrc
    .if \zipped
        ld4     {v8.16b - v11.16b}, [x1], #64
    .else
        ld1     {v8.16b - v11.16b}, [x1], #64
    .endif
  .endif
  .if \pld
#if 0 /* TODO: test this on real hardware */
    .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
    .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
#endif
  .endif

        \kernel

        subs    x2, x2, #64
  .if \zipped
        st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
  .else
        st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
  .endif

2:      bge     1b
        adds    x2, x2, #64
        beq     2f

        /* To handle the tail portion of the data (something less than 64
         * bytes) load small power-of-two chunks into working registers. It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the operations
         * don't require data to interact with its neighbours.
         */
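        /* For example, a 37-byte remainder (0b100101) is gathered as a
         * 32-byte load into v2-v3 (v10-v11 for the source), a 4-byte load
         * into lane s[1] of v0 (v8), and a 1-byte load into lane b[1] of
         * v0 (v8); every other lane stays zero from the movi instructions
         * below.
         */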
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0

        movi    v8.16b, #0
        movi    v9.16b, #0
        movi    v10.16b, #0
        movi    v11.16b, #0

        tbz     x2, #5, 1f
  .if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32 ; .endif
  .if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
1:      tbz     x2, #4, 1f
  .if \lddst ; ld1     {v1.16b}, [x0], #16 ; .endif
  .if \ldsrc ; ld1     {v9.16b}, [x1], #16 ; .endif
1:      tbz     x2, #3, 1f
  .if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
  .if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
1:      tbz     x2, #2, 1f
  .if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
  .if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
1:      tbz     x2, #1, 1f
  .if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
  .if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
1:      tbz     x2, #0, 1f
  .if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
  .if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
1:
  .if \lddst ; sub     x0, x0, x2 ; .endif

.if \zipped
        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register. So the data is loaded
         * linearly and unpacked manually at this point.
         */
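        /* The two uzp1/uzp2 passes below deinterleave the RGBA bytes: the
         * first pass splits even and odd bytes apart (giving R/B and G/A
         * pairs), and the second pass splits those again, leaving red in v0,
         * green in v1, blue in v2 and alpha in v3 (and likewise v8-v11 for
         * the source).
         */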
        uzp1    v4.16b, v0.16b, v1.16b
        uzp2    v5.16b, v0.16b, v1.16b
        uzp1    v6.16b, v2.16b, v3.16b
        uzp2    v7.16b, v2.16b, v3.16b
        uzp1    v0.16b, v4.16b, v6.16b
        uzp2    v2.16b, v4.16b, v6.16b
        uzp1    v1.16b, v5.16b, v7.16b
        uzp2    v3.16b, v5.16b, v7.16b

        uzp1    v4.16b, v8.16b, v9.16b
        uzp2    v5.16b, v8.16b, v9.16b
        uzp1    v6.16b, v10.16b, v11.16b
        uzp2    v7.16b, v10.16b, v11.16b
        uzp1    v8.16b, v4.16b, v6.16b
        uzp2    v10.16b, v4.16b, v6.16b
        uzp1    v9.16b, v5.16b, v7.16b
        uzp2    v11.16b, v5.16b, v7.16b

        \kernel

        zip1    v4.16b, v0.16b, v2.16b
        zip2    v6.16b, v0.16b, v2.16b
        zip1    v5.16b, v1.16b, v3.16b
        zip2    v7.16b, v1.16b, v3.16b
        zip1    v0.16b, v4.16b, v5.16b
        zip2    v1.16b, v4.16b, v5.16b
        zip1    v2.16b, v6.16b, v7.16b
        zip2    v3.16b, v6.16b, v7.16b
.else
        \kernel
.endif

        tbz     x2, #5, 1f
        st1     {v2.16b,v3.16b}, [x0], #32
1:      tbz     x2, #4, 1f
        st1     {v1.16b}, [x0], #16
1:      tbz     x2, #3, 1f
        st1     {v0.d}[1], [x0], #8
1:      tbz     x2, #2, 1f
        st1     {v0.s}[1], [x0], #4
1:      tbz     x2, #1, 1f
        st1     {v0.h}[1], [x0], #2
1:      tbz     x2, #0, 2f
        st1     {v0.b}[1], [x0], #1
2:      ld1     {v8.1d - v11.1d}, [sp], #32
        ld1     {v12.1d - v15.1d}, [sp], #32
.endif
        mov     x0, #0
        ret
.endm


/* produce list of blend_line_XX() functions; each function uses the wrap_line
 * macro, passing it the name of the operation macro it wants along with
 * optional parameters to remove unnecessary operations.
 */
#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
    BLEND_LIST(BLEND_X)
#undef BLEND_X
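/* For reference, the SRC_OVER entry above expands to approximately:
 *
 *   ENTRY(blend_line_SRC_OVER)
 *   wrap_line blend_kernel_SRC_OVER, zipped=1
 *   END(blend_line_SRC_OVER)
 *
 * so each supported operation gets its own blend_line_* function.
 */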


/* int rsdIntrinsicBlend_K(
 *          uchar4 *out,        // x0
 *          uchar4 const *in,   // x1
 *          int slot,           // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
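/* Returns 0 on success, or -1 if the requested slot is out of range or has no
 * entry in the table at 2f (the table holds halfword offsets from the start
 * of the table to each blend_line_* function, with 0 marking unimplemented
 * slots).
 */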
ENTRY(rsdIntrinsicBlend_K)
        adr     x5, 2f
        cmp     w2, #(3f - 2f) >> 1
        bhs     1f
        ldrsh   x6, [x5, w2, uxtw #1]
        add     x0, x0, w3, uxtw #2
        add     x1, x1, w3, uxtw #2
        sub     w2, w4, w3
        ubfiz   x2, x2, #2, #32 /* TODO: fix */
        cbz     x6, 1f
        add     x6, x5, x6
        br      x6
1:      mov     x0, #-1
        ret

2:
.set off,0
#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
    BLEND_LIST(BLEND_X)
#undef BLEND_X
3:

END(rsdIntrinsicBlend_K)